aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt4
-rw-r--r--Documentation/kmemleak.txt142
-rw-r--r--MAINTAINERS16
-rw-r--r--arch/powerpc/include/asm/hw_irq.h39
-rw-r--r--arch/powerpc/include/asm/paca.h1
-rw-r--r--arch/powerpc/include/asm/perf_counter.h98
-rw-r--r--arch/powerpc/include/asm/reg.h2
-rw-r--r--arch/powerpc/include/asm/systbl.h2
-rw-r--r--arch/powerpc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/Makefile3
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/entry_64.S9
-rw-r--r--arch/powerpc/kernel/irq.c5
-rw-r--r--arch/powerpc/kernel/perf_counter.c1263
-rw-r--r--arch/powerpc/kernel/power4-pmu.c598
-rw-r--r--arch/powerpc/kernel/power5+-pmu.c671
-rw-r--r--arch/powerpc/kernel/power5-pmu.c611
-rw-r--r--arch/powerpc/kernel/power6-pmu.c532
-rw-r--r--arch/powerpc/kernel/power7-pmu.c357
-rw-r--r--arch/powerpc/kernel/ppc970-pmu.c482
-rw-r--r--arch/powerpc/mm/fault.c10
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/ia32/ia32entry.S3
-rw-r--r--arch/x86/include/asm/atomic_32.h236
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/irq_vectors.h8
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/unistd_32.h1
-rw-r--r--arch/x86/include/asm/unistd_64.h3
-rw-r--r--arch/x86/kernel/apic/apic.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1704
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/entry_64.S5
-rw-r--r--arch/x86/kernel/irq.c10
-rw-r--r--arch/x86/kernel/irqinit.c15
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/syscall_table_32.S1
-rw-r--r--arch/x86/kernel/traps.c12
-rw-r--r--arch/x86/mm/fault.c12
-rw-r--r--arch/x86/mm/memtest.c9
-rw-r--r--arch/x86/oprofile/nmi_int.c7
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c7
-rw-r--r--drivers/ata/ahci.c87
-rw-r--r--drivers/ata/ata_piix.c11
-rw-r--r--drivers/ata/libata-core.c11
-rw-r--r--drivers/ata/libata-sff.c20
-rw-r--r--drivers/ata/sata_nv.c131
-rw-r--r--drivers/ata/sata_sil.c2
-rw-r--r--drivers/ata/sata_sx4.c11
-rw-r--r--drivers/char/sysrq.c2
-rw-r--r--drivers/char/vt.c9
-rw-r--r--drivers/firmware/dmi_scan.c1
-rw-r--r--drivers/video/console/vgacon.c5
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/exec.c9
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/super.c4
-rw-r--r--include/asm-generic/atomic.h2
-rw-r--r--include/linux/init_task.h10
-rw-r--r--include/linux/irq.h18
-rw-r--r--include/linux/kernel_stat.h5
-rw-r--r--include/linux/kmemleak.h96
-rw-r--r--include/linux/percpu.h5
-rw-r--r--include/linux/perf_counter.h697
-rw-r--r--include/linux/prctl.h3
-rw-r--r--include/linux/sched.h21
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/syscalls.h5
-rw-r--r--init/Kconfig34
-rw-r--r--init/main.c45
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/fork.c12
-rw-r--r--kernel/irq/handle.c11
-rw-r--r--kernel/module.c56
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/perf_counter.c4260
-rw-r--r--kernel/profile.c6
-rw-r--r--kernel/sched.c87
-rw-r--r--kernel/sched_cpupri.c8
-rw-r--r--kernel/slow-work.c4
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c27
-rw-r--r--kernel/timer.c3
-rw-r--r--lib/Kconfig.debug32
-rw-r--r--lib/cpumask.c11
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c12
-rw-r--r--mm/kmemleak-test.c111
-rw-r--r--mm/kmemleak.c1498
-rw-r--r--mm/mmap.c5
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/page_alloc.c11
-rw-r--r--mm/page_cgroup.c12
-rw-r--r--mm/slab.c117
-rw-r--r--mm/slob.c7
-rw-r--r--mm/slub.c22
-rw-r--r--mm/vmalloc.c33
-rw-r--r--tools/perf/.gitignore16
-rw-r--r--tools/perf/Documentation/Makefile300
-rw-r--r--tools/perf/Documentation/asciidoc.conf91
-rw-r--r--tools/perf/Documentation/manpage-1.72.xsl14
-rw-r--r--tools/perf/Documentation/manpage-base.xsl35
-rw-r--r--tools/perf/Documentation/manpage-bold-literal.xsl17
-rw-r--r--tools/perf/Documentation/manpage-normal.xsl13
-rw-r--r--tools/perf/Documentation/manpage-suppress-sp.xsl21
-rw-r--r--tools/perf/Documentation/perf-annotate.txt29
-rw-r--r--tools/perf/Documentation/perf-help.txt38
-rw-r--r--tools/perf/Documentation/perf-list.txt25
-rw-r--r--tools/perf/Documentation/perf-record.txt42
-rw-r--r--tools/perf/Documentation/perf-report.txt26
-rw-r--r--tools/perf/Documentation/perf-stat.txt66
-rw-r--r--tools/perf/Documentation/perf-top.txt39
-rw-r--r--tools/perf/Documentation/perf.txt24
-rw-r--r--tools/perf/Makefile929
-rw-r--r--tools/perf/builtin-annotate.c1356
-rw-r--r--tools/perf/builtin-help.c461
-rw-r--r--tools/perf/builtin-list.c20
-rw-r--r--tools/perf/builtin-record.c582
-rw-r--r--tools/perf/builtin-report.c1316
-rw-r--r--tools/perf/builtin-stat.c367
-rw-r--r--tools/perf/builtin-top.c736
-rw-r--r--tools/perf/builtin.h26
-rw-r--r--tools/perf/command-list.txt10
-rw-r--r--tools/perf/design.txt442
-rw-r--r--tools/perf/perf.c428
-rw-r--r--tools/perf/perf.h67
-rwxr-xr-xtools/perf/util/PERF-VERSION-GEN42
-rw-r--r--tools/perf/util/abspath.c117
-rw-r--r--tools/perf/util/alias.c77
-rw-r--r--tools/perf/util/cache.h119
-rw-r--r--tools/perf/util/color.c241
-rw-r--r--tools/perf/util/color.h36
-rw-r--r--tools/perf/util/config.c873
-rw-r--r--tools/perf/util/ctype.c26
-rw-r--r--tools/perf/util/environment.c9
-rw-r--r--tools/perf/util/exec_cmd.c165
-rw-r--r--tools/perf/util/exec_cmd.h13
-rwxr-xr-xtools/perf/util/generate-cmdlist.sh24
-rw-r--r--tools/perf/util/help.c367
-rw-r--r--tools/perf/util/help.h29
-rw-r--r--tools/perf/util/levenshtein.c84
-rw-r--r--tools/perf/util/levenshtein.h8
-rw-r--r--tools/perf/util/list.h603
-rw-r--r--tools/perf/util/pager.c99
-rw-r--r--tools/perf/util/parse-events.c316
-rw-r--r--tools/perf/util/parse-events.h17
-rw-r--r--tools/perf/util/parse-options.c508
-rw-r--r--tools/perf/util/parse-options.h174
-rw-r--r--tools/perf/util/path.c353
-rw-r--r--tools/perf/util/quote.c481
-rw-r--r--tools/perf/util/quote.h68
-rw-r--r--tools/perf/util/rbtree.c383
-rw-r--r--tools/perf/util/rbtree.h171
-rw-r--r--tools/perf/util/run-command.c395
-rw-r--r--tools/perf/util/run-command.h93
-rw-r--r--tools/perf/util/sigchain.c52
-rw-r--r--tools/perf/util/sigchain.h11
-rw-r--r--tools/perf/util/strbuf.c359
-rw-r--r--tools/perf/util/strbuf.h137
-rw-r--r--tools/perf/util/string.c34
-rw-r--r--tools/perf/util/string.h8
-rw-r--r--tools/perf/util/symbol.c641
-rw-r--r--tools/perf/util/symbol.h47
-rw-r--r--tools/perf/util/usage.c80
-rw-r--r--tools/perf/util/util.h410
-rw-r--r--tools/perf/util/wrapper.c206
178 files changed, 29785 insertions, 323 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 72d3bf08d79b..7bcdebffdab3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1083,6 +1083,10 @@ and is between 256 and 4096 characters. It is defined in the file
1083 Configure the RouterBoard 532 series on-chip 1083 Configure the RouterBoard 532 series on-chip
1084 Ethernet adapter MAC address. 1084 Ethernet adapter MAC address.
1085 1085
1086 kmemleak= [KNL] Boot-time kmemleak enable/disable
1087 Valid arguments: on, off
1088 Default: on
1089
1086 kstack=N [X86] Print N words from the kernel stack 1090 kstack=N [X86] Print N words from the kernel stack
1087 in oops dumps. 1091 in oops dumps.
1088 1092
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
new file mode 100644
index 000000000000..0112da3b9ab8
--- /dev/null
+++ b/Documentation/kmemleak.txt
@@ -0,0 +1,142 @@
1Kernel Memory Leak Detector
2===========================
3
4Introduction
5------------
6
7Kmemleak provides a way of detecting possible kernel memory leaks in a
8way similar to a tracing garbage collector
9(http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
10with the difference that the orphan objects are not freed but only
11reported via /sys/kernel/debug/kmemleak. A similar method is used by the
12Valgrind tool (memcheck --leak-check) to detect the memory leaks in
13user-space applications.
14
15Usage
16-----
17
18CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
19thread scans the memory every 10 minutes (by default) and prints any new
20unreferenced objects found. To trigger an intermediate scan and display
21all the possible memory leaks:
22
23 # mount -t debugfs nodev /sys/kernel/debug/
24 # cat /sys/kernel/debug/kmemleak
25
26Note that the orphan objects are listed in the order they were allocated
27and one object at the beginning of the list may cause other subsequent
28objects to be reported as orphan.
29
30Memory scanning parameters can be modified at run-time by writing to the
31/sys/kernel/debug/kmemleak file. The following parameters are supported:
32
33 off - disable kmemleak (irreversible)
34 stack=on - enable the task stacks scanning
35 stack=off - disable the task stacks scanning
36 scan=on - start the automatic memory scanning thread
37 scan=off - stop the automatic memory scanning thread
38 scan=<secs> - set the automatic memory scanning period in seconds (0
39 to disable it)
40
41Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
42the kernel command line.
43
44Basic Algorithm
45---------------
46
47The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and
48friends are traced and the pointers, together with additional
49information like size and stack trace, are stored in a prio search tree.
50The corresponding freeing function calls are tracked and the pointers
51removed from the kmemleak data structures.
52
53An allocated block of memory is considered orphan if no pointer to its
54start address or to any location inside the block can be found by
55scanning the memory (including saved registers). This means that there
56might be no way for the kernel to pass the address of the allocated
57block to a freeing function and therefore the block is considered a
58memory leak.
59
60The scanning algorithm steps:
61
62 1. mark all objects as white (remaining white objects will later be
63 considered orphan)
64 2. scan the memory starting with the data section and stacks, checking
65 the values against the addresses stored in the prio search tree. If
66 a pointer to a white object is found, the object is added to the
67 gray list
68 3. scan the gray objects for matching addresses (some white objects
69 can become gray and be added at the end of the gray list) until the
70 gray set is finished
71 4. the remaining white objects are considered orphan and reported via
72 /sys/kernel/debug/kmemleak
73
74Some allocated memory blocks have pointers stored in the kernel's
75internal data structures and they cannot be detected as orphans. To
76avoid this, kmemleak can also store the number of values pointing to an
77address inside the block address range that need to be found so that the
78block is not considered a leak. One example is __vmalloc().
79
80Kmemleak API
81------------
82
83See the include/linux/kmemleak.h header for the function prototypes.
84
85kmemleak_init - initialize kmemleak
86kmemleak_alloc - notify of a memory block allocation
87kmemleak_free - notify of a memory block freeing
88kmemleak_not_leak - mark an object as not a leak
89kmemleak_ignore - do not scan or report an object as leak
90kmemleak_scan_area - add scan areas inside a memory block
91kmemleak_no_scan - do not scan a memory block
92kmemleak_erase - erase an old value in a pointer variable
93kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness
94kmemleak_free_recursive - as kmemleak_free but checks the recursiveness
95
96Dealing with false positives/negatives
97--------------------------------------
98
99The false negatives are real memory leaks (orphan objects) but not
100reported by kmemleak because values found during the memory scanning
101point to such objects. To reduce the number of false negatives, kmemleak
102provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and
103kmemleak_erase functions (see above). The task stacks also increase the
104number of false negatives and their scanning is not enabled by default.
105
106The false positives are objects wrongly reported as being memory leaks
107(orphan). For objects known not to be leaks, kmemleak provides the
108kmemleak_not_leak function. The kmemleak_ignore could also be used if
109the memory block is known not to contain other pointers and it will no
110longer be scanned.
111
112Some of the reported leaks are only transient, especially on SMP
113systems, because of pointers temporarily stored in CPU registers or
114stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing
115the minimum age of an object to be reported as a memory leak.
116
117Limitations and Drawbacks
118-------------------------
119
120The main drawback is the reduced performance of memory allocation and
121freeing. To avoid other penalties, the memory scanning is only performed
122periodically by the scanning thread or when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is
123intended for debugging purposes where the performance might not be the
124most important requirement.
125
126To keep the algorithm simple, kmemleak scans for values pointing to any
127address inside a block's address range. This may lead to an increased
128number of false negatives. However, it is likely that a real memory leak
129will eventually become visible.
130
131Another source of false negatives is the data stored in non-pointer
132values. In a future version, kmemleak could scan only the pointer
133members in the allocated structures. This feature would solve many of
134the false negative cases described above.
135
136The tool can report false positives. These are cases where an allocated
137block doesn't need to be freed (some cases in the init_call functions),
138the pointer is calculated by other methods than the usual container_of
139macro or the pointer is stored in a location not scanned by kmemleak.
140
141Page allocations and ioremap are not tracked. Only the ARM and x86
142architectures are currently supported.
diff --git a/MAINTAINERS b/MAINTAINERS
index e697b67031a2..c944d618dc83 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3372,6 +3372,12 @@ F: Documentation/trace/kmemtrace.txt
3372F: include/trace/kmemtrace.h 3372F: include/trace/kmemtrace.h
3373F: kernel/trace/kmemtrace.c 3373F: kernel/trace/kmemtrace.c
3374 3374
3375KMEMLEAK
3376P: Catalin Marinas
3377M: catalin.marinas@arm.com
3378L: linux-kernel@vger.kernel.org
3379S: Maintained
3380
3375KPROBES 3381KPROBES
3376P: Ananth N Mavinakayanahalli 3382P: Ananth N Mavinakayanahalli
3377M: ananth@in.ibm.com 3383M: ananth@in.ibm.com
@@ -4405,6 +4411,16 @@ S: Maintained
4405F: include/linux/delayacct.h 4411F: include/linux/delayacct.h
4406F: kernel/delayacct.c 4412F: kernel/delayacct.c
4407 4413
4414PERFORMANCE COUNTER SUBSYSTEM
4415P: Peter Zijlstra
4416M: a.p.zijlstra@chello.nl
4417P: Paul Mackerras
4418M: paulus@samba.org
4419P: Ingo Molnar
4420M: mingo@elte.hu
4421L: linux-kernel@vger.kernel.org
4422S: Supported
4423
4408PERSONALITY HANDLING 4424PERSONALITY HANDLING
4409P: Christoph Hellwig 4425P: Christoph Hellwig
4410M: hch@infradead.org 4426M: hch@infradead.org
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6dd..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct irq_chip; 132struct irq_chip;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long test_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(void)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (1),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152static inline void clear_perf_counter_pending(void)
153{
154 asm volatile("stb %0,%1(13)" : :
155 "r" (0),
156 "i" (offsetof(struct paca_struct, perf_counter_pending)));
157}
158
159extern void perf_counter_do_pending(void);
160
161#else
162
163static inline unsigned long test_perf_counter_pending(void)
164{
165 return 0;
166}
167
168static inline void set_perf_counter_pending(void) {}
169static inline void clear_perf_counter_pending(void) {}
170static inline void perf_counter_do_pending(void) {}
171#endif /* CONFIG_PERF_COUNTERS */
172
134#endif /* __KERNEL__ */ 173#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 174#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..cc7c887705b8
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,98 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15#define MAX_LIMITED_HWCOUNTERS 2
16
17/*
18 * This struct provides the constants and functions needed to
19 * describe the PMU on a particular POWER-family CPU.
20 */
21struct power_pmu {
22 int n_counter;
23 int max_alternatives;
24 u64 add_fields;
25 u64 test_adder;
26 int (*compute_mmcr)(u64 events[], int n_ev,
27 unsigned int hwc[], u64 mmcr[]);
28 int (*get_constraint)(u64 event, u64 *mskp, u64 *valp);
29 int (*get_alternatives)(u64 event, unsigned int flags,
30 u64 alt[]);
31 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
32 int (*limited_pmc_event)(u64 event);
33 u32 flags;
34 int n_generic;
35 int *generic_events;
36 int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
37 [PERF_COUNT_HW_CACHE_OP_MAX]
38 [PERF_COUNT_HW_CACHE_RESULT_MAX];
39};
40
41extern struct power_pmu *ppmu;
42
43/*
44 * Values for power_pmu.flags
45 */
46#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */
47#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */
48
49/*
50 * Values for flags to get_alternatives()
51 */
52#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */
53#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */
54#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */
55
56struct pt_regs;
57extern unsigned long perf_misc_flags(struct pt_regs *regs);
58#define perf_misc_flags(regs) perf_misc_flags(regs)
59
60extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
61
62/*
63 * The power_pmu.get_constraint function returns a 64-bit value and
64 * a 64-bit mask that express the constraints between this event and
65 * other events.
66 *
67 * The value and mask are divided up into (non-overlapping) bitfields
68 * of three different types:
69 *
70 * Select field: this expresses the constraint that some set of bits
71 * in MMCR* needs to be set to a specific value for this event. For a
72 * select field, the mask contains 1s in every bit of the field, and
73 * the value contains a unique value for each possible setting of the
74 * MMCR* bits. The constraint checking code will ensure that two events
75 * that set the same field in their masks have the same value in their
76 * value dwords.
77 *
78 * Add field: this expresses the constraint that there can be at most
79 * N events in a particular class. A field of k bits can be used for
80 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
81 * set (and the other bits 0), and the value has only the least significant
82 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
83 * in the struct power_pmu for this processor come into play. The
84 * add_fields value contains 1 in the LSB of the field, and the
85 * test_adder contains 2^(k-1) - 1 - N in the field.
86 *
87 * NAND field: this expresses the constraint that you may not have events
88 * in all of a set of classes. (For example, on PPC970, you can't select
89 * events from the FPU, ISU and IDU simultaneously, although any two are
90 * possible.) For N classes, the field is N+1 bits wide, and each class
91 * is assigned one bit from the least-significant N bits. The mask has
92 * only the most-significant bit set, and the value has only the bit
93 * for the event's class set. The test_adder has the least significant
94 * bit set in the field.
95 *
96 * If an event is not subject to the constraint expressed by a particular
97 * field, then it will have 0 in both the mask and value for that field.
98 */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d540e87..fb359b0a6937 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
492#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ 492#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */
493#define SPRN_MMCR1 798 493#define SPRN_MMCR1 798
494#define SPRN_MMCRA 0x312 494#define SPRN_MMCRA 0x312
495#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
495#define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ 496#define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */
496#define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ 497#define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */
497#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ 498#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */
498#define MMCRA_SLOT_SHIFT 24 499#define MMCRA_SLOT_SHIFT 24
499#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */ 500#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
501#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */
500#define POWER6_MMCRA_SIHV 0x0000040000000000ULL 502#define POWER6_MMCRA_SIHV 0x0000040000000000ULL
501#define POWER6_MMCRA_SIPR 0x0000020000000000ULL 503#define POWER6_MMCRA_SIPR 0x0000020000000000ULL
502#define POWER6_MMCRA_THRM 0x00000020UL 504#define POWER6_MMCRA_THRM 0x00000020UL
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index d98a30dfd41c..a0b92de51c7e 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(ni_syscall) 325SYSCALL_SPU(perf_counter_open)
326COMPAT_SYS_SPU(preadv) 326COMPAT_SYS_SPU(preadv)
327COMPAT_SYS_SPU(pwritev) 327COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3f06f8ec81c5..4badac2d11d1 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344#define __NR_preadv 320 345#define __NR_preadv 320
345#define __NR_pwritev 321 346#define __NR_pwritev 321
346 347
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a5..a2c683403c2b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,9 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
94 94
95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
98 power5-pmu.o power5+-pmu.o power6-pmu.o \
99 power7-pmu.o
97 100
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 101obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 102
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc053946..e981d1ce1914 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc32330479..43e073477c34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8c1a4966867e..feff792ed0f9 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 135 iseries_handle_interrupts();
136 } 136 }
137 137
138 if (test_perf_counter_pending()) {
139 clear_perf_counter_pending();
140 perf_counter_do_pending();
141 }
142
138 /* 143 /*
139 * if (get_paca()->hard_enabled) return; 144 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 145 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..bb202388170e
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,1263 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20#include <asm/ptrace.h>
21
/*
 * Per-CPU bookkeeping for the counters currently assigned to this CPU's
 * PMU.  counter[], events[] and flags[] are parallel arrays indexed by
 * the same slot number.
 */
22struct cpu_hw_counters {
23 int n_counters; /* number of valid slots in counter[]/events[]/flags[] */
24 int n_percpu;
25 int disabled; /* set by hw_perf_disable(), cleared by hw_perf_enable() */
26 int n_added; /* counters added since hw_perf_disable() last froze the PMU */
27 int n_limited; /* number of valid slots in limited_counter[]/limited_hwidx[] */
28 u8 pmcs_enabled; /* set once ppc_md.enable_pmcs has had its chance to run here */
29 struct perf_counter *counter[MAX_HWCOUNTERS];
30 u64 events[MAX_HWCOUNTERS]; /* event code for each counter[] slot */
31 unsigned int flags[MAX_HWCOUNTERS]; /* PPMU_* flags for each counter[] slot */
32 u64 mmcr[3]; /* values for MMCR0, MMCR1 and MMCRA, in that order */
33 struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
34 u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; /* PMC number of each limited_counter[] slot */
35};
36DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
37
38struct power_pmu *ppmu;
39
40/*
41 * Normally, to ignore kernel events we set the FCS (freeze counters
42 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
43 * hypervisor bit set in the MSR, or if we are running on a processor
44 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
45 * then we need to use the FCHV bit to ignore kernel events.
46 */
47static unsigned int freeze_counters_kernel = MMCR0_FCS;
48
49static void perf_counter_interrupt(struct pt_regs *regs);
50
/* Debug-dump hook; this implementation currently does nothing. */
51void perf_counter_print_debug(void)
52{
53}
54
55/*
56 * Read one performance monitor counter (PMC).
57 */
58static unsigned long read_pmc(int idx)
59{
60 unsigned long val;
61
62 switch (idx) {
63 case 1:
64 val = mfspr(SPRN_PMC1);
65 break;
66 case 2:
67 val = mfspr(SPRN_PMC2);
68 break;
69 case 3:
70 val = mfspr(SPRN_PMC3);
71 break;
72 case 4:
73 val = mfspr(SPRN_PMC4);
74 break;
75 case 5:
76 val = mfspr(SPRN_PMC5);
77 break;
78 case 6:
79 val = mfspr(SPRN_PMC6);
80 break;
81 case 7:
82 val = mfspr(SPRN_PMC7);
83 break;
84 case 8:
85 val = mfspr(SPRN_PMC8);
86 break;
87 default:
88 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
89 val = 0;
90 }
91 return val;
92}
93
94/*
95 * Write one PMC.
96 */
97static void write_pmc(int idx, unsigned long val)
98{
99 switch (idx) {
100 case 1:
101 mtspr(SPRN_PMC1, val);
102 break;
103 case 2:
104 mtspr(SPRN_PMC2, val);
105 break;
106 case 3:
107 mtspr(SPRN_PMC3, val);
108 break;
109 case 4:
110 mtspr(SPRN_PMC4, val);
111 break;
112 case 5:
113 mtspr(SPRN_PMC5, val);
114 break;
115 case 6:
116 mtspr(SPRN_PMC6, val);
117 break;
118 case 7:
119 mtspr(SPRN_PMC7, val);
120 break;
121 case 8:
122 mtspr(SPRN_PMC8, val);
123 break;
124 default:
125 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
126 }
127}
128
129/*
130 * Check if a set of events can all go on the PMU at once.
131 * If they can't, this will look at alternative codes for the events
132 * and see if any combination of alternative codes is feasible.
133 * The feasible set is returned in event[].
 *
 * event[] and cflags[] each hold n_ev entries.  Returns 0 when a
 * feasible assignment exists (event[] may have been rewritten with
 * alternative codes), or -1 when n_ev exceeds ppmu->n_counter, a
 * constraint lookup for a primary event fails, no alternatives hook
 * is available, or no combination of alternatives fits.
134 */
135static int power_check_constraints(u64 event[], unsigned int cflags[],
136 int n_ev)
137{
138 u64 mask, value, nv;
139 u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
140 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
141 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
142 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
143 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
144 int i, j;
145 u64 addf = ppmu->add_fields;
146 u64 tadd = ppmu->test_adder;
147
148 if (n_ev > ppmu->n_counter)
149 return -1;
150
151 /* First see if the events will go on as-is */
152 for (i = 0; i < n_ev; ++i) {
 /*
  * An event that must use a limited PMC but is not coded for one
  * is swapped for its first alternative.
  * NOTE(review): get_alternatives is only NULL-checked further
  * down; presumably every PMU with limited PMCs provides it --
  * confirm.
  */
153 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
154 && !ppmu->limited_pmc_event(event[i])) {
155 ppmu->get_alternatives(event[i], cflags[i],
156 alternatives[i]);
157 event[i] = alternatives[i][0];
158 }
159 if (ppmu->get_constraint(event[i], &amasks[i][0],
160 &avalues[i][0]))
161 return -1;
162 }
 /* Accumulate the constraints one event at a time; stop at the first clash. */
163 value = mask = 0;
164 for (i = 0; i < n_ev; ++i) {
165 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
166 if ((((nv + tadd) ^ value) & mask) != 0 ||
167 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
168 break;
169 value = nv;
170 mask |= amasks[i][0];
171 }
172 if (i == n_ev)
173 return 0; /* all OK */
174
175 /* doesn't work, gather alternatives... */
176 if (!ppmu->get_alternatives)
177 return -1;
178 for (i = 0; i < n_ev; ++i) {
179 choice[i] = 0;
180 n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
181 alternatives[i]);
 /*
  * Slot 0 keeps the constraint computed in the first pass.
  * NOTE(review): the return value of get_constraint is ignored
  * for the alternatives.
  */
182 for (j = 1; j < n_alt[i]; ++j)
183 ppmu->get_constraint(alternatives[i][j],
184 &amasks[i][j], &avalues[i][j]);
185 }
186
187 /* enumerate all possibilities and see if any will work */
 /* j == -1 means "start event i from its first alternative" */
188 i = 0;
189 j = -1;
190 value = mask = nv = 0;
191 while (i < n_ev) {
192 if (j >= 0) {
193 /* we're backtracking, restore context */
194 value = svalues[i];
195 mask = smasks[i];
196 j = choice[i];
197 }
198 /*
199 * See if any alternative k for event i,
200 * where k > j, will satisfy the constraints.
201 */
202 while (++j < n_alt[i]) {
203 nv = (value | avalues[i][j]) +
204 (value & avalues[i][j] & addf);
205 if ((((nv + tadd) ^ value) & mask) == 0 &&
206 (((nv + tadd) ^ avalues[i][j])
207 & amasks[i][j]) == 0)
208 break;
209 }
210 if (j >= n_alt[i]) {
211 /*
212 * No feasible alternative, backtrack
213 * to event i-1 and continue enumerating its
214 * alternatives from where we got up to.
215 */
216 if (--i < 0)
217 return -1;
218 } else {
219 /*
220 * Found a feasible alternative for event i,
221 * remember where we got up to with this event,
222 * go on to the next event, and start with
223 * the first alternative for it.
224 */
225 choice[i] = j;
226 svalues[i] = value;
227 smasks[i] = mask;
228 value = nv;
229 mask |= amasks[i][j];
230 ++i;
231 j = -1;
232 }
233 }
234
235 /* OK, we have a feasible combination, tell the caller the solution */
236 for (i = 0; i < n_ev; ++i)
237 event[i] = alternatives[i][choice[i]];
238 return 0;
239}
240
241/*
242 * Check if newly-added counters have consistent settings for
243 * exclude_{user,kernel,hv} with each other and any previously
244 * added counters.
245 */
246static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
247 int n_prev, int n_new)
248{
249 int eu = 0, ek = 0, eh = 0;
250 int i, n, first;
251 struct perf_counter *counter;
252
253 n = n_prev + n_new;
254 if (n <= 1)
255 return 0;
256
257 first = 1;
258 for (i = 0; i < n; ++i) {
259 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
260 cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
261 continue;
262 }
263 counter = ctrs[i];
264 if (first) {
265 eu = counter->attr.exclude_user;
266 ek = counter->attr.exclude_kernel;
267 eh = counter->attr.exclude_hv;
268 first = 0;
269 } else if (counter->attr.exclude_user != eu ||
270 counter->attr.exclude_kernel != ek ||
271 counter->attr.exclude_hv != eh) {
272 return -EAGAIN;
273 }
274 }
275
276 if (eu || ek || eh)
277 for (i = 0; i < n; ++i)
278 if (cflags[i] & PPMU_LIMITED_PMC_OK)
279 cflags[i] |= PPMU_LIMITED_PMC_REQD;
280
281 return 0;
282}
283
/*
 * Fold the events counted since the last read into counter->count and
 * subtract them from counter->hw.period_left.  Does nothing when the
 * counter is not currently on a PMC (hw.idx == 0).
 */
284static void power_pmu_read(struct perf_counter *counter)
285{
286 long val, delta, prev;
287
288 if (!counter->hw.idx)
289 return;
290 /*
291 * Performance monitor interrupts come even when interrupts
292 * are soft-disabled, as long as interrupts are hard-enabled.
293 * Therefore we treat them like NMIs.
294 */
 /* Retry until prev_count was not changed between our read and our update. */
295 do {
296 prev = atomic64_read(&counter->hw.prev_count);
297 barrier();
298 val = read_pmc(counter->hw.idx);
299 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
300
301 /* The counters are only 32 bits wide */
302 delta = (val - prev) & 0xfffffffful;
303 atomic64_add(delta, &counter->count);
304 atomic64_sub(delta, &counter->hw.period_left);
305}
306
307/*
308 * On some machines, PMC5 and PMC6 can't be written, don't respect
309 * the freeze conditions, and don't generate interrupts. This tells
310 * us if `counter' is using such a PMC.
311 */
312static int is_limited_pmc(int pmcnum)
313{
314 return (ppmu->flags & PPMU_LIMITED_PMC5_6)
315 && (pmcnum == 5 || pmcnum == 6);
316}
317
318static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
319 unsigned long pmc5, unsigned long pmc6)
320{
321 struct perf_counter *counter;
322 u64 val, prev, delta;
323 int i;
324
325 for (i = 0; i < cpuhw->n_limited; ++i) {
326 counter = cpuhw->limited_counter[i];
327 if (!counter->hw.idx)
328 continue;
329 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
330 prev = atomic64_read(&counter->hw.prev_count);
331 counter->hw.idx = 0;
332 delta = (val - prev) & 0xfffffffful;
333 atomic64_add(delta, &counter->count);
334 }
335}
336
337static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
338 unsigned long pmc5, unsigned long pmc6)
339{
340 struct perf_counter *counter;
341 u64 val;
342 int i;
343
344 for (i = 0; i < cpuhw->n_limited; ++i) {
345 counter = cpuhw->limited_counter[i];
346 counter->hw.idx = cpuhw->limited_hwidx[i];
347 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
348 atomic64_set(&counter->hw.prev_count, val);
349 perf_counter_update_userpage(counter);
350 }
351}
352
353/*
354 * Since limited counters don't respect the freeze conditions, we
355 * have to read them immediately after freezing or unfreezing the
356 * other counters. We try to keep the values from the limited
357 * counters as consistent as possible by keeping the delay (in
358 * cycles and instructions) between freezing/unfreezing and reading
359 * the limited counters as small and consistent as possible.
360 * Therefore, if any limited counters are in use, we read them
361 * both, and always in the same order, to minimize variability,
362 * and do it inside the same asm that writes MMCR0.
 *
 * With no limited counters in use this is a plain write of `mmcr0'
 * to MMCR0.  Otherwise the limited counters are frozen when `mmcr0'
 * has MMCR0_FC set and thawed when it does not.
363 */
364static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
365{
366 unsigned long pmc5, pmc6;
367
368 if (!cpuhw->n_limited) {
369 mtspr(SPRN_MMCR0, mmcr0);
370 return;
371 }
372
373 /*
374 * Write MMCR0, then read PMC5 and PMC6 immediately.
375 * To ensure we don't get a performance monitor interrupt
376 * between writing MMCR0 and freezing/thawing the limited
377 * counters, we first write MMCR0 with the counter overflow
378 * interrupt enable bits turned off.
379 */
380 asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
381 : "=&r" (pmc5), "=&r" (pmc6)
382 : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
383 "i" (SPRN_MMCR0),
384 "i" (SPRN_PMC5), "i" (SPRN_PMC6));
385
386 if (mmcr0 & MMCR0_FC)
387 freeze_limited_counters(cpuhw, pmc5, pmc6);
388 else
389 thaw_limited_counters(cpuhw, pmc5, pmc6);
390
391 /*
392 * Write the full MMCR0 including the counter overflow interrupt
393 * enable bits, if necessary.
394 */
395 if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
396 mtspr(SPRN_MMCR0, mmcr0);
397}
398
399/*
400 * Disable all counters to prevent PMU interrupts and to allow
401 * counters to be added or removed.
402 */
403void hw_perf_disable(void)
404{
405 struct cpu_hw_counters *cpuhw;
406 unsigned long ret;
407 unsigned long flags;
408
409 local_irq_save(flags);
410 cpuhw = &__get_cpu_var(cpu_hw_counters);
411
412 ret = cpuhw->disabled;
413 if (!ret) {
414 cpuhw->disabled = 1;
415 cpuhw->n_added = 0;
416
417 /*
418 * Check if we ever enabled the PMU on this cpu.
419 */
420 if (!cpuhw->pmcs_enabled) {
421 if (ppc_md.enable_pmcs)
422 ppc_md.enable_pmcs();
423 cpuhw->pmcs_enabled = 1;
424 }
425
426 /*
427 * Disable instruction sampling if it was enabled
428 */
429 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
430 mtspr(SPRN_MMCRA,
431 cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
432 mb();
433 }
434
435 /*
436 * Set the 'freeze counters' bit.
437 * The barrier is to make sure the mtspr has been
438 * executed and the PMU has frozen the counters
439 * before we return.
440 */
441 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
442 mb();
443 }
444 local_irq_restore(flags);
445}
446
447/*
448 * Re-enable the PMU if it was disabled by hw_perf_disable();
 * does nothing if it is not currently disabled.
449 * If we were previously disabled and counters were added, then
450 * put the new config on the PMU.
451 */
452void hw_perf_enable(void)
453{
454 struct perf_counter *counter;
455 struct cpu_hw_counters *cpuhw;
456 unsigned long flags;
457 long i;
458 unsigned long val;
459 s64 left;
460 unsigned int hwc_index[MAX_HWCOUNTERS];
461 int n_lim;
462 int idx;
463
464 local_irq_save(flags);
465 cpuhw = &__get_cpu_var(cpu_hw_counters);
466 if (!cpuhw->disabled) {
467 local_irq_restore(flags);
468 return;
469 }
470 cpuhw->disabled = 0;
471
472 /*
473 * If we didn't change anything, or only removed counters,
474 * no need to recalculate MMCR* settings and reset the PMCs.
475 * Just reenable the PMU with the current MMCR* settings
476 * (possibly updated for removal of counters).
477 */
478 if (!cpuhw->n_added) {
479 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
480 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
481 if (cpuhw->n_counters == 0)
482 get_lppaca()->pmcregs_in_use = 0;
483 goto out_enable;
484 }
485
486 /*
487 * Compute MMCR* values for the new set of counters
 * (hwc_index[i] receives the 0-based PMC chosen for counter i)
488 */
489 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
490 cpuhw->mmcr)) {
491 /* shouldn't ever get here */
492 printk(KERN_ERR "oops compute_mmcr failed\n");
493 goto out;
494 }
495
496 /*
497 * Add in MMCR0 freeze bits corresponding to the
498 * attr.exclude_* bits for the first counter.
499 * We have already checked that all counters have the
500 * same values for these bits as the first counter.
501 */
502 counter = cpuhw->counter[0];
503 if (counter->attr.exclude_user)
504 cpuhw->mmcr[0] |= MMCR0_FCP;
505 if (counter->attr.exclude_kernel)
506 cpuhw->mmcr[0] |= freeze_counters_kernel;
507 if (counter->attr.exclude_hv)
508 cpuhw->mmcr[0] |= MMCR0_FCHV;
509
510 /*
511 * Write the new configuration to MMCR* with the freeze
512 * bit set and set the hardware counters to their initial values.
513 * Then unfreeze the counters.
514 */
515 get_lppaca()->pmcregs_in_use = 1;
516 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
517 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
518 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
519 | MMCR0_FC);
520
521 /*
522 * Read off any pre-existing counters that need to move
523 * to another PMC.
524 */
525 for (i = 0; i < cpuhw->n_counters; ++i) {
526 counter = cpuhw->counter[i];
527 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
528 power_pmu_read(counter);
529 write_pmc(counter->hw.idx, 0);
530 counter->hw.idx = 0;
531 }
532 }
533
534 /*
535 * Initialize the PMCs for all the new and moved counters.
536 */
537 cpuhw->n_limited = n_lim = 0;
538 for (i = 0; i < cpuhw->n_counters; ++i) {
539 counter = cpuhw->counter[i];
540 if (counter->hw.idx)
541 continue;
542 idx = hwc_index[i] + 1;
 /* limited PMCs are only recorded here; write_mmcr0() thaws them */
543 if (is_limited_pmc(idx)) {
544 cpuhw->limited_counter[n_lim] = counter;
545 cpuhw->limited_hwidx[n_lim] = idx;
546 ++n_lim;
547 continue;
548 }
 /*
  * For a sampling counter, preload the PMC so that it reaches
  * 0x80000000 after `left' more events.
  */
549 val = 0;
550 if (counter->hw.sample_period) {
551 left = atomic64_read(&counter->hw.period_left);
552 if (left < 0x80000000L)
553 val = 0x80000000L - left;
554 }
555 atomic64_set(&counter->hw.prev_count, val);
556 counter->hw.idx = idx;
557 write_pmc(idx, val);
558 perf_counter_update_userpage(counter);
559 }
560 cpuhw->n_limited = n_lim;
561 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
562
563 out_enable:
564 mb();
565 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
566
567 /*
568 * Enable instruction sampling if necessary
569 */
570 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
571 mb();
572 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
573 }
574
575 out:
576 local_irq_restore(flags);
577}
578
579static int collect_events(struct perf_counter *group, int max_count,
580 struct perf_counter *ctrs[], u64 *events,
581 unsigned int *flags)
582{
583 int n = 0;
584 struct perf_counter *counter;
585
586 if (!is_software_counter(group)) {
587 if (n >= max_count)
588 return -1;
589 ctrs[n] = group;
590 flags[n] = group->hw.counter_base;
591 events[n++] = group->hw.config;
592 }
593 list_for_each_entry(counter, &group->sibling_list, list_entry) {
594 if (!is_software_counter(counter) &&
595 counter->state != PERF_COUNTER_STATE_OFF) {
596 if (n >= max_count)
597 return -1;
598 ctrs[n] = counter;
599 flags[n] = counter->hw.counter_base;
600 events[n++] = counter->hw.config;
601 }
602 }
603 return n;
604}
605
/*
 * Mark `counter' as active on `cpu' and advance its tstamp_running by
 * the context time elapsed since tstamp_stopped.  Software counters
 * are additionally enabled through their own pmu.
 */
606static void counter_sched_in(struct perf_counter *counter, int cpu)
607{
608 counter->state = PERF_COUNTER_STATE_ACTIVE;
609 counter->oncpu = cpu;
610 counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
611 if (is_software_counter(counter))
612 counter->pmu->enable(counter);
613}
614
615/*
616 * Called to enable a whole group of counters.
617 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
618 * Assumes the caller has disabled interrupts and has
619 * frozen the PMU with hw_perf_save_disable.
620 */
621int hw_perf_group_sched_in(struct perf_counter *group_leader,
622 struct perf_cpu_context *cpuctx,
623 struct perf_counter_context *ctx, int cpu)
624{
625 struct cpu_hw_counters *cpuhw;
626 long i, n, n0;
627 struct perf_counter *sub;
628
629 cpuhw = &__get_cpu_var(cpu_hw_counters);
630 n0 = cpuhw->n_counters;
631 n = collect_events(group_leader, ppmu->n_counter - n0,
632 &cpuhw->counter[n0], &cpuhw->events[n0],
633 &cpuhw->flags[n0]);
634 if (n < 0)
635 return -EAGAIN;
636 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
637 return -EAGAIN;
638 i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
639 if (i < 0)
640 return -EAGAIN;
641 cpuhw->n_counters = n0 + n;
642 cpuhw->n_added += n;
643
644 /*
645 * OK, this group can go on; update counter states etc.,
646 * and enable any software counters
647 */
648 for (i = n0; i < n0 + n; ++i)
649 cpuhw->counter[i]->hw.config = cpuhw->events[i];
650 cpuctx->active_oncpu += n;
651 n = 1;
652 counter_sched_in(group_leader, cpu);
653 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
654 if (sub->state != PERF_COUNTER_STATE_OFF) {
655 counter_sched_in(sub, cpu);
656 ++n;
657 }
658 }
659 ctx->nr_active += n;
660
661 return 1;
662}
663
664/*
665 * Add a counter to the PMU.
666 * If all counters are not already frozen, then we disable and
667 * re-enable the PMU in order to get hw_perf_enable to do the
668 * actual work of reconfiguring the PMU.
669 */
670static int power_pmu_enable(struct perf_counter *counter)
671{
672 struct cpu_hw_counters *cpuhw;
673 unsigned long flags;
674 int n0;
675 int ret = -EAGAIN;
676
677 local_irq_save(flags);
678 perf_disable();
679
680 /*
681 * Add the counter to the list (if there is room)
682 * and check whether the total set is still feasible.
683 */
684 cpuhw = &__get_cpu_var(cpu_hw_counters);
685 n0 = cpuhw->n_counters;
686 if (n0 >= ppmu->n_counter)
687 goto out;
688 cpuhw->counter[n0] = counter;
689 cpuhw->events[n0] = counter->hw.config;
690 cpuhw->flags[n0] = counter->hw.counter_base;
691 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
692 goto out;
693 if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
694 goto out;
695
696 counter->hw.config = cpuhw->events[n0];
697 ++cpuhw->n_counters;
698 ++cpuhw->n_added;
699
700 ret = 0;
701 out:
702 perf_enable();
703 local_irq_restore(flags);
704 return ret;
705}
706
707/*
708 * Remove a counter from the PMU.
709 */
710static void power_pmu_disable(struct perf_counter *counter)
711{
712 struct cpu_hw_counters *cpuhw;
713 long i;
714 unsigned long flags;
715
716 local_irq_save(flags);
717 perf_disable();
718
719 power_pmu_read(counter);
720
721 cpuhw = &__get_cpu_var(cpu_hw_counters);
722 for (i = 0; i < cpuhw->n_counters; ++i) {
723 if (counter == cpuhw->counter[i]) {
724 while (++i < cpuhw->n_counters)
725 cpuhw->counter[i-1] = cpuhw->counter[i];
726 --cpuhw->n_counters;
727 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
728 if (counter->hw.idx) {
729 write_pmc(counter->hw.idx, 0);
730 counter->hw.idx = 0;
731 }
732 perf_counter_update_userpage(counter);
733 break;
734 }
735 }
736 for (i = 0; i < cpuhw->n_limited; ++i)
737 if (counter == cpuhw->limited_counter[i])
738 break;
739 if (i < cpuhw->n_limited) {
740 while (++i < cpuhw->n_limited) {
741 cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
742 cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
743 }
744 --cpuhw->n_limited;
745 }
746 if (cpuhw->n_counters == 0) {
747 /* disable exceptions if no counters are running */
748 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
749 }
750
751 perf_enable();
752 local_irq_restore(flags);
753}
754
755/*
756 * Re-enable interrupts on a counter after they were throttled
757 * because they were coming too fast.
758 */
759static void power_pmu_unthrottle(struct perf_counter *counter)
760{
761 s64 val, left;
762 unsigned long flags;
763
764 if (!counter->hw.idx || !counter->hw.sample_period)
765 return;
766 local_irq_save(flags);
767 perf_disable();
768 power_pmu_read(counter);
769 left = counter->hw.sample_period;
770 counter->hw.last_period = left;
771 val = 0;
772 if (left < 0x80000000L)
773 val = 0x80000000L - left;
774 write_pmc(counter->hw.idx, val);
775 atomic64_set(&counter->hw.prev_count, val);
776 atomic64_set(&counter->hw.period_left, left);
777 perf_counter_update_userpage(counter);
778 perf_enable();
779 local_irq_restore(flags);
780}
781
/* Operations table returned by hw_perf_counter_init() for hardware counters. */
782struct pmu power_pmu = {
783 .enable = power_pmu_enable,
784 .disable = power_pmu_disable,
785 .read = power_pmu_read,
786 .unthrottle = power_pmu_unthrottle,
787};
788
789/*
790 * Return 1 if we might be able to put counter on a limited PMC,
791 * or 0 if not.
792 * A counter can only go on a limited PMC if it counts something
793 * that a limited PMC can count, doesn't require interrupts, and
794 * doesn't exclude any processor mode.
795 */
796static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
797 unsigned int flags)
798{
799 int n;
800 u64 alt[MAX_EVENT_ALTERNATIVES];
801
802 if (counter->attr.exclude_user
803 || counter->attr.exclude_kernel
804 || counter->attr.exclude_hv
805 || counter->attr.sample_period)
806 return 0;
807
808 if (ppmu->limited_pmc_event(ev))
809 return 1;
810
811 /*
812 * The requested event isn't on a limited PMC already;
813 * see if any alternative code goes on a limited PMC.
814 */
815 if (!ppmu->get_alternatives)
816 return 0;
817
818 flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
819 n = ppmu->get_alternatives(ev, flags, alt);
820
821 return n > 0;
822}
823
824/*
825 * Find an alternative event that goes on a normal PMC, if possible,
826 * and return the event code, or 0 if there is no such alternative.
827 * (Note: event code 0 is "don't count" on all machines.)
828 */
829static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
830{
831 u64 alt[MAX_EVENT_ALTERNATIVES];
832 int n;
833
834 flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
835 n = ppmu->get_alternatives(ev, flags, alt);
836 if (!n)
837 return 0;
838 return alt[0];
839}
840
841/* Number of perf_counters counting hardware events */
842static atomic_t num_counters;
843/* Used to avoid races in calling reserve/release_pmc_hardware */
844static DEFINE_MUTEX(pmc_reserve_mutex);
845
846/*
847 * Release the PMU if this is the last perf_counter.
848 */
849static void hw_perf_counter_destroy(struct perf_counter *counter)
850{
851 if (!atomic_add_unless(&num_counters, -1, 1)) {
852 mutex_lock(&pmc_reserve_mutex);
853 if (atomic_dec_return(&num_counters) == 0)
854 release_pmc_hardware();
855 mutex_unlock(&pmc_reserve_mutex);
856 }
857}
858
859/*
860 * Translate a generic cache event config to a raw event code.
861 */
862static int hw_perf_cache_event(u64 config, u64 *eventp)
863{
864 unsigned long type, op, result;
865 int ev;
866
867 if (!ppmu->cache_events)
868 return -EINVAL;
869
870 /* unpack config */
871 type = config & 0xff;
872 op = (config >> 8) & 0xff;
873 result = (config >> 16) & 0xff;
874
875 if (type >= PERF_COUNT_HW_CACHE_MAX ||
876 op >= PERF_COUNT_HW_CACHE_OP_MAX ||
877 result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
878 return -EINVAL;
879
880 ev = (*ppmu->cache_events)[type][op][result];
881 if (ev == 0)
882 return -EOPNOTSUPP;
883 if (ev == -1)
884 return -EINVAL;
885 *eventp = ev;
886 return 0;
887}
888
889const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
890{
891 u64 ev;
892 unsigned long flags;
893 struct perf_counter *ctrs[MAX_HWCOUNTERS];
894 u64 events[MAX_HWCOUNTERS];
895 unsigned int cflags[MAX_HWCOUNTERS];
896 int n;
897 int err;
898
899 if (!ppmu)
900 return ERR_PTR(-ENXIO);
901 switch (counter->attr.type) {
902 case PERF_TYPE_HARDWARE:
903 ev = counter->attr.config;
904 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
905 return ERR_PTR(-EOPNOTSUPP);
906 ev = ppmu->generic_events[ev];
907 break;
908 case PERF_TYPE_HW_CACHE:
909 err = hw_perf_cache_event(counter->attr.config, &ev);
910 if (err)
911 return ERR_PTR(err);
912 break;
913 case PERF_TYPE_RAW:
914 ev = counter->attr.config;
915 break;
916 }
917 counter->hw.config_base = ev;
918 counter->hw.idx = 0;
919
920 /*
921 * If we are not running on a hypervisor, force the
922 * exclude_hv bit to 0 so that we don't care what
923 * the user set it to.
924 */
925 if (!firmware_has_feature(FW_FEATURE_LPAR))
926 counter->attr.exclude_hv = 0;
927
928 /*
929 * If this is a per-task counter, then we can use
930 * PM_RUN_* events interchangeably with their non RUN_*
931 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
932 * XXX we should check if the task is an idle task.
933 */
934 flags = 0;
935 if (counter->ctx->task)
936 flags |= PPMU_ONLY_COUNT_RUN;
937
938 /*
939 * If this machine has limited counters, check whether this
940 * event could go on a limited counter.
941 */
942 if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
943 if (can_go_on_limited_pmc(counter, ev, flags)) {
944 flags |= PPMU_LIMITED_PMC_OK;
945 } else if (ppmu->limited_pmc_event(ev)) {
946 /*
947 * The requested event is on a limited PMC,
948 * but we can't use a limited PMC; see if any
949 * alternative goes on a normal PMC.
950 */
951 ev = normal_pmc_alternative(ev, flags);
952 if (!ev)
953 return ERR_PTR(-EINVAL);
954 }
955 }
956
957 /*
958 * If this is in a group, check if it can go on with all the
959 * other hardware counters in the group. We assume the counter
960 * hasn't been linked into its leader's sibling list at this point.
961 */
962 n = 0;
963 if (counter->group_leader != counter) {
964 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
965 ctrs, events, cflags);
966 if (n < 0)
967 return ERR_PTR(-EINVAL);
968 }
969 events[n] = ev;
970 ctrs[n] = counter;
971 cflags[n] = flags;
972 if (check_excludes(ctrs, cflags, n, 1))
973 return ERR_PTR(-EINVAL);
974 if (power_check_constraints(events, cflags, n + 1))
975 return ERR_PTR(-EINVAL);
976
977 counter->hw.config = events[n];
978 counter->hw.counter_base = cflags[n];
979 counter->hw.last_period = counter->hw.sample_period;
980 atomic64_set(&counter->hw.period_left, counter->hw.last_period);
981
982 /*
983 * See if we need to reserve the PMU.
984 * If no counters are currently in use, then we have to take a
985 * mutex to ensure that we don't race with another task doing
986 * reserve_pmc_hardware or release_pmc_hardware.
987 */
988 err = 0;
989 if (!atomic_inc_not_zero(&num_counters)) {
990 mutex_lock(&pmc_reserve_mutex);
991 if (atomic_read(&num_counters) == 0 &&
992 reserve_pmc_hardware(perf_counter_interrupt))
993 err = -EBUSY;
994 else
995 atomic_inc(&num_counters);
996 mutex_unlock(&pmc_reserve_mutex);
997 }
998 counter->destroy = hw_perf_counter_destroy;
999
1000 if (err)
1001 return ERR_PTR(err);
1002 return &power_pmu;
1003}
1004
1005/*
1006 * A counter has overflowed; update its count and record
1007 * things if requested. Note that interrupts are hard-disabled
1008 * here so there is no possibility of being interrupted.
 *
 * `val' is the value just read from the counter's PMC, `regs' the
 * interrupted register state (regs->dsisr holds the MMCRA value
 * stored by perf_counter_interrupt), and `nmi' is passed through to
 * perf_counter_overflow().
1009 */
1010static void record_and_restart(struct perf_counter *counter, long val,
1011 struct pt_regs *regs, int nmi)
1012{
1013 u64 period = counter->hw.sample_period;
1014 s64 prev, delta, left;
1015 int record = 0;
1016 u64 addr, mmcra, sdsync;
1017
1018 /* we don't have to worry about interrupts here */
1019 prev = atomic64_read(&counter->hw.prev_count);
1020 delta = (val - prev) & 0xfffffffful;
1021 atomic64_add(delta, &counter->count);
1022
1023 /*
1024 * See if the total period for this counter has expired,
1025 * and update for the next period.
 * val becomes the value to reload into the PMC.
1026 */
1027 val = 0;
1028 left = atomic64_read(&counter->hw.period_left) - delta;
1029 if (period) {
1030 if (left <= 0) {
1031 left += period;
 /* still not positive after adding one period: restart with a full one */
1032 if (left <= 0)
1033 left = period;
1034 record = 1;
1035 }
1036 if (left < 0x80000000L)
1037 val = 0x80000000L - left;
1038 }
1039
1040 /*
1041 * Finally record data if requested.
1042 */
1043 if (record) {
1044 struct perf_sample_data data = {
1045 .regs = regs,
1046 .addr = 0,
1047 .period = counter->hw.last_period,
1048 };
1049
1050 if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
1051 /*
1052 * The user wants a data address recorded.
1053 * If we're not doing instruction sampling,
1054 * give them the SDAR (sampled data address).
1055 * If we are doing instruction sampling, then only
1056 * give them the SDAR if it corresponds to the
1057 * instruction pointed to by SIAR; this is indicated
1058 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
1059 */
1060 mmcra = regs->dsisr;
1061 sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
1062 POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
1063 if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
1064 data.addr = mfspr(SPRN_SDAR);
1065 }
1066 if (perf_counter_overflow(counter, nmi, &data)) {
1067 /*
1068 * Interrupts are coming too fast - throttle them
1069 * by setting the counter to 0, so it will be
1070 * at least 2^30 cycles until the next interrupt
1071 * (assuming each counter counts at most 2 counts
1072 * per cycle).
1073 */
1074 val = 0;
1075 left = ~0ULL >> 1;
1076 }
1077 }
1078
 /* Restart the PMC and remember the new baseline and remaining period. */
1079 write_pmc(counter->hw.idx, val);
1080 atomic64_set(&counter->hw.prev_count, val);
1081 atomic64_set(&counter->hw.period_left, left);
1082 perf_counter_update_userpage(counter);
1083}
1084
1085/*
1086 * Called from generic code to get the misc flags (i.e. processor mode)
1087 * for an event.
1088 */
1089unsigned long perf_misc_flags(struct pt_regs *regs)
1090{
1091 unsigned long mmcra;
1092
1093 if (TRAP(regs) != 0xf00) {
1094 /* not a PMU interrupt */
1095 return user_mode(regs) ? PERF_EVENT_MISC_USER :
1096 PERF_EVENT_MISC_KERNEL;
1097 }
1098
1099 mmcra = regs->dsisr;
1100 if (ppmu->flags & PPMU_ALT_SIPR) {
1101 if (mmcra & POWER6_MMCRA_SIHV)
1102 return PERF_EVENT_MISC_HYPERVISOR;
1103 return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
1104 PERF_EVENT_MISC_KERNEL;
1105 }
1106 if (mmcra & MMCRA_SIHV)
1107 return PERF_EVENT_MISC_HYPERVISOR;
1108 return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
1109 PERF_EVENT_MISC_KERNEL;
1110}
1111
1112/*
1113 * Called from generic code to get the instruction pointer
1114 * for an event.
1115 */
1116unsigned long perf_instruction_pointer(struct pt_regs *regs)
1117{
1118 unsigned long mmcra;
1119 unsigned long ip;
1120 unsigned long slot;
1121
1122 if (TRAP(regs) != 0xf00)
1123 return regs->nip; /* not a PMU interrupt */
1124
1125 ip = mfspr(SPRN_SIAR);
1126 mmcra = regs->dsisr;
1127 if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
1128 slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
1129 if (slot > 1)
1130 ip += 4 * (slot - 1);
1131 }
1132 return ip;
1133}
1134
1135/*
1136 * Performance monitor interrupt stuff
 *
 * Interrupt handler registered through reserve_pmc_hardware(): finds
 * the counters whose PMC has gone negative (bit 31 set), records and
 * restarts them, then rewrites MMCR0 from the cached value.
1137 */
1138static void perf_counter_interrupt(struct pt_regs *regs)
1139{
1140 int i;
1141 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
1142 struct perf_counter *counter;
1143 long val;
1144 int found = 0;
1145 int nmi;
1146
 /* Account the limited counters now; write_mmcr0() below thaws them again. */
1147 if (cpuhw->n_limited)
1148 freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
1149 mfspr(SPRN_PMC6));
1150
1151 /*
1152 * Overload regs->dsisr to store MMCRA so we only need to read it once.
1153 */
1154 regs->dsisr = mfspr(SPRN_MMCRA);
1155
1156 /*
1157 * If interrupts were soft-disabled when this PMU interrupt
1158 * occurred, treat it as an NMI.
1159 */
1160 nmi = !regs->softe;
1161 if (nmi)
1162 nmi_enter();
1163 else
1164 irq_enter();
1165
1166 for (i = 0; i < cpuhw->n_counters; ++i) {
1167 counter = cpuhw->counter[i];
1168 if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
1169 continue;
1170 val = read_pmc(counter->hw.idx);
1171 if ((int)val < 0) {
1172 /* counter has overflowed */
1173 found = 1;
1174 record_and_restart(counter, val, regs, nmi);
1175 }
1176 }
1177
1178 /*
1179 * In case we didn't find and reset the counter that caused
1180 * the interrupt, scan all counters and reset any that are
1181 * negative, to avoid getting continual interrupts.
1182 * Any that we processed in the previous loop will not be negative.
1183 */
1184 if (!found) {
1185 for (i = 0; i < ppmu->n_counter; ++i) {
1186 if (is_limited_pmc(i + 1))
1187 continue;
1188 val = read_pmc(i + 1);
1189 if ((int)val < 0)
1190 write_pmc(i + 1, 0);
1191 }
1192 }
1193
1194 /*
1195 * Reset MMCR0 to its normal value. This will set PMXE and
1196 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1197 * and thus allow interrupts to occur again.
1198 * XXX might want to use MSR.PM to keep the counters frozen until
1199 * we get back out of this interrupt.
1200 */
1201 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1202
1203 if (nmi)
1204 nmi_exit();
1205 else
1206 irq_exit();
1207}
1208
/*
 * Reset the per-cpu counter state of `cpu' to all zeroes, with the
 * freeze-counters bit set in the cached MMCR0 value.
 */
1209void hw_perf_counter_setup(int cpu)
1210{
1211 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
1212
1213 memset(cpuhw, 0, sizeof(*cpuhw));
1214 cpuhw->mmcr[0] = MMCR0_FC;
1215}
1216
1217extern struct power_pmu power4_pmu;
1218extern struct power_pmu ppc970_pmu;
1219extern struct power_pmu power5_pmu;
1220extern struct power_pmu power5p_pmu;
1221extern struct power_pmu power6_pmu;
1222extern struct power_pmu power7_pmu;
1223
/*
 * Select the PMU description matching the processor version register.
 * ppmu is left untouched (NULL unless set elsewhere) for processor
 * versions not listed here; hw_perf_counter_init() then refuses
 * hardware counters with -ENXIO.  Always returns 0.
 */
1224static int init_perf_counters(void)
1225{
1226 unsigned long pvr;
1227
1228 /* XXX should get this from cputable */
1229 pvr = mfspr(SPRN_PVR);
1230 switch (PVR_VER(pvr)) {
1231 case PV_POWER4:
1232 case PV_POWER4p:
1233 ppmu = &power4_pmu;
1234 break;
1235 case PV_970:
1236 case PV_970FX:
1237 case PV_970MP:
1238 ppmu = &ppc970_pmu;
1239 break;
1240 case PV_POWER5:
1241 ppmu = &power5_pmu;
1242 break;
1243 case PV_POWER5p:
1244 ppmu = &power5p_pmu;
1245 break;
1246 case 0x3e: /* selects the POWER6 description */
1247 ppmu = &power6_pmu;
1248 break;
1249 case 0x3f: /* selects the POWER7 description */
1250 ppmu = &power7_pmu;
1251 break;
1252 }
1253
1254 /*
1255 * Use FCHV to ignore kernel events if MSR.HV is set.
1256 */
1257 if (mfmsr() & MSR_HV)
1258 freeze_counters_kernel = MMCR0_FCHV;
1259
1260 return 0;
1261}
1262
1263arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 000000000000..07bd308a5fa7
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,598 @@
1/*
2 * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER4
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_LOWER_SH 6
23#define PM_LOWER_MSK 1
24#define PM_LOWER_MSKS 0x40
25#define PM_BYTE_SH 4 /* Byte number of event bus to use */
26#define PM_BYTE_MSK 3
27#define PM_PMCSEL_MSK 7
28
29/*
30 * Unit code values
31 */
32#define PM_FPU 1
33#define PM_ISU1 2
34#define PM_IFU 3
35#define PM_IDU0 4
36#define PM_ISU1_ALT 6
37#define PM_ISU2 7
38#define PM_IFU_ALT 8
39#define PM_LSU0 9
40#define PM_LSU1 0xc
41#define PM_GPS 0xf
42
43/*
44 * Bits in MMCR0 for POWER4
45 */
46#define MMCR0_PMC1SEL_SH 8
47#define MMCR0_PMC2SEL_SH 1
48#define MMCR_PMCSEL_MSK 0x1f
49
50/*
51 * Bits in MMCR1 for POWER4
52 */
53#define MMCR1_TTM0SEL_SH 62
54#define MMCR1_TTC0SEL_SH 61
55#define MMCR1_TTM1SEL_SH 59
56#define MMCR1_TTC1SEL_SH 58
57#define MMCR1_TTM2SEL_SH 56
58#define MMCR1_TTC2SEL_SH 55
59#define MMCR1_TTM3SEL_SH 53
60#define MMCR1_TTC3SEL_SH 52
61#define MMCR1_TTMSEL_MSK 3
62#define MMCR1_TD_CP_DBG0SEL_SH 50
63#define MMCR1_TD_CP_DBG1SEL_SH 48
64#define MMCR1_TD_CP_DBG2SEL_SH 46
65#define MMCR1_TD_CP_DBG3SEL_SH 44
66#define MMCR1_DEBUG0SEL_SH 43
67#define MMCR1_DEBUG1SEL_SH 42
68#define MMCR1_DEBUG2SEL_SH 41
69#define MMCR1_DEBUG3SEL_SH 40
70#define MMCR1_PMC1_ADDER_SEL_SH 39
71#define MMCR1_PMC2_ADDER_SEL_SH 38
72#define MMCR1_PMC6_ADDER_SEL_SH 37
73#define MMCR1_PMC5_ADDER_SEL_SH 36
74#define MMCR1_PMC8_ADDER_SEL_SH 35
75#define MMCR1_PMC7_ADDER_SEL_SH 34
76#define MMCR1_PMC3_ADDER_SEL_SH 33
77#define MMCR1_PMC4_ADDER_SEL_SH 32
78#define MMCR1_PMC3SEL_SH 27
79#define MMCR1_PMC4SEL_SH 22
80#define MMCR1_PMC5SEL_SH 17
81#define MMCR1_PMC6SEL_SH 12
82#define MMCR1_PMC7SEL_SH 7
83#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
84
85static short mmcr1_adder_bits[8] = {
86 MMCR1_PMC1_ADDER_SEL_SH,
87 MMCR1_PMC2_ADDER_SEL_SH,
88 MMCR1_PMC3_ADDER_SEL_SH,
89 MMCR1_PMC4_ADDER_SEL_SH,
90 MMCR1_PMC5_ADDER_SEL_SH,
91 MMCR1_PMC6_ADDER_SEL_SH,
92 MMCR1_PMC7_ADDER_SEL_SH,
93 MMCR1_PMC8_ADDER_SEL_SH
94};
95
96/*
97 * Bits in MMCRA
98 */
99#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
100
101/*
102 * Layout of constraint bits:
103 * 6666555555555544444444443333333333222222222211111111110000000000
104 * 3210987654321098765432109876543210987654321098765432109876543210
105 * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
106 * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
107 * \SMPL ||\TTC3SEL
108 * |\TTC_IFU_SEL
109 * \TTM2SEL0
110 *
111 * SMPL - SAMPLE_ENABLE constraint
112 * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
113 *
114 * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
115 * 55: UC1 error 0x0080_0000_0000_0000
116 * 54: FPU events needed 0x0040_0000_0000_0000
117 * 53: ISU1 events needed 0x0020_0000_0000_0000
118 * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
119 *
120 * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
121 * 51: UC2 error 0x0008_0000_0000_0000
122 * 50: FPU events needed 0x0004_0000_0000_0000
123 * 49: IFU events needed 0x0002_0000_0000_0000
124 * 48: LSU0 events needed 0x0001_0000_0000_0000
125 *
126 * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
127 * 47: UC3 error 0x8000_0000_0000
128 * 46: LSU0 events needed 0x4000_0000_0000
129 * 45: IFU events needed 0x2000_0000_0000
130 * 44: IDU0|ISU2 events needed 0x1000_0000_0000
131 * 43: ISU1 events needed 0x0800_0000_0000
132 *
133 * TTM2SEL0
134 * 42: 0 = IDU0 events needed
135 * 1 = ISU2 events needed 0x0400_0000_0000
136 *
137 * TTC_IFU_SEL
138 * 41: 0 = IFU.U events needed
139 * 1 = IFU.L events needed 0x0200_0000_0000
140 *
141 * TTC3SEL
142 * 40: 0 = LSU1.U events needed
143 * 1 = LSU1.L events needed 0x0100_0000_0000
144 *
145 * PS1
146 * 39: PS1 error 0x0080_0000_0000
147 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
148 *
149 * PS2
150 * 35: PS2 error 0x0008_0000_0000
151 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
152 *
153 * B0
154 * 28-31: Byte 0 event source 0xf000_0000
155 * 1 = FPU
156 * 2 = ISU1
157 * 3 = IFU
158 * 4 = IDU0
159 * 7 = ISU2
160 * 9 = LSU0
161 * c = LSU1
162 * f = GPS
163 *
164 * B1, B2, B3
165 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
166 *
167 * P8
168 * 15: P8 error 0x8000
169 * 14-15: Count of events needing PMC8
170 *
171 * P1..P7
172 * 0-13: Count of events needing PMC1..PMC7
173 *
174 * Note: this doesn't allow events using IFU.U to be combined with events
175 * using IFU.L, though that is feasible (using TTM0 and TTM2). However
176 * there are no listed events for IFU.L (they are debug events not
177 * verified for performance monitoring) so this shouldn't cause a
178 * problem.
179 */
180
181static struct unitinfo {
182 u64 value, mask;
183 int unit;
184 int lowerbit;
185} p4_unitinfo[16] = {
186 [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
187 [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
188 [PM_ISU1_ALT] =
189 { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
190 [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
191 [PM_IFU_ALT] =
192 { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
193 [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
194 [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
195 [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
196 [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
197 [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
198};
199
/*
 * For each PMC (0-based index), a bitmap of the PMCSEL values whose
 * direct event on that PMC is a marked-instruction event: bit n set
 * means PMCSEL value n is marked.
 */
static unsigned char direct_marked_event[8] = {
	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
	[0] = (1<<2) | (1<<3),
	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
	[1] = (1<<3) | (1<<5),
	/* PMC3: PM_MRK_ST_CMPL_INT */
	[2] = (1<<3),
	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
	[3] = (1<<4) | (1<<5),
	/* PMC5: PM_MRK_GRP_TIMEO */
	[4] = (1<<4) | (1<<5),
	/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
	[5] = (1<<3) | (1<<4) | (1<<5),
	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
	[6] = (1<<4) | (1<<5),
	/* PMC8: PM_MRK_LSU_FIN */
	[7] = (1<<4),
};
211
212/*
213 * Returns 1 if event counts things relating to marked instructions
214 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
215 */
216static int p4_marked_instr_event(u64 event)
217{
218 int pmc, psel, unit, byte, bit;
219 unsigned int mask;
220
221 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
222 psel = event & PM_PMCSEL_MSK;
223 if (pmc) {
224 if (direct_marked_event[pmc - 1] & (1 << psel))
225 return 1;
226 if (psel == 0) /* add events */
227 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
228 else if (psel == 6) /* decode events */
229 bit = 4;
230 else
231 return 0;
232 } else
233 bit = psel;
234
235 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
236 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
237 mask = 0;
238 switch (unit) {
239 case PM_LSU1:
240 if (event & PM_LOWER_MSKS)
241 mask = 1 << 28; /* byte 7 bit 4 */
242 else
243 mask = 6 << 24; /* byte 3 bits 1 and 2 */
244 break;
245 case PM_LSU0:
246 /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
247 mask = 0x083dff00;
248 }
249 return (mask >> (byte * 8 + bit)) & 1;
250}
251
252static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
253{
254 int pmc, byte, unit, lower, sh;
255 u64 mask = 0, value = 0;
256 int grp = -1;
257
258 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
259 if (pmc) {
260 if (pmc > 8)
261 return -1;
262 sh = (pmc - 1) * 2;
263 mask |= 2 << sh;
264 value |= 1 << sh;
265 grp = ((pmc - 1) >> 1) & 1;
266 }
267 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
268 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
269 if (unit) {
270 lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
271
272 /*
273 * Bus events on bytes 0 and 2 can be counted
274 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
275 */
276 if (!pmc)
277 grp = byte & 1;
278
279 if (!p4_unitinfo[unit].unit)
280 return -1;
281 mask |= p4_unitinfo[unit].mask;
282 value |= p4_unitinfo[unit].value;
283 sh = p4_unitinfo[unit].lowerbit;
284 if (sh > 1)
285 value |= (u64)lower << sh;
286 else if (lower != sh)
287 return -1;
288 unit = p4_unitinfo[unit].unit;
289
290 /* Set byte lane select field */
291 mask |= 0xfULL << (28 - 4 * byte);
292 value |= (u64)unit << (28 - 4 * byte);
293 }
294 if (grp == 0) {
295 /* increment PMC1/2/5/6 field */
296 mask |= 0x8000000000ull;
297 value |= 0x1000000000ull;
298 } else {
299 /* increment PMC3/4/7/8 field */
300 mask |= 0x800000000ull;
301 value |= 0x100000000ull;
302 }
303
304 /* Marked instruction events need sample_enable set */
305 if (p4_marked_instr_event(event)) {
306 mask |= 1ull << 56;
307 value |= 1ull << 56;
308 }
309
310 /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
311 if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
312 mask |= 1ull << 56;
313
314 *maskp = mask;
315 *valp = value;
316 return 0;
317}
318
319static unsigned int ppc_inst_cmpl[] = {
320 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
321};
322
323static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
324{
325 int i, j, na;
326
327 alt[0] = event;
328 na = 1;
329
330 /* 2 possibilities for PM_GRP_DISP_REJECT */
331 if (event == 0x8003 || event == 0x0224) {
332 alt[1] = event ^ (0x8003 ^ 0x0224);
333 return 2;
334 }
335
336 /* 2 possibilities for PM_ST_MISS_L1 */
337 if (event == 0x0c13 || event == 0x0c23) {
338 alt[1] = event ^ (0x0c13 ^ 0x0c23);
339 return 2;
340 }
341
342 /* several possibilities for PM_INST_CMPL */
343 for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
344 if (event == ppc_inst_cmpl[i]) {
345 for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
346 if (j != i)
347 alt[na++] = ppc_inst_cmpl[j];
348 break;
349 }
350 }
351
352 return na;
353}
354
355static int p4_compute_mmcr(u64 event[], int n_ev,
356 unsigned int hwc[], u64 mmcr[])
357{
358 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
359 unsigned int pmc, unit, byte, psel, lower;
360 unsigned int ttm, grp;
361 unsigned int pmc_inuse = 0;
362 unsigned int pmc_grp_use[2];
363 unsigned char busbyte[4];
364 unsigned char unituse[16];
365 unsigned int unitlower = 0;
366 int i;
367
368 if (n_ev > 8)
369 return -1;
370
371 /* First pass to count resource use */
372 pmc_grp_use[0] = pmc_grp_use[1] = 0;
373 memset(busbyte, 0, sizeof(busbyte));
374 memset(unituse, 0, sizeof(unituse));
375 for (i = 0; i < n_ev; ++i) {
376 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
377 if (pmc) {
378 if (pmc_inuse & (1 << (pmc - 1)))
379 return -1;
380 pmc_inuse |= 1 << (pmc - 1);
381 /* count 1/2/5/6 vs 3/4/7/8 use */
382 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
383 }
384 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
385 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
386 lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
387 if (unit) {
388 if (!pmc)
389 ++pmc_grp_use[byte & 1];
390 if (unit == 6 || unit == 8)
391 /* map alt ISU1/IFU codes: 6->2, 8->3 */
392 unit = (unit >> 1) - 1;
393 if (busbyte[byte] && busbyte[byte] != unit)
394 return -1;
395 busbyte[byte] = unit;
396 lower <<= unit;
397 if (unituse[unit] && lower != (unitlower & lower))
398 return -1;
399 unituse[unit] = 1;
400 unitlower |= lower;
401 }
402 }
403 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
404 return -1;
405
406 /*
407 * Assign resources and set multiplexer selects.
408 *
409 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
410 * Each TTMx can only select one unit, but since
411 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
412 * we have some choices.
413 */
414 if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
415 unituse[6] = 1; /* Move 2 to 6 */
416 unituse[2] = 0;
417 }
418 if (unituse[3] & (unituse[1] | unituse[2])) {
419 unituse[8] = 1; /* Move 3 to 8 */
420 unituse[3] = 0;
421 unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
422 }
423 /* Check only one unit per TTMx */
424 if (unituse[1] + unituse[2] + unituse[3] > 1 ||
425 unituse[4] + unituse[6] + unituse[7] > 1 ||
426 unituse[8] + unituse[9] > 1 ||
427 (unituse[5] | unituse[10] | unituse[11] |
428 unituse[13] | unituse[14]))
429 return -1;
430
431 /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
432 mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
433 mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
434 mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
435
436 /* Set TTCxSEL fields. */
437 if (unitlower & 0xe)
438 mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
439 if (unitlower & 0xf0)
440 mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
441 if (unitlower & 0xf00)
442 mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
443 if (unitlower & 0x7000)
444 mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
445
446 /* Set byte lane select fields. */
447 for (byte = 0; byte < 4; ++byte) {
448 unit = busbyte[byte];
449 if (!unit)
450 continue;
451 if (unit == 0xf) {
452 /* special case for GPS */
453 mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
454 } else {
455 if (!unituse[unit])
456 ttm = unit - 1; /* 2->1, 3->2 */
457 else
458 ttm = unit >> 2;
459 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
460 }
461 }
462
463 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
464 for (i = 0; i < n_ev; ++i) {
465 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
466 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
467 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
468 psel = event[i] & PM_PMCSEL_MSK;
469 if (!pmc) {
470 /* Bus event or 00xxx direct event (off or cycles) */
471 if (unit)
472 psel |= 0x10 | ((byte & 2) << 2);
473 for (pmc = 0; pmc < 8; ++pmc) {
474 if (pmc_inuse & (1 << pmc))
475 continue;
476 grp = (pmc >> 1) & 1;
477 if (unit) {
478 if (grp == (byte & 1))
479 break;
480 } else if (pmc_grp_use[grp] < 4) {
481 ++pmc_grp_use[grp];
482 break;
483 }
484 }
485 pmc_inuse |= 1 << pmc;
486 } else {
487 /* Direct event */
488 --pmc;
489 if (psel == 0 && (byte & 2))
490 /* add events on higher-numbered bus */
491 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
492 else if (psel == 6 && byte == 3)
493 /* seem to need to set sample_enable here */
494 mmcra |= MMCRA_SAMPLE_ENABLE;
495 psel |= 8;
496 }
497 if (pmc <= 1)
498 mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
499 else
500 mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
501 if (pmc == 7) /* PMC8 */
502 mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
503 hwc[i] = pmc;
504 if (p4_marked_instr_event(event[i]))
505 mmcra |= MMCRA_SAMPLE_ENABLE;
506 }
507
508 if (pmc_inuse & 1)
509 mmcr0 |= MMCR0_PMC1CE;
510 if (pmc_inuse & 0xfe)
511 mmcr0 |= MMCR0_PMCjCE;
512
513 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
514
515 /* Return MMCRx values */
516 mmcr[0] = mmcr0;
517 mmcr[1] = mmcr1;
518 mmcr[2] = mmcra;
519 return 0;
520}
521
522static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
523{
524 /*
525 * Setting the PMCxSEL field to 0 disables PMC x.
526 * (Note that pmc is 0-based here, not 1-based.)
527 */
528 if (pmc <= 1) {
529 mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
530 } else {
531 mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
532 if (pmc == 7)
533 mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
534 }
535}
536
537static int p4_generic_events[] = {
538 [PERF_COUNT_HW_CPU_CYCLES] = 7,
539 [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001,
540 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
541 [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
542 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
543 [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
544};
545
546#define C(x) PERF_COUNT_HW_CACHE_##x
547
548/*
549 * Table of generalized cache-related events.
550 * 0 means not supported, -1 means nonsensical, other values
551 * are event codes.
552 */
553static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
554 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
555 [C(OP_READ)] = { 0x8c10, 0x3c10 },
556 [C(OP_WRITE)] = { 0x7c10, 0xc13 },
557 [C(OP_PREFETCH)] = { 0xc35, 0 },
558 },
559 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
560 [C(OP_READ)] = { 0, 0 },
561 [C(OP_WRITE)] = { -1, -1 },
562 [C(OP_PREFETCH)] = { 0, 0 },
563 },
564 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
565 [C(OP_READ)] = { 0, 0 },
566 [C(OP_WRITE)] = { 0, 0 },
567 [C(OP_PREFETCH)] = { 0xc34, 0 },
568 },
569 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
570 [C(OP_READ)] = { 0, 0x904 },
571 [C(OP_WRITE)] = { -1, -1 },
572 [C(OP_PREFETCH)] = { -1, -1 },
573 },
574 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
575 [C(OP_READ)] = { 0, 0x900 },
576 [C(OP_WRITE)] = { -1, -1 },
577 [C(OP_PREFETCH)] = { -1, -1 },
578 },
579 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
580 [C(OP_READ)] = { 0x330, 0x331 },
581 [C(OP_WRITE)] = { -1, -1 },
582 [C(OP_PREFETCH)] = { -1, -1 },
583 },
584};
585
586struct power_pmu power4_pmu = {
587 .n_counter = 8,
588 .max_alternatives = 5,
589 .add_fields = 0x0000001100005555ull,
590 .test_adder = 0x0011083300000000ull,
591 .compute_mmcr = p4_compute_mmcr,
592 .get_constraint = p4_get_constraint,
593 .get_alternatives = p4_get_alternatives,
594 .disable_pmc = p4_disable_pmc,
595 .n_generic = ARRAY_SIZE(p4_generic_events),
596 .generic_events = p4_generic_events,
597 .cache_events = &power4_cache_events,
598};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 000000000000..41e5d2d958d4
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,671 @@
1/*
2 * Performance counter support for POWER5+/++ (not POWER5) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5+
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><>
82 * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * NC - number of counters
85 * 51: NC error 0x0008_0000_0000_0000
86 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
87 *
88 * G0..G3 - GRS mux constraints
89 * 46-47: GRS_L2SEL value
90 * 44-45: GRS_L3SEL value
91 * 41-43: GRS_MCSEL value
92 * 39-40: GRS_FABSEL value
93 * Note that these match up with their bit positions in MMCR1
94 *
95 * T0 - TTM0 constraint
96 * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
97 *
98 * T1 - TTM1 constraint
99 * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 33: UC error 0x02_0000_0000
103 * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
104 * 31: ISU0 events needed 0x00_8000_0000
105 * 30: IDU|GRS events needed 0x00_4000_0000
106 *
107 * B0
108 * 24-27: Byte 0 event source 0x0f00_0000
109 * Encoding as for the event code
110 *
111 * B1, B2, B3
112 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
113 *
114 * P6
115 * 11: P6 error 0x800
116 * 10-11: Count of events needing PMC6
117 *
118 * P1..P5
119 * 0-9: Count of events needing PMC1..PMC5
120 */
121
122static const int grsel_shift[8] = {
123 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
124 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
125 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
126};
127
128/* Masks and values for using events from the various units */
129static u64 unit_cons[PM_LASTUNIT+1][2] = {
130 [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
131 [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
132 [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
133 [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
134 [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
135 [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
136};
137
138static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
139{
140 int pmc, byte, unit, sh;
141 int bit, fmask;
142 u64 mask = 0, value = 0;
143
144 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
145 if (pmc) {
146 if (pmc > 6)
147 return -1;
148 sh = (pmc - 1) * 2;
149 mask |= 2 << sh;
150 value |= 1 << sh;
151 if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
152 return -1;
153 }
154 if (event & PM_BUSEVENT_MSK) {
155 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
156 if (unit > PM_LASTUNIT)
157 return -1;
158 if (unit == PM_ISU0_ALT)
159 unit = PM_ISU0;
160 mask |= unit_cons[unit][0];
161 value |= unit_cons[unit][1];
162 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
163 if (byte >= 4) {
164 if (unit != PM_LSU1)
165 return -1;
166 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
167 ++unit;
168 byte &= 3;
169 }
170 if (unit == PM_GRS) {
171 bit = event & 7;
172 fmask = (bit == 6)? 7: 3;
173 sh = grsel_shift[bit];
174 mask |= (u64)fmask << sh;
175 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
176 }
177 /* Set byte lane select field */
178 mask |= 0xfULL << (24 - 4 * byte);
179 value |= (u64)unit << (24 - 4 * byte);
180 }
181 if (pmc < 5) {
182 /* need a counter from PMC1-4 set */
183 mask |= 0x8000000000000ull;
184 value |= 0x1000000000000ull;
185 }
186 *maskp = mask;
187 *valp = value;
188 return 0;
189}
190
191static int power5p_limited_pmc_event(u64 event)
192{
193 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
194
195 return pmc == 5 || pmc == 6;
196}
197
#define MAX_ALT	3	/* at most 3 alternatives for any event */

/*
 * Groups of event codes that count the same thing; unused trailing
 * slots are 0.  Rows must stay sorted by their first entry, because
 * find_alternative() stops at the first row whose first entry exceeds
 * the event being looked up.
 */
static const unsigned int event_alternatives[][MAX_ALT] = {
	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */
	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */
	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */
	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
};

/*
 * Scan the alternatives table for a match and return the
 * index into the alternatives table if found, else -1.
 */
static int find_alternative(unsigned int event)
{
	int row, col;

	for (row = 0; row < ARRAY_SIZE(event_alternatives); ++row) {
		const unsigned int *alts = event_alternatives[row];

		/* sorted table: no later row can contain the event */
		if (alts[0] > event)
			return -1;
		for (col = 0; col < MAX_ALT; ++col) {
			if (!alts[col])
				break;
			if (alts[col] == event)
				return row;
		}
	}
	return -1;
}
231
232static const unsigned char bytedecode_alternatives[4][4] = {
233 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
234 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
235 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
236 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
237};
238
239/*
240 * Some direct events for decodes of event bus byte 3 have alternative
241 * PMCSEL values on other counters. This returns the alternative
242 * event code for those that do, or -1 otherwise. This also handles
243 * alternative PCMSEL values for add events.
244 */
245static s64 find_alternative_bdecode(u64 event)
246{
247 int pmc, altpmc, pp, j;
248
249 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
250 if (pmc == 0 || pmc > 4)
251 return -1;
252 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
253 pp = event & PM_PMCSEL_MSK;
254 for (j = 0; j < 4; ++j) {
255 if (bytedecode_alternatives[pmc - 1][j] == pp) {
256 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
257 (altpmc << PM_PMC_SH) |
258 bytedecode_alternatives[altpmc - 1][j];
259 }
260 }
261
262 /* new decode alternatives for power5+ */
263 if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
264 return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
265 if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
266 return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
267
268 /* alternative add event encodings */
269 if (pp == 0x10 || pp == 0x28)
270 return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
271 (altpmc << PM_PMC_SH);
272
273 return -1;
274}
275
276static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
277{
278 int i, j, nalt = 1;
279 int nlim;
280 s64 ae;
281
282 alt[0] = event;
283 nalt = 1;
284 nlim = power5p_limited_pmc_event(event);
285 i = find_alternative(event);
286 if (i >= 0) {
287 for (j = 0; j < MAX_ALT; ++j) {
288 ae = event_alternatives[i][j];
289 if (ae && ae != event)
290 alt[nalt++] = ae;
291 nlim += power5p_limited_pmc_event(ae);
292 }
293 } else {
294 ae = find_alternative_bdecode(event);
295 if (ae > 0)
296 alt[nalt++] = ae;
297 }
298
299 if (flags & PPMU_ONLY_COUNT_RUN) {
300 /*
301 * We're only counting in RUN state,
302 * so PM_CYC is equivalent to PM_RUN_CYC
303 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
304 * This doesn't include alternatives that don't provide
305 * any extra flexibility in assigning PMCs (e.g.
306 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
307 * Note that even with these additional alternatives
308 * we never end up with more than 3 alternatives for any event.
309 */
310 j = nalt;
311 for (i = 0; i < nalt; ++i) {
312 switch (alt[i]) {
313 case 0xf: /* PM_CYC */
314 alt[j++] = 0x600005; /* PM_RUN_CYC */
315 ++nlim;
316 break;
317 case 0x600005: /* PM_RUN_CYC */
318 alt[j++] = 0xf;
319 break;
320 case 0x100009: /* PM_INST_CMPL */
321 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
322 ++nlim;
323 break;
324 case 0x500009: /* PM_RUN_INST_CMPL */
325 alt[j++] = 0x100009; /* PM_INST_CMPL */
326 alt[j++] = 0x200009;
327 break;
328 }
329 }
330 nalt = j;
331 }
332
333 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
334 /* remove the limited PMC events */
335 j = 0;
336 for (i = 0; i < nalt; ++i) {
337 if (!power5p_limited_pmc_event(alt[i])) {
338 alt[j] = alt[i];
339 ++j;
340 }
341 }
342 nalt = j;
343 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
344 /* remove all but the limited PMC events */
345 j = 0;
346 for (i = 0; i < nalt; ++i) {
347 if (power5p_limited_pmc_event(alt[i])) {
348 alt[j] = alt[i];
349 ++j;
350 }
351 }
352 nalt = j;
353 }
354
355 return nalt;
356}
357
358/*
359 * Map of which direct events on which PMCs are marked instruction events.
360 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
361 * Bit 0 is set if it is marked for all PMCs.
362 * The 0x80 bit indicates a byte decode PMCSEL value.
363 */
364static unsigned char direct_event_is_marked[0x28] = {
365 0, /* 00 */
366 0x1f, /* 01 PM_IOPS_CMPL */
367 0x2, /* 02 PM_MRK_GRP_DISP */
368 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
369 0, /* 04 */
370 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
371 0x80, /* 06 */
372 0x80, /* 07 */
373 0, 0, 0,/* 08 - 0a */
374 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
375 0, /* 0c */
376 0x80, /* 0d */
377 0x80, /* 0e */
378 0, /* 0f */
379 0, /* 10 */
380 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
381 0, /* 12 */
382 0x10, /* 13 PM_MRK_GRP_CMPL */
383 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
384 0x2, /* 15 PM_MRK_GRP_ISSUED */
385 0x80, /* 16 */
386 0x80, /* 17 */
387 0, 0, 0, 0, 0,
388 0x80, /* 1d */
389 0x80, /* 1e */
390 0, /* 1f */
391 0x80, /* 20 */
392 0x80, /* 21 */
393 0x80, /* 22 */
394 0x80, /* 23 */
395 0x80, /* 24 */
396 0x80, /* 25 */
397 0x80, /* 26 */
398 0x80, /* 27 */
399};
400
401/*
402 * Returns 1 if event counts things relating to marked instructions
403 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
404 */
405static int power5p_marked_instr_event(u64 event)
406{
407 int pmc, psel;
408 int bit, byte, unit;
409 u32 mask;
410
411 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
412 psel = event & PM_PMCSEL_MSK;
413 if (pmc >= 5)
414 return 0;
415
416 bit = -1;
417 if (psel < sizeof(direct_event_is_marked)) {
418 if (direct_event_is_marked[psel] & (1 << pmc))
419 return 1;
420 if (direct_event_is_marked[psel] & 0x80)
421 bit = 4;
422 else if (psel == 0x08)
423 bit = pmc - 1;
424 else if (psel == 0x10)
425 bit = 4 - pmc;
426 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
427 bit = 4;
428 } else if ((psel & 0x48) == 0x40) {
429 bit = psel & 7;
430 } else if (psel == 0x28) {
431 bit = pmc - 1;
432 } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
433 bit = 4;
434 }
435
436 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
437 return 0;
438
439 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
440 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
441 if (unit == PM_LSU0) {
442 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
443 mask = 0x5dff00;
444 } else if (unit == PM_LSU1 && byte >= 4) {
445 byte -= 4;
446 /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
447 mask = 0x5f11c000;
448 } else
449 return 0;
450
451 return (mask >> (byte * 8 + bit)) & 1;
452}
453
454static int power5p_compute_mmcr(u64 event[], int n_ev,
455 unsigned int hwc[], u64 mmcr[])
456{
457 u64 mmcr1 = 0;
458 u64 mmcra = 0;
459 unsigned int pmc, unit, byte, psel;
460 unsigned int ttm;
461 int i, isbus, bit, grsel;
462 unsigned int pmc_inuse = 0;
463 unsigned char busbyte[4];
464 unsigned char unituse[16];
465 int ttmuse;
466
467 if (n_ev > 6)
468 return -1;
469
470 /* First pass to count resource use */
471 memset(busbyte, 0, sizeof(busbyte));
472 memset(unituse, 0, sizeof(unituse));
473 for (i = 0; i < n_ev; ++i) {
474 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
475 if (pmc) {
476 if (pmc > 6)
477 return -1;
478 if (pmc_inuse & (1 << (pmc - 1)))
479 return -1;
480 pmc_inuse |= 1 << (pmc - 1);
481 }
482 if (event[i] & PM_BUSEVENT_MSK) {
483 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
484 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
485 if (unit > PM_LASTUNIT)
486 return -1;
487 if (unit == PM_ISU0_ALT)
488 unit = PM_ISU0;
489 if (byte >= 4) {
490 if (unit != PM_LSU1)
491 return -1;
492 ++unit;
493 byte &= 3;
494 }
495 if (busbyte[byte] && busbyte[byte] != unit)
496 return -1;
497 busbyte[byte] = unit;
498 unituse[unit] = 1;
499 }
500 }
501
502 /*
503 * Assign resources and set multiplexer selects.
504 *
505 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
506 * choice we have to deal with.
507 */
508 if (unituse[PM_ISU0] &
509 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
510 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
511 unituse[PM_ISU0] = 0;
512 }
513 /* Set TTM[01]SEL fields. */
514 ttmuse = 0;
515 for (i = PM_FPU; i <= PM_ISU1; ++i) {
516 if (!unituse[i])
517 continue;
518 if (ttmuse++)
519 return -1;
520 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
521 }
522 ttmuse = 0;
523 for (; i <= PM_GRS; ++i) {
524 if (!unituse[i])
525 continue;
526 if (ttmuse++)
527 return -1;
528 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
529 }
530 if (ttmuse > 1)
531 return -1;
532
533 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
534 for (byte = 0; byte < 4; ++byte) {
535 unit = busbyte[byte];
536 if (!unit)
537 continue;
538 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
539 /* get ISU0 through TTM1 rather than TTM0 */
540 unit = PM_ISU0_ALT;
541 } else if (unit == PM_LSU1 + 1) {
542 /* select lower word of LSU1 for this byte */
543 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
544 }
545 ttm = unit >> 2;
546 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
547 }
548
549 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
550 for (i = 0; i < n_ev; ++i) {
551 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
552 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
553 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
554 psel = event[i] & PM_PMCSEL_MSK;
555 isbus = event[i] & PM_BUSEVENT_MSK;
556 if (!pmc) {
557 /* Bus event or any-PMC direct event */
558 for (pmc = 0; pmc < 4; ++pmc) {
559 if (!(pmc_inuse & (1 << pmc)))
560 break;
561 }
562 if (pmc >= 4)
563 return -1;
564 pmc_inuse |= 1 << pmc;
565 } else if (pmc <= 4) {
566 /* Direct event */
567 --pmc;
568 if (isbus && (byte & 2) &&
569 (psel == 8 || psel == 0x10 || psel == 0x28))
570 /* add events on higher-numbered bus */
571 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
572 } else {
573 /* Instructions or run cycles on PMC5/6 */
574 --pmc;
575 }
576 if (isbus && unit == PM_GRS) {
577 bit = psel & 7;
578 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
579 mmcr1 |= (u64)grsel << grsel_shift[bit];
580 }
581 if (power5p_marked_instr_event(event[i]))
582 mmcra |= MMCRA_SAMPLE_ENABLE;
583 if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
584 /* select alternate byte lane */
585 psel |= 0x10;
586 if (pmc <= 3)
587 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
588 hwc[i] = pmc;
589 }
590
591 /* Return MMCRx values */
592 mmcr[0] = 0;
593 if (pmc_inuse & 1)
594 mmcr[0] = MMCR0_PMC1CE;
595 if (pmc_inuse & 0x3e)
596 mmcr[0] |= MMCR0_PMCjCE;
597 mmcr[1] = mmcr1;
598 mmcr[2] = mmcra;
599 return 0;
600}
601
602static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
603{
604 if (pmc <= 3)
605 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
606}
607
608static int power5p_generic_events[] = {
609 [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
610 [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
611 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
612 [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
613 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
614 [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
615};
616
617#define C(x) PERF_COUNT_HW_CACHE_##x
618
619/*
620 * Table of generalized cache-related events.
621 * 0 means not supported, -1 means nonsensical, other values
622 * are event codes.
623 */
624static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
625 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
626 [C(OP_READ)] = { 0x1c10a8, 0x3c1088 },
627 [C(OP_WRITE)] = { 0x2c10a8, 0xc10c3 },
628 [C(OP_PREFETCH)] = { 0xc70e7, -1 },
629 },
630 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
631 [C(OP_READ)] = { 0, 0 },
632 [C(OP_WRITE)] = { -1, -1 },
633 [C(OP_PREFETCH)] = { 0, 0 },
634 },
635 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
636 [C(OP_READ)] = { 0, 0 },
637 [C(OP_WRITE)] = { 0, 0 },
638 [C(OP_PREFETCH)] = { 0xc50c3, 0 },
639 },
640 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
641 [C(OP_READ)] = { 0xc20e4, 0x800c4 },
642 [C(OP_WRITE)] = { -1, -1 },
643 [C(OP_PREFETCH)] = { -1, -1 },
644 },
645 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
646 [C(OP_READ)] = { 0, 0x800c0 },
647 [C(OP_WRITE)] = { -1, -1 },
648 [C(OP_PREFETCH)] = { -1, -1 },
649 },
650 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
651 [C(OP_READ)] = { 0x230e4, 0x230e5 },
652 [C(OP_WRITE)] = { -1, -1 },
653 [C(OP_PREFETCH)] = { -1, -1 },
654 },
655};
656
657struct power_pmu power5p_pmu = {
658 .n_counter = 6,
659 .max_alternatives = MAX_ALT,
660 .add_fields = 0x7000000000055ull,
661 .test_adder = 0x3000040000000ull,
662 .compute_mmcr = power5p_compute_mmcr,
663 .get_constraint = power5p_get_constraint,
664 .get_alternatives = power5p_get_alternatives,
665 .disable_pmc = power5p_disable_pmc,
666 .limited_pmc_event = power5p_limited_pmc_event,
667 .flags = PPMU_LIMITED_PMC5_6,
668 .n_generic = ARRAY_SIZE(power5p_generic_events),
669 .generic_events = power5p_generic_events,
670 .cache_events = &power5p_cache_events,
671};
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 000000000000..05600b66221a
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,611 @@
1/*
2 * Performance counter support for POWER5 (not POWER5++) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
/*
 * Fields of an event code for POWER5 (not POWER5++)
 */

/* PMC number (1-based) for direct events */
#define PM_PMC_SH	20
#define PM_PMC_MSK	0xf
#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)

/* TTMMUX number and setting - unit select */
#define PM_UNIT_SH	16
#define PM_UNIT_MSK	0xf

/* Byte number of event bus to use */
#define PM_BYTE_SH	12
#define PM_BYTE_MSK	7

/* Storage subsystem mux select */
#define PM_GRS_SH	8
#define PM_GRS_MSK	7

/* Set if event uses event bus */
#define PM_BUSEVENT_MSK	0x80

/* PMCxSEL value (low 7 bits of the event code) */
#define PM_PMCSEL_MSK	0x7f

/* Values in PM_UNIT field */
#define PM_FPU		0
#define PM_ISU0		1
#define PM_IFU		2
#define PM_ISU1		3
#define PM_IDU		4
#define PM_ISU0_ALT	6
#define PM_GRS		7
#define PM_LSU0		8
#define PM_LSU1		0xc
#define PM_LASTUNIT	0xc
41
/*
 * Bit positions and field masks in MMCR1 for POWER5
 */

/* TTM multiplexer selects */
#define MMCR1_TTM0SEL_SH	62
#define MMCR1_TTM1SEL_SH	60
#define MMCR1_TTM2SEL_SH	58
#define MMCR1_TTM3SEL_SH	56
#define MMCR1_TTMSEL_MSK	3

/* Byte lane selects, one field per event bus byte */
#define MMCR1_TD_CP_DBG0SEL_SH	54
#define MMCR1_TD_CP_DBG1SEL_SH	52
#define MMCR1_TD_CP_DBG2SEL_SH	50
#define MMCR1_TD_CP_DBG3SEL_SH	48

/* GRS mux selects */
#define MMCR1_GRS_L2SEL_SH	46
#define MMCR1_GRS_L2SEL_MSK	3
#define MMCR1_GRS_L3SEL_SH	44
#define MMCR1_GRS_L3SEL_MSK	3
#define MMCR1_GRS_MCSEL_SH	41
#define MMCR1_GRS_MCSEL_MSK	7
#define MMCR1_GRS_FABSEL_SH	39
#define MMCR1_GRS_FABSEL_MSK	3

/* Adder selects, one bit per PMC1..PMC4 */
#define MMCR1_PMC1_ADDER_SEL_SH	35
#define MMCR1_PMC2_ADDER_SEL_SH	34
#define MMCR1_PMC3_ADDER_SEL_SH	33
#define MMCR1_PMC4_ADDER_SEL_SH	32

/* PMCxSEL fields; MMCR1_PMCSEL_SH() takes a 0-based PMC index */
#define MMCR1_PMC1SEL_SH	25
#define MMCR1_PMC2SEL_SH	17
#define MMCR1_PMC3SEL_SH	9
#define MMCR1_PMC4SEL_SH	1
#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
#define MMCR1_PMCSEL_MSK	0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
82 * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * T0 - TTM0 constraint
85 * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
86 *
87 * T1 - TTM1 constraint
88 * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
89 *
90 * NC - number of counters
91 * 51: NC error 0x0008_0000_0000_0000
92 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
93 *
94 * G0..G3 - GRS mux constraints
95 * 46-47: GRS_L2SEL value
96 * 44-45: GRS_L3SEL value
97 * 41-43: GRS_MCSEL value
98 * 39-40: GRS_FABSEL value
99 * Note that these match up with their bit positions in MMCR1
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 37: UC3 error 0x20_0000_0000
103 * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
104 * 35: ISU0 events needed 0x08_0000_0000
105 * 34: IDU|GRS events needed 0x04_0000_0000
106 *
107 * PS1
108 * 33: PS1 error 0x2_0000_0000
109 * 31-32: count of events needing PMC1/2 0x1_8000_0000
110 *
111 * PS2
112 * 30: PS2 error 0x4000_0000
113 * 28-29: count of events needing PMC3/4 0x3000_0000
114 *
115 * B0
116 * 24-27: Byte 0 event source 0x0f00_0000
117 * Encoding as for the event code
118 *
119 * B1, B2, B3
120 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
121 *
122 * P1..P6
123 * 0-11: Count of events needing PMC1..PMC6
124 */
125
126static const int grsel_shift[8] = {
127 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
128 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
129 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
130};
131
132/* Masks and values for using events from the various units */
133static u64 unit_cons[PM_LASTUNIT+1][2] = {
134 [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
135 [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
136 [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
137 [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
138 [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
139 [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
140};
141
142static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
143{
144 int pmc, byte, unit, sh;
145 int bit, fmask;
146 u64 mask = 0, value = 0;
147 int grp = -1;
148
149 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
150 if (pmc) {
151 if (pmc > 6)
152 return -1;
153 sh = (pmc - 1) * 2;
154 mask |= 2 << sh;
155 value |= 1 << sh;
156 if (pmc <= 4)
157 grp = (pmc - 1) >> 1;
158 else if (event != 0x500009 && event != 0x600005)
159 return -1;
160 }
161 if (event & PM_BUSEVENT_MSK) {
162 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
163 if (unit > PM_LASTUNIT)
164 return -1;
165 if (unit == PM_ISU0_ALT)
166 unit = PM_ISU0;
167 mask |= unit_cons[unit][0];
168 value |= unit_cons[unit][1];
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 if (byte >= 4) {
171 if (unit != PM_LSU1)
172 return -1;
173 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
174 ++unit;
175 byte &= 3;
176 }
177 if (unit == PM_GRS) {
178 bit = event & 7;
179 fmask = (bit == 6)? 7: 3;
180 sh = grsel_shift[bit];
181 mask |= (u64)fmask << sh;
182 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
183 }
184 /*
185 * Bus events on bytes 0 and 2 can be counted
186 * on PMC1/2; bytes 1 and 3 on PMC3/4.
187 */
188 if (!pmc)
189 grp = byte & 1;
190 /* Set byte lane select field */
191 mask |= 0xfULL << (24 - 4 * byte);
192 value |= (u64)unit << (24 - 4 * byte);
193 }
194 if (grp == 0) {
195 /* increment PMC1/2 field */
196 mask |= 0x200000000ull;
197 value |= 0x080000000ull;
198 } else if (grp == 1) {
199 /* increment PMC3/4 field */
200 mask |= 0x40000000ull;
201 value |= 0x10000000ull;
202 }
203 if (pmc < 5) {
204 /* need a counter from PMC1-4 set */
205 mask |= 0x8000000000000ull;
206 value |= 0x1000000000000ull;
207 }
208 *maskp = mask;
209 *valp = value;
210 return 0;
211}
212
#define MAX_ALT	3	/* at most 3 alternatives for any event */

/*
 * Groups of event codes that are alternatives of one another.
 * Unused trailing slots in a row are zero.
 */
static const unsigned int event_alternatives[][MAX_ALT] = {
	/* PM_GRP_DISP_REJECT */
	{ 0x120e4,	0x400002 },
	/* PM_THRD_L2MISS_BOTH_CYC */
	{ 0x410c7,	0x441084 },
	/* PM_RUN_CYC */
	{ 0x100005,	0x600005 },
	/* PM_INST_CMPL */
	{ 0x100009,	0x200009,	0x500009 },
	/* PM_INST_DISP */
	{ 0x300009,	0x400009 },
};
222
223/*
224 * Scan the alternatives table for a match and return the
225 * index into the alternatives table if found, else -1.
226 */
227static int find_alternative(u64 event)
228{
229 int i, j;
230
231 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
232 if (event < event_alternatives[i][0])
233 break;
234 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
235 if (event == event_alternatives[i][j])
236 return i;
237 }
238 return -1;
239}
240
/*
 * PMCSEL values of the byte-decode events that have an alternative on
 * another counter.  Rows are PMC1..PMC4; entries in the same column of
 * two paired rows are alternatives of each other.
 */
static const unsigned char bytedecode_alternatives[4][4] = {
	[0] = { 0x21, 0x23, 0x25, 0x27 },	/* PMC 1 */
	[1] = { 0x07, 0x17, 0x0e, 0x1e },	/* PMC 2 */
	[2] = { 0x20, 0x22, 0x24, 0x26 },	/* PMC 3 */
	[3] = { 0x07, 0x17, 0x0e, 0x1e },	/* PMC 4 */
};
247
248/*
249 * Some direct events for decodes of event bus byte 3 have alternative
250 * PMCSEL values on other counters. This returns the alternative
251 * event code for those that do, or -1 otherwise.
252 */
253static s64 find_alternative_bdecode(u64 event)
254{
255 int pmc, altpmc, pp, j;
256
257 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
258 if (pmc == 0 || pmc > 4)
259 return -1;
260 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
261 pp = event & PM_PMCSEL_MSK;
262 for (j = 0; j < 4; ++j) {
263 if (bytedecode_alternatives[pmc - 1][j] == pp) {
264 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
265 (altpmc << PM_PMC_SH) |
266 bytedecode_alternatives[altpmc - 1][j];
267 }
268 }
269 return -1;
270}
271
272static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
273{
274 int i, j, nalt = 1;
275 s64 ae;
276
277 alt[0] = event;
278 nalt = 1;
279 i = find_alternative(event);
280 if (i >= 0) {
281 for (j = 0; j < MAX_ALT; ++j) {
282 ae = event_alternatives[i][j];
283 if (ae && ae != event)
284 alt[nalt++] = ae;
285 }
286 } else {
287 ae = find_alternative_bdecode(event);
288 if (ae > 0)
289 alt[nalt++] = ae;
290 }
291 return nalt;
292}
293
/*
 * Map of which direct events on which PMCs are marked instruction events.
 * Indexed by PMCSEL value.  Bit n (n = 1..4) is tested for events fixed
 * to PMCn, and bit 0 for events whose PMC field is zero.
 * The 0x80 bit indicates a byte decode PMCSEL value.
 * PMCSEL values not listed here have an entry of 0.
 */
static unsigned char direct_event_is_marked[0x28] = {
	[0x01] = 0x1f,	/* PM_IOPS_CMPL */
	[0x02] = 0x2,	/* PM_MRK_GRP_DISP */
	[0x03] = 0xe,	/* PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
	[0x05] = 0x1c,	/* PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
	[0x06] = 0x80,
	[0x07] = 0x80,
	[0x0b] = 0x18,	/* PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
	[0x0d] = 0x80,
	[0x0e] = 0x80,
	[0x11] = 0x14,	/* PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
	[0x13] = 0x10,	/* PM_MRK_GRP_CMPL */
	[0x14] = 0x1f,	/* PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
	[0x15] = 0x2,	/* PM_MRK_GRP_ISSUED */
	[0x16] = 0x80,
	[0x17] = 0x80,
	[0x1d] = 0x80,
	[0x1e] = 0x80,
	[0x20] = 0x80,
	[0x21] = 0x80,
	[0x22] = 0x80,
	[0x23] = 0x80,
	[0x24] = 0x80,
	[0x25] = 0x80,
	[0x26] = 0x80,
	[0x27] = 0x80,
};
336
337/*
338 * Returns 1 if event counts things relating to marked instructions
339 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
340 */
341static int power5_marked_instr_event(u64 event)
342{
343 int pmc, psel;
344 int bit, byte, unit;
345 u32 mask;
346
347 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
348 psel = event & PM_PMCSEL_MSK;
349 if (pmc >= 5)
350 return 0;
351
352 bit = -1;
353 if (psel < sizeof(direct_event_is_marked)) {
354 if (direct_event_is_marked[psel] & (1 << pmc))
355 return 1;
356 if (direct_event_is_marked[psel] & 0x80)
357 bit = 4;
358 else if (psel == 0x08)
359 bit = pmc - 1;
360 else if (psel == 0x10)
361 bit = 4 - pmc;
362 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
363 bit = 4;
364 } else if ((psel & 0x58) == 0x40)
365 bit = psel & 7;
366
367 if (!(event & PM_BUSEVENT_MSK))
368 return 0;
369
370 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
371 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
372 if (unit == PM_LSU0) {
373 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
374 mask = 0x5dff00;
375 } else if (unit == PM_LSU1 && byte >= 4) {
376 byte -= 4;
377 /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
378 mask = 0x5f00c0aa;
379 } else
380 return 0;
381
382 return (mask >> (byte * 8 + bit)) & 1;
383}
384
385static int power5_compute_mmcr(u64 event[], int n_ev,
386 unsigned int hwc[], u64 mmcr[])
387{
388 u64 mmcr1 = 0;
389 u64 mmcra = 0;
390 unsigned int pmc, unit, byte, psel;
391 unsigned int ttm, grp;
392 int i, isbus, bit, grsel;
393 unsigned int pmc_inuse = 0;
394 unsigned int pmc_grp_use[2];
395 unsigned char busbyte[4];
396 unsigned char unituse[16];
397 int ttmuse;
398
399 if (n_ev > 6)
400 return -1;
401
402 /* First pass to count resource use */
403 pmc_grp_use[0] = pmc_grp_use[1] = 0;
404 memset(busbyte, 0, sizeof(busbyte));
405 memset(unituse, 0, sizeof(unituse));
406 for (i = 0; i < n_ev; ++i) {
407 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
408 if (pmc) {
409 if (pmc > 6)
410 return -1;
411 if (pmc_inuse & (1 << (pmc - 1)))
412 return -1;
413 pmc_inuse |= 1 << (pmc - 1);
414 /* count 1/2 vs 3/4 use */
415 if (pmc <= 4)
416 ++pmc_grp_use[(pmc - 1) >> 1];
417 }
418 if (event[i] & PM_BUSEVENT_MSK) {
419 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
420 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
421 if (unit > PM_LASTUNIT)
422 return -1;
423 if (unit == PM_ISU0_ALT)
424 unit = PM_ISU0;
425 if (byte >= 4) {
426 if (unit != PM_LSU1)
427 return -1;
428 ++unit;
429 byte &= 3;
430 }
431 if (!pmc)
432 ++pmc_grp_use[byte & 1];
433 if (busbyte[byte] && busbyte[byte] != unit)
434 return -1;
435 busbyte[byte] = unit;
436 unituse[unit] = 1;
437 }
438 }
439 if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
440 return -1;
441
442 /*
443 * Assign resources and set multiplexer selects.
444 *
445 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
446 * choice we have to deal with.
447 */
448 if (unituse[PM_ISU0] &
449 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
450 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
451 unituse[PM_ISU0] = 0;
452 }
453 /* Set TTM[01]SEL fields. */
454 ttmuse = 0;
455 for (i = PM_FPU; i <= PM_ISU1; ++i) {
456 if (!unituse[i])
457 continue;
458 if (ttmuse++)
459 return -1;
460 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
461 }
462 ttmuse = 0;
463 for (; i <= PM_GRS; ++i) {
464 if (!unituse[i])
465 continue;
466 if (ttmuse++)
467 return -1;
468 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
469 }
470 if (ttmuse > 1)
471 return -1;
472
473 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
474 for (byte = 0; byte < 4; ++byte) {
475 unit = busbyte[byte];
476 if (!unit)
477 continue;
478 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
479 /* get ISU0 through TTM1 rather than TTM0 */
480 unit = PM_ISU0_ALT;
481 } else if (unit == PM_LSU1 + 1) {
482 /* select lower word of LSU1 for this byte */
483 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
484 }
485 ttm = unit >> 2;
486 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
487 }
488
489 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
490 for (i = 0; i < n_ev; ++i) {
491 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
492 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
493 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
494 psel = event[i] & PM_PMCSEL_MSK;
495 isbus = event[i] & PM_BUSEVENT_MSK;
496 if (!pmc) {
497 /* Bus event or any-PMC direct event */
498 for (pmc = 0; pmc < 4; ++pmc) {
499 if (pmc_inuse & (1 << pmc))
500 continue;
501 grp = (pmc >> 1) & 1;
502 if (isbus) {
503 if (grp == (byte & 1))
504 break;
505 } else if (pmc_grp_use[grp] < 2) {
506 ++pmc_grp_use[grp];
507 break;
508 }
509 }
510 pmc_inuse |= 1 << pmc;
511 } else if (pmc <= 4) {
512 /* Direct event */
513 --pmc;
514 if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
515 /* add events on higher-numbered bus */
516 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
517 } else {
518 /* Instructions or run cycles on PMC5/6 */
519 --pmc;
520 }
521 if (isbus && unit == PM_GRS) {
522 bit = psel & 7;
523 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
524 mmcr1 |= (u64)grsel << grsel_shift[bit];
525 }
526 if (power5_marked_instr_event(event[i]))
527 mmcra |= MMCRA_SAMPLE_ENABLE;
528 if (pmc <= 3)
529 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
530 hwc[i] = pmc;
531 }
532
533 /* Return MMCRx values */
534 mmcr[0] = 0;
535 if (pmc_inuse & 1)
536 mmcr[0] = MMCR0_PMC1CE;
537 if (pmc_inuse & 0x3e)
538 mmcr[0] |= MMCR0_PMCjCE;
539 mmcr[1] = mmcr1;
540 mmcr[2] = mmcra;
541 return 0;
542}
543
544static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
545{
546 if (pmc <= 3)
547 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
548}
549
550static int power5_generic_events[] = {
551 [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
552 [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
553 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
554 [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
555 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
556 [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
557};
558
559#define C(x) PERF_COUNT_HW_CACHE_##x
560
561/*
562 * Table of generalized cache-related events.
563 * 0 means not supported, -1 means nonsensical, other values
564 * are event codes.
565 */
566static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
567 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
568 [C(OP_READ)] = { 0x4c1090, 0x3c1088 },
569 [C(OP_WRITE)] = { 0x3c1090, 0xc10c3 },
570 [C(OP_PREFETCH)] = { 0xc70e7, 0 },
571 },
572 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
573 [C(OP_READ)] = { 0, 0 },
574 [C(OP_WRITE)] = { -1, -1 },
575 [C(OP_PREFETCH)] = { 0, 0 },
576 },
577 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
578 [C(OP_READ)] = { 0, 0x3c309b },
579 [C(OP_WRITE)] = { 0, 0 },
580 [C(OP_PREFETCH)] = { 0xc50c3, 0 },
581 },
582 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
583 [C(OP_READ)] = { 0x2c4090, 0x800c4 },
584 [C(OP_WRITE)] = { -1, -1 },
585 [C(OP_PREFETCH)] = { -1, -1 },
586 },
587 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
588 [C(OP_READ)] = { 0, 0x800c0 },
589 [C(OP_WRITE)] = { -1, -1 },
590 [C(OP_PREFETCH)] = { -1, -1 },
591 },
592 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
593 [C(OP_READ)] = { 0x230e4, 0x230e5 },
594 [C(OP_WRITE)] = { -1, -1 },
595 [C(OP_PREFETCH)] = { -1, -1 },
596 },
597};
598
599struct power_pmu power5_pmu = {
600 .n_counter = 6,
601 .max_alternatives = MAX_ALT,
602 .add_fields = 0x7000090000555ull,
603 .test_adder = 0x3000490000000ull,
604 .compute_mmcr = power5_compute_mmcr,
605 .get_constraint = power5_get_constraint,
606 .get_alternatives = power5_get_alternatives,
607 .disable_pmc = power5_disable_pmc,
608 .n_generic = ARRAY_SIZE(power5_generic_events),
609 .generic_events = power5_generic_events,
610 .cache_events = &power5_cache_events,
611};
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..46f74bebcfd9
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,532 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
/*
 * Fields of an event code for POWER6
 */

/* PMC number (1-based) for direct events */
#define PM_PMC_SH	20
#define PM_PMC_MSK	0x7
#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)

/* Unit event comes (TTMxSEL encoding) */
#define PM_UNIT_SH	16
#define PM_UNIT_MSK	0xf
#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH)

/* Load lookahead match */
#define PM_LLAV		0x8000	/* Load lookahead match value */
#define PM_LLA		0x4000	/* Load lookahead match enable */

/* Byte of event bus to use */
#define PM_BYTE_SH	12
#define PM_BYTE_MSK	3

/* Subunit event comes from (NEST_SEL enc.) */
#define PM_SUBUNIT_SH	8
#define PM_SUBUNIT_MSK	7
#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH)

/* PMCxSEL value */
#define PM_PMCSEL_MSK	0xff

/* Unit, byte and subunit fields together: non-zero for a bus event */
#define PM_BUSEVENT_MSK	0xf3700
33
/*
 * Bit positions, field masks and accessors for MMCR1 on POWER6
 */

/* TTMxSEL: one 4-bit field per event bus byte n */
#define MMCR1_TTM0SEL_SH	60
#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4)
#define MMCR1_TTMSEL_MSK	0xf
#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)

/* Nest (subunit) selector */
#define MMCR1_NESTSEL_SH	45
#define MMCR1_NESTSEL_MSK	0x7
#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)

/* Per-PMC control bits for PMC1; shift right by the PMC index for others */
#define MMCR1_PMC1_LLA		((u64)1 << 44)
#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)

/* PMCxSEL: one 8-bit field per PMC, n is the 0-based PMC index */
#define MMCR1_PMC1SEL_SH	24
#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
#define MMCR1_PMCSEL_MSK	0xff
50
/*
 * Map of which direct events on which PMCs are marked instruction events.
 * Indexed by PMCSEL value >> 1.
 * Bottom 4 bits are a map of which PMCs are interesting,
 * top 4 bits say what sort of event:
 *   0 = direct marked event,
 *   1 = byte decode event,
 *   4 = add/and event (PMC1 -> bits 0 & 4),
 *   5 = add/and event (PMC1 -> bits 1 & 5),
 *   6 = add/and event (PMC1 -> bits 2 & 6),
 *   7 = add/and event (PMC1 -> bits 3 & 7).
 * PMCSEL values not listed here have an entry of 0.
 */
static unsigned char direct_event_is_marked[0x60 >> 1] = {
	[0x06 >> 1] = 0x07,	/* PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
	[0x08 >> 1] = 0x04,	/* PM_MRK_DFU_FIN */
	[0x0a >> 1] = 0x06,	/* PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
	[0x10 >> 1] = 0x02,	/* PM_MRK_INST_DISP */
	[0x12 >> 1] = 0x08,	/* PM_MRK_LSU_DERAT_MISS */
	[0x18 >> 1] = 0x0c,	/* PM_THRESH_TIMEO, PM_MRK_INST_FIN */
	[0x1a >> 1] = 0x0f,	/* PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
	[0x1c >> 1] = 0x01,	/* PM_MRK_INST_ISSUED */
	[0x28 >> 1] = 0x15,	/* PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
	[0x30 >> 1] = 0x4f,
	[0x32 >> 1] = 0x7f,
	[0x34 >> 1] = 0x4f,
	[0x36 >> 1] = 0x5f,
	[0x38 >> 1] = 0x6f,
	[0x3a >> 1] = 0x4f,
	[0x3e >> 1] = 0x08,	/* PM_MRK_INST_TIMEO */
	[0x40 >> 1] = 0x1f,
	[0x42 >> 1] = 0x1f,
	[0x44 >> 1] = 0x1f,
	[0x46 >> 1] = 0x1f,
	[0x48 >> 1] = 0x1f,
	[0x4a >> 1] = 0x1f,
	[0x4c >> 1] = 0x1f,
	[0x4e >> 1] = 0x1f,
	[0x52 >> 1] = 0x05,	/* PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
	[0x54 >> 1] = 0x1c,	/* PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
	[0x56 >> 1] = 0x02,	/* PM_MRK_LD_MISS_L1 */
};
113
114/*
115 * Masks showing for each unit which bits are marked events.
116 * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
117 */
118static u32 marked_bus_events[16] = {
119 0x01000000, /* direct events set 1: byte 3 bit 0 */
120 0x00010000, /* direct events set 2: byte 2 bit 0 */
121 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
122 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
123 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */
124 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
125 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
126 0, /* LSU set 3 */
127 0x00000010, /* VMX set 3: byte 0 bit 4 */
128 0, /* BFP set 1 */
129 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
130 0, 0
131};
132
133/*
134 * Returns 1 if event counts things relating to marked instructions
135 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
136 */
137static int power6_marked_instr_event(u64 event)
138{
139 int pmc, psel, ptype;
140 int bit, byte, unit;
141 u32 mask;
142
143 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
144 psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
145 if (pmc >= 5)
146 return 0;
147
148 bit = -1;
149 if (psel < sizeof(direct_event_is_marked)) {
150 ptype = direct_event_is_marked[psel];
151 if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
152 return 0;
153 ptype >>= 4;
154 if (ptype == 0)
155 return 1;
156 if (ptype == 1)
157 bit = 0;
158 else
159 bit = ptype ^ (pmc - 1);
160 } else if ((psel & 0x48) == 0x40)
161 bit = psel & 7;
162
163 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
164 return 0;
165
166 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
167 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
168 mask = marked_bus_events[unit];
169 return (mask >> (byte * 8 + bit)) & 1;
170}
171
172/*
173 * Assign PMC numbers and compute MMCR1 value for a set of events
174 */
175static int p6_compute_mmcr(u64 event[], int n_ev,
176 unsigned int hwc[], u64 mmcr[])
177{
178 u64 mmcr1 = 0;
179 u64 mmcra = 0;
180 int i;
181 unsigned int pmc, ev, b, u, s, psel;
182 unsigned int ttmset = 0;
183 unsigned int pmc_inuse = 0;
184
185 if (n_ev > 6)
186 return -1;
187 for (i = 0; i < n_ev; ++i) {
188 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
189 if (pmc) {
190 if (pmc_inuse & (1 << (pmc - 1)))
191 return -1; /* collision! */
192 pmc_inuse |= 1 << (pmc - 1);
193 }
194 }
195 for (i = 0; i < n_ev; ++i) {
196 ev = event[i];
197 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
198 if (pmc) {
199 --pmc;
200 } else {
201 /* can go on any PMC; find a free one */
202 for (pmc = 0; pmc < 4; ++pmc)
203 if (!(pmc_inuse & (1 << pmc)))
204 break;
205 if (pmc >= 4)
206 return -1;
207 pmc_inuse |= 1 << pmc;
208 }
209 hwc[i] = pmc;
210 psel = ev & PM_PMCSEL_MSK;
211 if (ev & PM_BUSEVENT_MSK) {
212 /* this event uses the event bus */
213 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
214 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
215 /* check for conflict on this byte of event bus */
216 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
217 return -1;
218 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
219 ttmset |= 1 << b;
220 if (u == 5) {
221 /* Nest events have a further mux */
222 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
223 if ((ttmset & 0x10) &&
224 MMCR1_NESTSEL(mmcr1) != s)
225 return -1;
226 ttmset |= 0x10;
227 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
228 }
229 if (0x30 <= psel && psel <= 0x3d) {
230 /* these need the PMCx_ADDR_SEL bits */
231 if (b >= 2)
232 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
233 }
234 /* bus select values are different for PMC3/4 */
235 if (pmc >= 2 && (psel & 0x90) == 0x80)
236 psel ^= 0x20;
237 }
238 if (ev & PM_LLA) {
239 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
240 if (ev & PM_LLAV)
241 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
242 }
243 if (power6_marked_instr_event(event[i]))
244 mmcra |= MMCRA_SAMPLE_ENABLE;
245 if (pmc < 4)
246 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
247 }
248 mmcr[0] = 0;
249 if (pmc_inuse & 1)
250 mmcr[0] = MMCR0_PMC1CE;
251 if (pmc_inuse & 0xe)
252 mmcr[0] |= MMCR0_PMCjCE;
253 mmcr[1] = mmcr1;
254 mmcr[2] = mmcra;
255 return 0;
256}
257
258/*
259 * Layout of constraint bits:
260 *
261 * 0-1 add field: number of uses of PMC1 (max 1)
262 * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
263 * 12-15 add field: number of uses of PMC1-4 (max 4)
264 * 16-19 select field: unit on byte 0 of event bus
265 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
266 * 32-34 select field: nest (subunit) event selector
267 */
268static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
269{
270 int pmc, byte, sh, subunit;
271 u64 mask = 0, value = 0;
272
273 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
274 if (pmc) {
275 if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
276 return -1;
277 sh = (pmc - 1) * 2;
278 mask |= 2 << sh;
279 value |= 1 << sh;
280 }
281 if (event & PM_BUSEVENT_MSK) {
282 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
283 sh = byte * 4 + (16 - PM_UNIT_SH);
284 mask |= PM_UNIT_MSKS << sh;
285 value |= (u64)(event & PM_UNIT_MSKS) << sh;
286 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
287 subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
288 mask |= (u64)PM_SUBUNIT_MSK << 32;
289 value |= (u64)subunit << 32;
290 }
291 }
292 if (pmc <= 4) {
293 mask |= 0x8000; /* add field for count of PMC1-4 uses */
294 value |= 0x1000;
295 }
296 *maskp = mask;
297 *valp = value;
298 return 0;
299}
300
301static int p6_limited_pmc_event(u64 event)
302{
303 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
304
305 return pmc == 5 || pmc == 6;
306}
307
#define MAX_ALT	4	/* at most 4 alternatives for any event */

/*
 * Groups of event codes that count the same thing.
 *
 * Unused trailing slots are zero.  The codes within a row are in
 * ascending order and the rows are ordered by their first code; the
 * lookup in find_alternatives_list() stops early based on that ordering,
 * so keep it when adding entries.
 */
static const unsigned int event_alternatives[][MAX_ALT] = {
	{ 0x0130e8, 0x2000f6, 0x3000fc },		/* PM_PTEG_RELOAD_VALID */
	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 },	/* PM_LD_MISS_L1 */
	{ 0x080088, 0x200054, 0x3000f0 },		/* PM_ST_MISS_L1 */
	{ 0x10000a, 0x2000f4, 0x600005 },		/* PM_RUN_CYC */
	{ 0x10000b, 0x2000f5 },				/* PM_RUN_COUNT */
	{ 0x10000e, 0x400010 },				/* PM_PURR */
	{ 0x100010, 0x4000f8 },				/* PM_FLUSH */
	{ 0x10001a, 0x200010 },				/* PM_MRK_INST_DISP */
	{ 0x100026, 0x3000f8 },				/* PM_TB_BIT_TRANS */
	{ 0x100054, 0x2000f0 },				/* PM_ST_FIN */
	{ 0x100056, 0x2000fc },				/* PM_L1_ICACHE_MISS */
	{ 0x1000f0, 0x40000a },				/* PM_INST_IMC_MATCH_CMPL */
	{ 0x1000f8, 0x200008 },				/* PM_GCT_EMPTY_CYC */
	{ 0x1000fc, 0x400006 },				/* PM_LSU_DERAT_MISS_CYC */
	{ 0x20000e, 0x400007 },				/* PM_LSU_DERAT_MISS */
	{ 0x200012, 0x300012 },				/* PM_INST_DISP */
	{ 0x2000f2, 0x3000f2 },				/* PM_INST_DISP */
	{ 0x2000f8, 0x300010 },				/* PM_EXT_INT */
	{ 0x2000fe, 0x300056 },				/* PM_DATA_FROM_L2MISS */
	{ 0x2d0030, 0x30001a },				/* PM_MRK_FPU_FIN */
	{ 0x30000a, 0x400018 },				/* PM_MRK_INST_FIN */
	{ 0x3000f6, 0x40000e },				/* PM_L1_DCACHE_RELOAD_VALID */
	{ 0x3000fe, 0x400056 },				/* PM_DATA_FROM_L3MISS */
};
335
336/*
337 * This could be made more efficient with a binary search on
338 * a presorted list, if necessary
339 */
340static int find_alternatives_list(u64 event)
341{
342 int i, j;
343 unsigned int alt;
344
345 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
346 if (event < event_alternatives[i][0])
347 return -1;
348 for (j = 0; j < MAX_ALT; ++j) {
349 alt = event_alternatives[i][j];
350 if (!alt || event < alt)
351 break;
352 if (event == alt)
353 return i;
354 }
355 }
356 return -1;
357}
358
359static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
360{
361 int i, j, nlim;
362 unsigned int psel, pmc;
363 unsigned int nalt = 1;
364 u64 aevent;
365
366 alt[0] = event;
367 nlim = p6_limited_pmc_event(event);
368
369 /* check the alternatives table */
370 i = find_alternatives_list(event);
371 if (i >= 0) {
372 /* copy out alternatives from list */
373 for (j = 0; j < MAX_ALT; ++j) {
374 aevent = event_alternatives[i][j];
375 if (!aevent)
376 break;
377 if (aevent != event)
378 alt[nalt++] = aevent;
379 nlim += p6_limited_pmc_event(aevent);
380 }
381
382 } else {
383 /* Check for alternative ways of computing sum events */
384 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
385 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
386 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
387 if (pmc && (psel == 0x32 || psel == 0x34))
388 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
389 ((5 - pmc) << PM_PMC_SH);
390
391 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
392 if (pmc && (psel == 0x38 || psel == 0x3a))
393 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
394 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
395 }
396
397 if (flags & PPMU_ONLY_COUNT_RUN) {
398 /*
399 * We're only counting in RUN state,
400 * so PM_CYC is equivalent to PM_RUN_CYC,
401 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
402 * This doesn't include alternatives that don't provide
403 * any extra flexibility in assigning PMCs (e.g.
404 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
405 * Note that even with these additional alternatives
406 * we never end up with more than 4 alternatives for any event.
407 */
408 j = nalt;
409 for (i = 0; i < nalt; ++i) {
410 switch (alt[i]) {
411 case 0x1e: /* PM_CYC */
412 alt[j++] = 0x600005; /* PM_RUN_CYC */
413 ++nlim;
414 break;
415 case 0x10000a: /* PM_RUN_CYC */
416 alt[j++] = 0x1e; /* PM_CYC */
417 break;
418 case 2: /* PM_INST_CMPL */
419 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
420 ++nlim;
421 break;
422 case 0x500009: /* PM_RUN_INST_CMPL */
423 alt[j++] = 2; /* PM_INST_CMPL */
424 break;
425 case 0x10000e: /* PM_PURR */
426 alt[j++] = 0x4000f4; /* PM_RUN_PURR */
427 break;
428 case 0x4000f4: /* PM_RUN_PURR */
429 alt[j++] = 0x10000e; /* PM_PURR */
430 break;
431 }
432 }
433 nalt = j;
434 }
435
436 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
437 /* remove the limited PMC events */
438 j = 0;
439 for (i = 0; i < nalt; ++i) {
440 if (!p6_limited_pmc_event(alt[i])) {
441 alt[j] = alt[i];
442 ++j;
443 }
444 }
445 nalt = j;
446 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
447 /* remove all but the limited PMC events */
448 j = 0;
449 for (i = 0; i < nalt; ++i) {
450 if (p6_limited_pmc_event(alt[i])) {
451 alt[j] = alt[i];
452 ++j;
453 }
454 }
455 nalt = j;
456 }
457
458 return nalt;
459}
460
461static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
462{
463 /* Set PMCxSEL to 0 to disable PMCx */
464 if (pmc <= 3)
465 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
466}
467
468static int power6_generic_events[] = {
469 [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
470 [PERF_COUNT_HW_INSTRUCTIONS] = 2,
471 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
472 [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
473 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
474 [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
475};
476
477#define C(x) PERF_COUNT_HW_CACHE_##x
478
479/*
480 * Table of generalized cache-related events.
481 * 0 means not supported, -1 means nonsensical, other values
482 * are event codes.
483 * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
484 */
485static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
486 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
487 [C(OP_READ)] = { 0x80082, 0x80080 },
488 [C(OP_WRITE)] = { 0x80086, 0x80088 },
489 [C(OP_PREFETCH)] = { 0x810a4, 0 },
490 },
491 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
492 [C(OP_READ)] = { 0, 0x100056 },
493 [C(OP_WRITE)] = { -1, -1 },
494 [C(OP_PREFETCH)] = { 0x4008c, 0 },
495 },
496 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
497 [C(OP_READ)] = { 0x150730, 0x250532 },
498 [C(OP_WRITE)] = { 0x250432, 0x150432 },
499 [C(OP_PREFETCH)] = { 0x810a6, 0 },
500 },
501 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
502 [C(OP_READ)] = { 0, 0x20000e },
503 [C(OP_WRITE)] = { -1, -1 },
504 [C(OP_PREFETCH)] = { -1, -1 },
505 },
506 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
507 [C(OP_READ)] = { 0, 0x420ce },
508 [C(OP_WRITE)] = { -1, -1 },
509 [C(OP_PREFETCH)] = { -1, -1 },
510 },
511 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
512 [C(OP_READ)] = { 0x430e6, 0x400052 },
513 [C(OP_WRITE)] = { -1, -1 },
514 [C(OP_PREFETCH)] = { -1, -1 },
515 },
516};
517
518struct power_pmu power6_pmu = {
519 .n_counter = 6,
520 .max_alternatives = MAX_ALT,
521 .add_fields = 0x1555,
522 .test_adder = 0x3000,
523 .compute_mmcr = p6_compute_mmcr,
524 .get_constraint = p6_get_constraint,
525 .get_alternatives = p6_get_alternatives,
526 .disable_pmc = p6_disable_pmc,
527 .limited_pmc_event = p6_limited_pmc_event,
528 .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
529 .n_generic = ARRAY_SIZE(power6_generic_events),
530 .generic_events = power6_generic_events,
531 .cache_events = &power6_cache_events,
532};
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
new file mode 100644
index 000000000000..b3f7d1216bae
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,357 @@
1/*
2 * Performance counter support for POWER7 processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER7
17 */
18#define PM_PMC_SH 16 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 12 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_COMBINE_SH 11 /* Combined event bit */
24#define PM_COMBINE_MSK 1
25#define PM_COMBINE_MSKS 0x800
26#define PM_L2SEL_SH 8 /* L2 event select */
27#define PM_L2SEL_MSK 7
28#define PM_PMCSEL_MSK 0xff
29
30/*
31 * Bits in MMCR1 for POWER7
32 */
33#define MMCR1_TTM0SEL_SH 60
34#define MMCR1_TTM1SEL_SH 56
35#define MMCR1_TTM2SEL_SH 52
36#define MMCR1_TTM3SEL_SH 48
37#define MMCR1_TTMSEL_MSK 0xf
38#define MMCR1_L2SEL_SH 45
39#define MMCR1_L2SEL_MSK 7
40#define MMCR1_PMC1_COMBINE_SH 35
41#define MMCR1_PMC2_COMBINE_SH 34
42#define MMCR1_PMC3_COMBINE_SH 33
43#define MMCR1_PMC4_COMBINE_SH 32
44#define MMCR1_PMC1SEL_SH 24
45#define MMCR1_PMC2SEL_SH 16
46#define MMCR1_PMC3SEL_SH 8
47#define MMCR1_PMC4SEL_SH 0
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Bits in MMCRA
53 */
54
55/*
56 * Layout of constraint bits:
57 * 6666555555555544444444443333333333222222222211111111110000000000
58 * 3210987654321098765432109876543210987654321098765432109876543210
59 * [ ><><><><><><>
60 * NC P6P5P4P3P2P1
61 *
62 * NC - number of counters
63 * 15: NC error 0x8000
64 * 12-14: number of events needing PMC1-4 0x7000
65 *
66 * P6
67 * 11: P6 error 0x800
68 * 10-11: Count of events needing PMC6
69 *
70 * P1..P5
71 * 0-9: Count of events needing PMC1..PMC5
72 */
73
74static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
75{
76 int pmc, sh;
77 u64 mask = 0, value = 0;
78
79 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
80 if (pmc) {
81 if (pmc > 6)
82 return -1;
83 sh = (pmc - 1) * 2;
84 mask |= 2 << sh;
85 value |= 1 << sh;
86 if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
87 return -1;
88 }
89 if (pmc < 5) {
90 /* need a counter from PMC1-4 set */
91 mask |= 0x8000;
92 value |= 0x1000;
93 }
94 *maskp = mask;
95 *valp = value;
96 return 0;
97}
98
#define MAX_ALT	2	/* at most 2 alternatives for any event */

/*
 * Pairs of event codes that count the same thing.  Rows are ordered by
 * their first code; find_alternative() stops scanning based on that.
 */
static const unsigned int event_alternatives[][MAX_ALT] = {
	{ 0x200f2, 0x300f2 },	/* PM_INST_DISP */
	{ 0x200f4, 0x600f4 },	/* PM_RUN_CYC */
	{ 0x400fa, 0x500fa },	/* PM_RUN_INST_CMPL */
};
106
107/*
108 * Scan the alternatives table for a match and return the
109 * index into the alternatives table if found, else -1.
110 */
111static int find_alternative(u64 event)
112{
113 int i, j;
114
115 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
116 if (event < event_alternatives[i][0])
117 break;
118 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
119 if (event == event_alternatives[i][j])
120 return i;
121 }
122 return -1;
123}
124
125static s64 find_alternative_decode(u64 event)
126{
127 int pmc, psel;
128
129 /* this only handles the 4x decode events */
130 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
131 psel = event & PM_PMCSEL_MSK;
132 if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
133 return event - (1 << PM_PMC_SH) + 8;
134 if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
135 return event + (1 << PM_PMC_SH) - 8;
136 return -1;
137}
138
139static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
140{
141 int i, j, nalt = 1;
142 s64 ae;
143
144 alt[0] = event;
145 nalt = 1;
146 i = find_alternative(event);
147 if (i >= 0) {
148 for (j = 0; j < MAX_ALT; ++j) {
149 ae = event_alternatives[i][j];
150 if (ae && ae != event)
151 alt[nalt++] = ae;
152 }
153 } else {
154 ae = find_alternative_decode(event);
155 if (ae > 0)
156 alt[nalt++] = ae;
157 }
158
159 if (flags & PPMU_ONLY_COUNT_RUN) {
160 /*
161 * We're only counting in RUN state,
162 * so PM_CYC is equivalent to PM_RUN_CYC
163 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
164 * This doesn't include alternatives that don't provide
165 * any extra flexibility in assigning PMCs.
166 */
167 j = nalt;
168 for (i = 0; i < nalt; ++i) {
169 switch (alt[i]) {
170 case 0x1e: /* PM_CYC */
171 alt[j++] = 0x600f4; /* PM_RUN_CYC */
172 break;
173 case 0x600f4: /* PM_RUN_CYC */
174 alt[j++] = 0x1e;
175 break;
176 case 0x2: /* PM_PPC_CMPL */
177 alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */
178 break;
179 case 0x500fa: /* PM_RUN_INST_CMPL */
180 alt[j++] = 0x2; /* PM_PPC_CMPL */
181 break;
182 }
183 }
184 nalt = j;
185 }
186
187 return nalt;
188}
189
190/*
191 * Returns 1 if event counts things relating to marked instructions
192 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
193 */
194static int power7_marked_instr_event(u64 event)
195{
196 int pmc, psel;
197 int unit;
198
199 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
200 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
201 psel = event & PM_PMCSEL_MSK & ~1; /* trim off edge/level bit */
202 if (pmc >= 5)
203 return 0;
204
205 switch (psel >> 4) {
206 case 2:
207 return pmc == 2 || pmc == 4;
208 case 3:
209 if (psel == 0x3c)
210 return pmc == 1;
211 if (psel == 0x3e)
212 return pmc != 2;
213 return 1;
214 case 4:
215 case 5:
216 return unit == 0xd;
217 case 6:
218 if (psel == 0x64)
219 return pmc >= 3;
220 case 8:
221 return unit == 0xd;
222 }
223 return 0;
224}
225
226static int power7_compute_mmcr(u64 event[], int n_ev,
227 unsigned int hwc[], u64 mmcr[])
228{
229 u64 mmcr1 = 0;
230 u64 mmcra = 0;
231 unsigned int pmc, unit, combine, l2sel, psel;
232 unsigned int pmc_inuse = 0;
233 int i;
234
235 /* First pass to count resource use */
236 for (i = 0; i < n_ev; ++i) {
237 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
238 if (pmc) {
239 if (pmc > 6)
240 return -1;
241 if (pmc_inuse & (1 << (pmc - 1)))
242 return -1;
243 pmc_inuse |= 1 << (pmc - 1);
244 }
245 }
246
247 /* Second pass: assign PMCs, set all MMCR1 fields */
248 for (i = 0; i < n_ev; ++i) {
249 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
250 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
251 combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
252 l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
253 psel = event[i] & PM_PMCSEL_MSK;
254 if (!pmc) {
255 /* Bus event or any-PMC direct event */
256 for (pmc = 0; pmc < 4; ++pmc) {
257 if (!(pmc_inuse & (1 << pmc)))
258 break;
259 }
260 if (pmc >= 4)
261 return -1;
262 pmc_inuse |= 1 << pmc;
263 } else {
264 /* Direct or decoded event */
265 --pmc;
266 }
267 if (pmc <= 3) {
268 mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
269 mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
270 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
271 if (unit == 6) /* L2 events */
272 mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
273 }
274 if (power7_marked_instr_event(event[i]))
275 mmcra |= MMCRA_SAMPLE_ENABLE;
276 hwc[i] = pmc;
277 }
278
279 /* Return MMCRx values */
280 mmcr[0] = 0;
281 if (pmc_inuse & 1)
282 mmcr[0] = MMCR0_PMC1CE;
283 if (pmc_inuse & 0x3e)
284 mmcr[0] |= MMCR0_PMCjCE;
285 mmcr[1] = mmcr1;
286 mmcr[2] = mmcra;
287 return 0;
288}
289
290static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
291{
292 if (pmc <= 3)
293 mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
294}
295
296static int power7_generic_events[] = {
297 [PERF_COUNT_CPU_CYCLES] = 0x1e,
298 [PERF_COUNT_INSTRUCTIONS] = 2,
299 [PERF_COUNT_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU */
300 [PERF_COUNT_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */
301 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */
302 [PERF_COUNT_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */
303};
304
305#define C(x) PERF_COUNT_HW_CACHE_##x
306
307/*
308 * Table of generalized cache-related events.
309 * 0 means not supported, -1 means nonsensical, other values
310 * are event codes.
311 */
312static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
313 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
314 [C(OP_READ)] = { 0x400f0, 0xc880 },
315 [C(OP_WRITE)] = { 0, 0x300f0 },
316 [C(OP_PREFETCH)] = { 0xd8b8, 0 },
317 },
318 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
319 [C(OP_READ)] = { 0, 0x200fc },
320 [C(OP_WRITE)] = { -1, -1 },
321 [C(OP_PREFETCH)] = { 0x408a, 0 },
322 },
323 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
324 [C(OP_READ)] = { 0x6080, 0x6084 },
325 [C(OP_WRITE)] = { 0x6082, 0x6086 },
326 [C(OP_PREFETCH)] = { 0, 0 },
327 },
328 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
329 [C(OP_READ)] = { 0, 0x300fc },
330 [C(OP_WRITE)] = { -1, -1 },
331 [C(OP_PREFETCH)] = { -1, -1 },
332 },
333 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
334 [C(OP_READ)] = { 0, 0x400fc },
335 [C(OP_WRITE)] = { -1, -1 },
336 [C(OP_PREFETCH)] = { -1, -1 },
337 },
338 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
339 [C(OP_READ)] = { 0x10068, 0x400f6 },
340 [C(OP_WRITE)] = { -1, -1 },
341 [C(OP_PREFETCH)] = { -1, -1 },
342 },
343};
344
345struct power_pmu power7_pmu = {
346 .n_counter = 6,
347 .max_alternatives = MAX_ALT + 1,
348 .add_fields = 0x1555ull,
349 .test_adder = 0x3000ull,
350 .compute_mmcr = power7_compute_mmcr,
351 .get_constraint = power7_get_constraint,
352 .get_alternatives = power7_get_alternatives,
353 .disable_pmc = power7_disable_pmc,
354 .n_generic = ARRAY_SIZE(power7_generic_events),
355 .generic_events = power7_generic_events,
356 .cache_events = &power7_cache_events,
357};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..ba0a357a89f4
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,482 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_SPCSEL_SH 6
23#define PM_SPCSEL_MSK 3
24#define PM_BYTE_SH 4 /* Byte number of event bus to use */
25#define PM_BYTE_MSK 3
26#define PM_PMCSEL_MSK 0xf
27
28/* Values in PM_UNIT field */
29#define PM_NONE 0
30#define PM_FPU 1
31#define PM_VPU 2
32#define PM_ISU 3
33#define PM_IFU 4
34#define PM_IDU 5
35#define PM_STS 6
36#define PM_LSU0 7
37#define PM_LSU1U 8
38#define PM_LSU1L 9
39#define PM_LASTUNIT 9
40
41/*
42 * Bits in MMCR0 for PPC970
43 */
44#define MMCR0_PMC1SEL_SH 8
45#define MMCR0_PMC2SEL_SH 1
46#define MMCR_PMCSEL_MSK 0x1f
47
48/*
49 * Bits in MMCR1 for PPC970
50 */
51#define MMCR1_TTM0SEL_SH 62
52#define MMCR1_TTM1SEL_SH 59
53#define MMCR1_TTM3SEL_SH 53
54#define MMCR1_TTMSEL_MSK 3
55#define MMCR1_TD_CP_DBG0SEL_SH 50
56#define MMCR1_TD_CP_DBG1SEL_SH 48
57#define MMCR1_TD_CP_DBG2SEL_SH 46
58#define MMCR1_TD_CP_DBG3SEL_SH 44
59#define MMCR1_PMC1_ADDER_SEL_SH 39
60#define MMCR1_PMC2_ADDER_SEL_SH 38
61#define MMCR1_PMC6_ADDER_SEL_SH 37
62#define MMCR1_PMC5_ADDER_SEL_SH 36
63#define MMCR1_PMC8_ADDER_SEL_SH 35
64#define MMCR1_PMC7_ADDER_SEL_SH 34
65#define MMCR1_PMC3_ADDER_SEL_SH 33
66#define MMCR1_PMC4_ADDER_SEL_SH 32
67#define MMCR1_PMC3SEL_SH 27
68#define MMCR1_PMC4SEL_SH 22
69#define MMCR1_PMC5SEL_SH 17
70#define MMCR1_PMC6SEL_SH 12
71#define MMCR1_PMC7SEL_SH 7
72#define MMCR1_PMC8SEL_SH 2
73
74static short mmcr1_adder_bits[8] = {
75 MMCR1_PMC1_ADDER_SEL_SH,
76 MMCR1_PMC2_ADDER_SEL_SH,
77 MMCR1_PMC3_ADDER_SEL_SH,
78 MMCR1_PMC4_ADDER_SEL_SH,
79 MMCR1_PMC5_ADDER_SEL_SH,
80 MMCR1_PMC6_ADDER_SEL_SH,
81 MMCR1_PMC7_ADDER_SEL_SH,
82 MMCR1_PMC8_ADDER_SEL_SH
83};
84
85/*
86 * Bits in MMCRA
87 */
88
89/*
90 * Layout of constraint bits:
91 * 6666555555555544444444443333333333222222222211111111110000000000
92 * 3210987654321098765432109876543210987654321098765432109876543210
93 * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
94 * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
95 *
96 * SP - SPCSEL constraint
97 * 48-49: SPCSEL value 0x3_0000_0000_0000
98 *
99 * T0 - TTM0 constraint
100 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
101 *
102 * T1 - TTM1 constraint
103 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
104 *
105 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
106 * 43: UC3 error 0x0800_0000_0000
107 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
108 * 41: ISU events needed 0x0200_0000_0000
109 * 40: IDU|STS events needed 0x0100_0000_0000
110 *
111 * PS1
112 * 39: PS1 error 0x0080_0000_0000
113 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
114 *
115 * PS2
116 * 35: PS2 error 0x0008_0000_0000
117 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
118 *
119 * B0
120 * 28-31: Byte 0 event source 0xf000_0000
121 * Encoding as for the event code
122 *
123 * B1, B2, B3
124 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
125 *
126 * P1
127 * 15: P1 error 0x8000
128 * 14-15: Count of events needing PMC1
129 *
130 * P2..P8
131 * 0-13: Count of events needing PMC2..PMC8
132 */
133
/*
 * Per-PMC bitmap of the direct events that relate to marked
 * instructions: entry n is for PMC(n+1), and bit s is set when PMCSEL
 * value s is such an event on that PMC.
 */
static unsigned char direct_marked_event[8] = {
	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
	[0] = (1<<2) | (1<<3),
	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
	[1] = (1<<3) | (1<<5),
	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
	[2] = (1<<3) | (1<<5),
	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
	[3] = (1<<4) | (1<<5),
	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
	[4] = (1<<4) | (1<<5),
	/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
	[5] = (1<<3) | (1<<4) | (1<<5),
	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
	[6] = (1<<4) | (1<<5),
	/* PMC8: PM_MRK_LSU_FIN */
	[7] = (1<<4),
};
145
146/*
147 * Returns 1 if event counts things relating to marked instructions
148 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
149 */
150static int p970_marked_instr_event(u64 event)
151{
152 int pmc, psel, unit, byte, bit;
153 unsigned int mask;
154
155 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
156 psel = event & PM_PMCSEL_MSK;
157 if (pmc) {
158 if (direct_marked_event[pmc - 1] & (1 << psel))
159 return 1;
160 if (psel == 0) /* add events */
161 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
162 else if (psel == 7 || psel == 13) /* decode events */
163 bit = 4;
164 else
165 return 0;
166 } else
167 bit = psel;
168
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
171 mask = 0;
172 switch (unit) {
173 case PM_VPU:
174 mask = 0x4c; /* byte 0 bits 2,3,6 */
175 case PM_LSU0:
176 /* byte 2 bits 0,2,3,4,6; all of byte 1 */
177 mask = 0x085dff00;
178 case PM_LSU1L:
179 mask = 0x50 << 24; /* byte 3 bits 4,6 */
180 break;
181 }
182 return (mask >> (byte * 8 + bit)) & 1;
183}
184
185/* Masks and values for using events from the various units */
186static u64 unit_cons[PM_LASTUNIT+1][2] = {
187 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
188 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
189 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
190 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
191 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
192 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
193};
194
195static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
196{
197 int pmc, byte, unit, sh, spcsel;
198 u64 mask = 0, value = 0;
199 int grp = -1;
200
201 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
202 if (pmc) {
203 if (pmc > 8)
204 return -1;
205 sh = (pmc - 1) * 2;
206 mask |= 2 << sh;
207 value |= 1 << sh;
208 grp = ((pmc - 1) >> 1) & 1;
209 }
210 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
211 if (unit) {
212 if (unit > PM_LASTUNIT)
213 return -1;
214 mask |= unit_cons[unit][0];
215 value |= unit_cons[unit][1];
216 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
217 /*
218 * Bus events on bytes 0 and 2 can be counted
219 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
220 */
221 if (!pmc)
222 grp = byte & 1;
223 /* Set byte lane select field */
224 mask |= 0xfULL << (28 - 4 * byte);
225 value |= (u64)unit << (28 - 4 * byte);
226 }
227 if (grp == 0) {
228 /* increment PMC1/2/5/6 field */
229 mask |= 0x8000000000ull;
230 value |= 0x1000000000ull;
231 } else if (grp == 1) {
232 /* increment PMC3/4/7/8 field */
233 mask |= 0x800000000ull;
234 value |= 0x100000000ull;
235 }
236 spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
237 if (spcsel) {
238 mask |= 3ull << 48;
239 value |= (u64)spcsel << 48;
240 }
241 *maskp = mask;
242 *valp = value;
243 return 0;
244}
245
246static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
247{
248 alt[0] = event;
249
250 /* 2 alternatives for LSU empty */
251 if (event == 0x2002 || event == 0x3002) {
252 alt[1] = event ^ 0x1000;
253 return 2;
254 }
255
256 return 1;
257}
258
259static int p970_compute_mmcr(u64 event[], int n_ev,
260 unsigned int hwc[], u64 mmcr[])
261{
262 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
263 unsigned int pmc, unit, byte, psel;
264 unsigned int ttm, grp;
265 unsigned int pmc_inuse = 0;
266 unsigned int pmc_grp_use[2];
267 unsigned char busbyte[4];
268 unsigned char unituse[16];
269 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
270 unsigned char ttmuse[2];
271 unsigned char pmcsel[8];
272 int i;
273 int spcsel;
274
275 if (n_ev > 8)
276 return -1;
277
278 /* First pass to count resource use */
279 pmc_grp_use[0] = pmc_grp_use[1] = 0;
280 memset(busbyte, 0, sizeof(busbyte));
281 memset(unituse, 0, sizeof(unituse));
282 for (i = 0; i < n_ev; ++i) {
283 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
284 if (pmc) {
285 if (pmc_inuse & (1 << (pmc - 1)))
286 return -1;
287 pmc_inuse |= 1 << (pmc - 1);
288 /* count 1/2/5/6 vs 3/4/7/8 use */
289 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
290 }
291 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
292 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
293 if (unit) {
294 if (unit > PM_LASTUNIT)
295 return -1;
296 if (!pmc)
297 ++pmc_grp_use[byte & 1];
298 if (busbyte[byte] && busbyte[byte] != unit)
299 return -1;
300 busbyte[byte] = unit;
301 unituse[unit] = 1;
302 }
303 }
304 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
305 return -1;
306
307 /*
308 * Assign resources and set multiplexer selects.
309 *
310 * PM_ISU can go either on TTM0 or TTM1, but that's the only
311 * choice we have to deal with.
312 */
313 if (unituse[PM_ISU] &
314 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
315 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
316 /* Set TTM[01]SEL fields. */
317 ttmuse[0] = ttmuse[1] = 0;
318 for (i = PM_FPU; i <= PM_STS; ++i) {
319 if (!unituse[i])
320 continue;
321 ttm = unitmap[i];
322 ++ttmuse[(ttm >> 2) & 1];
323 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
324 }
325 /* Check only one unit per TTMx */
326 if (ttmuse[0] > 1 || ttmuse[1] > 1)
327 return -1;
328
329 /* Set byte lane select fields and TTM3SEL. */
330 for (byte = 0; byte < 4; ++byte) {
331 unit = busbyte[byte];
332 if (!unit)
333 continue;
334 if (unit <= PM_STS)
335 ttm = (unitmap[unit] >> 2) & 1;
336 else if (unit == PM_LSU0)
337 ttm = 2;
338 else {
339 ttm = 3;
340 if (unit == PM_LSU1L && byte >= 2)
341 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
342 }
343 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
344 }
345
346 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
347 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
348 for (i = 0; i < n_ev; ++i) {
349 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
350 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
351 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
352 psel = event[i] & PM_PMCSEL_MSK;
353 if (!pmc) {
354 /* Bus event or any-PMC direct event */
355 if (unit)
356 psel |= 0x10 | ((byte & 2) << 2);
357 else
358 psel |= 8;
359 for (pmc = 0; pmc < 8; ++pmc) {
360 if (pmc_inuse & (1 << pmc))
361 continue;
362 grp = (pmc >> 1) & 1;
363 if (unit) {
364 if (grp == (byte & 1))
365 break;
366 } else if (pmc_grp_use[grp] < 4) {
367 ++pmc_grp_use[grp];
368 break;
369 }
370 }
371 pmc_inuse |= 1 << pmc;
372 } else {
373 /* Direct event */
374 --pmc;
375 if (psel == 0 && (byte & 2))
376 /* add events on higher-numbered bus */
377 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
378 }
379 pmcsel[pmc] = psel;
380 hwc[i] = pmc;
381 spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
382 mmcr1 |= spcsel;
383 if (p970_marked_instr_event(event[i]))
384 mmcra |= MMCRA_SAMPLE_ENABLE;
385 }
386 for (pmc = 0; pmc < 2; ++pmc)
387 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
388 for (; pmc < 8; ++pmc)
389 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
390 if (pmc_inuse & 1)
391 mmcr0 |= MMCR0_PMC1CE;
392 if (pmc_inuse & 0xfe)
393 mmcr0 |= MMCR0_PMCjCE;
394
395 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
396
397 /* Return MMCRx values */
398 mmcr[0] = mmcr0;
399 mmcr[1] = mmcr1;
400 mmcr[2] = mmcra;
401 return 0;
402}
403
404static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
405{
406 int shift, i;
407
408 if (pmc <= 1) {
409 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
410 i = 0;
411 } else {
412 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
413 i = 1;
414 }
415 /*
416 * Setting the PMCxSEL field to 0x08 disables PMC x.
417 */
418 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
419}
420
421static int ppc970_generic_events[] = {
422 [PERF_COUNT_HW_CPU_CYCLES] = 7,
423 [PERF_COUNT_HW_INSTRUCTIONS] = 1,
424 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
425 [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
426 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
427 [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
428};
429
430#define C(x) PERF_COUNT_HW_CACHE_##x
431
432/*
433 * Table of generalized cache-related events.
434 * 0 means not supported, -1 means nonsensical, other values
435 * are event codes.
436 */
437static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
438 [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
439 [C(OP_READ)] = { 0x8810, 0x3810 },
440 [C(OP_WRITE)] = { 0x7810, 0x813 },
441 [C(OP_PREFETCH)] = { 0x731, 0 },
442 },
443 [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
444 [C(OP_READ)] = { 0, 0 },
445 [C(OP_WRITE)] = { -1, -1 },
446 [C(OP_PREFETCH)] = { 0, 0 },
447 },
448 [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
449 [C(OP_READ)] = { 0, 0 },
450 [C(OP_WRITE)] = { 0, 0 },
451 [C(OP_PREFETCH)] = { 0x733, 0 },
452 },
453 [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
454 [C(OP_READ)] = { 0, 0x704 },
455 [C(OP_WRITE)] = { -1, -1 },
456 [C(OP_PREFETCH)] = { -1, -1 },
457 },
458 [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
459 [C(OP_READ)] = { 0, 0x700 },
460 [C(OP_WRITE)] = { -1, -1 },
461 [C(OP_PREFETCH)] = { -1, -1 },
462 },
463 [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
464 [C(OP_READ)] = { 0x431, 0x327 },
465 [C(OP_WRITE)] = { -1, -1 },
466 [C(OP_PREFETCH)] = { -1, -1 },
467 },
468};
469
470struct power_pmu ppc970_pmu = {
471 .n_counter = 8,
472 .max_alternatives = 2,
473 .add_fields = 0x001100005555ull,
474 .test_adder = 0x013300000000ull,
475 .compute_mmcr = p970_compute_mmcr,
476 .get_constraint = p970_get_constraint,
477 .get_alternatives = p970_get_alternatives,
478 .disable_pmc = p970_disable_pmc,
479 .n_generic = ARRAY_SIZE(ppc970_generic_events),
480 .generic_events = ppc970_generic_events,
481 .cache_events = &ppc970_cache_events,
482};
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac9..5beffc8f481e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kdebug.h> 31#include <linux/kdebug.h>
32#include <linux/perf_counter.h>
32 33
33#include <asm/firmware.h> 34#include <asm/firmware.h>
34#include <asm/page.h> 35#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
170 die("Weird page fault", regs, SIGSEGV); 171 die("Weird page fault", regs, SIGSEGV);
171 } 172 }
172 173
174 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
175
173 /* When running in the kernel we expect faults to occur only to 176 /* When running in the kernel we expect faults to occur only to
174 * addresses in user space. All other faults represent errors in the 177 * addresses in user space. All other faults represent errors in the
175 * kernel and should generate an OOPS. Unfortunately, in the case of an 178 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
309 } 312 }
310 if (ret & VM_FAULT_MAJOR) { 313 if (ret & VM_FAULT_MAJOR) {
311 current->maj_flt++; 314 current->maj_flt++;
315 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
316 regs, address);
312#ifdef CONFIG_PPC_SMLPAR 317#ifdef CONFIG_PPC_SMLPAR
313 if (firmware_has_feature(FW_FEATURE_CMO)) { 318 if (firmware_has_feature(FW_FEATURE_CMO)) {
314 preempt_disable(); 319 preempt_disable();
@@ -316,8 +321,11 @@ good_area:
316 preempt_enable(); 321 preempt_enable();
317 } 322 }
318#endif 323#endif
319 } else 324 } else {
320 current->min_flt++; 325 current->min_flt++;
326 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
327 regs, address);
328 }
321 up_read(&mm->mmap_sem); 329 up_read(&mm->mmap_sem);
322 return 0; 330 return 0;
323 331
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e49337..732ee93a8e98 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aafae3b140de..68f5578fe38e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -739,6 +739,7 @@ config X86_UP_IOAPIC
739config X86_LOCAL_APIC 739config X86_LOCAL_APIC
740 def_bool y 740 def_bool y
741 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 741 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
742 select HAVE_PERF_COUNTERS if (!M386 && !M486)
742 743
743config X86_IO_APIC 744config X86_IO_APIC
744 def_bool y 745 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index dcef387ddc36..e590261ba059 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,10 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
834ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
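259 * __atomic64_read - plain read of an atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Reads the counter of @ptr with an ordinary member access; callers
263 * that need an atomic 64-bit read should use atomic64_read() instead.
264 * Doesn't imply a read memory barrier.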
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
/*
 * cmpxchg8b - 64-bit compare-and-exchange on the word at @ptr
 * @ptr: address of the 64-bit value to operate on
 * @old: value expected to be found at @ptr
 * @new: value to store if the expectation holds
 *
 * Issues the x86 "cmpxchg8b" instruction (with LOCK_PREFIX) on *@ptr.
 * @old is passed in through the "A" constraint and the same constraint
 * is used as the output, so whatever the instruction leaves there is
 * what gets returned. @new is split with ll_low()/ll_high() into the
 * "b" and "c" operands.
 *
 * NOTE(review): per the instruction's architectural definition the
 * returned value is the previous content of *@ptr, and the store of
 * @new only happens when that content matched @old - confirm against
 * the Intel SDM entry for CMPXCHG8B.
 */
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
/* the instruction writes *ptr behind the compiler's back */
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 * @old_val: old value that was there
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = atomic_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = atomic_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
250#include <asm-generic/atomic.h> 486#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..d750a10ccad6 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
49BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 49BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 50
51#ifdef CONFIG_PERF_COUNTERS 51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 52BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 53#endif
54 54
55#ifdef CONFIG_X86_MCE_P4THERMAL 55#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..9ebc5c255032 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 3bd1777a4c8b..6df45f639666 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,8 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void);
33
32extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 910b5a3d6751..e997be98c9b9 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -108,14 +108,14 @@
108#define LOCAL_TIMER_VECTOR 0xef 108#define LOCAL_TIMER_VECTOR 0xef
109 109
110/* 110/*
111 * Performance monitoring interrupt vector: 111 * Generic system vector for platform specific use
112 */ 112 */
113#define LOCAL_PERF_VECTOR 0xee 113#define GENERIC_INTERRUPT_VECTOR 0xed
114 114
115/* 115/*
116 * Generic system vector for platform specific use 116 * Performance monitoring pending work vector:
117 */ 117 */
118#define GENERIC_INTERRUPT_VECTOR 0xed 118#define LOCAL_PENDING_VECTOR 0xec
119 119
120/* 120/*
121 * First APIC vector available to drivers: (vectors 0x30-0xee) we 121 * First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..876ed97147b3
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(void);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(void) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 708dae61262d..732a30706153 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,6 +341,7 @@
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
344 345
345#ifdef __KERNEL__ 346#ifdef __KERNEL__
346 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4e2b05404400..900e1617e672 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,7 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660#define __NR_rt_tgsigqueueinfo 297 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662 662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
663 664
664#ifndef __NO_STUBS 665#ifndef __NO_STUBS
665#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a4c9cf0bf70b..076d3881f3da 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1187 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1188 } 1190 }
1189#endif 1191#endif
1192 perf_counters_lapic_init();
1190 1193
1191 preempt_disable(); 1194 preempt_disable();
1192 1195
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac42ab3..94605e7f6a54 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
177 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
178 struct irq_desc *desc; 178 struct irq_desc *desc;
179 int count; 179 int count;
180 int node;
180 int i; 181 int i;
181 182
182 cfg = irq_cfgx; 183 cfg = irq_cfgx;
183 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185 node= cpu_to_node(boot_cpu_id);
184 186
185 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
186 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
187 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
188 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
189 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
190 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
191 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
192 } 194 }
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b0517aa2bd3b..3ffdcfa9abdf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)
874#else 875#else
875 vgetcpu_set_mode(); 876 vgetcpu_set_mode();
876#endif 877#endif
878 init_hw_perf_counters();
877} 879}
878 880
879void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 881void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..895c82e78455
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86/*
87 * Generalized hw caching related event table, filled
88 * in on a per model basis. A value of 0 means
89 * 'not supported', -1 means 'event makes no sense on
90 * this CPU', any other value means the raw event
91 * ID.
92 */
93
94#define C(x) PERF_COUNT_HW_CACHE_##x
95
96static u64 __read_mostly hw_cache_event_ids
97 [PERF_COUNT_HW_CACHE_MAX]
98 [PERF_COUNT_HW_CACHE_OP_MAX]
99 [PERF_COUNT_HW_CACHE_RESULT_MAX];
100
101static const u64 nehalem_hw_cache_event_ids
102 [PERF_COUNT_HW_CACHE_MAX]
103 [PERF_COUNT_HW_CACHE_OP_MAX]
104 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
105{
106 [ C(L1D) ] = {
107 [ C(OP_READ) ] = {
108 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
109 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
110 },
111 [ C(OP_WRITE) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
113 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
114 },
115 [ C(OP_PREFETCH) ] = {
116 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
117 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
118 },
119 },
120 [ C(L1I ) ] = {
121 [ C(OP_READ) ] = {
122 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
123 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
124 },
125 [ C(OP_WRITE) ] = {
126 [ C(RESULT_ACCESS) ] = -1,
127 [ C(RESULT_MISS) ] = -1,
128 },
129 [ C(OP_PREFETCH) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0,
131 [ C(RESULT_MISS) ] = 0x0,
132 },
133 },
134 [ C(LL ) ] = {
135 [ C(OP_READ) ] = {
136 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
137 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
138 },
139 [ C(OP_WRITE) ] = {
140 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
141 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
142 },
143 [ C(OP_PREFETCH) ] = {
144 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
145 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
146 },
147 },
148 [ C(DTLB) ] = {
149 [ C(OP_READ) ] = {
150 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
151 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
152 },
153 [ C(OP_WRITE) ] = {
154 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
155 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
156 },
157 [ C(OP_PREFETCH) ] = {
158 [ C(RESULT_ACCESS) ] = 0x0,
159 [ C(RESULT_MISS) ] = 0x0,
160 },
161 },
162 [ C(ITLB) ] = {
163 [ C(OP_READ) ] = {
164 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
165 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
166 },
167 [ C(OP_WRITE) ] = {
168 [ C(RESULT_ACCESS) ] = -1,
169 [ C(RESULT_MISS) ] = -1,
170 },
171 [ C(OP_PREFETCH) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 },
176 [ C(BPU ) ] = {
177 [ C(OP_READ) ] = {
178 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
179 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
180 },
181 [ C(OP_WRITE) ] = {
182 [ C(RESULT_ACCESS) ] = -1,
183 [ C(RESULT_MISS) ] = -1,
184 },
185 [ C(OP_PREFETCH) ] = {
186 [ C(RESULT_ACCESS) ] = -1,
187 [ C(RESULT_MISS) ] = -1,
188 },
189 },
190};
191
192static const u64 core2_hw_cache_event_ids
193 [PERF_COUNT_HW_CACHE_MAX]
194 [PERF_COUNT_HW_CACHE_OP_MAX]
195 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
196{
197 [ C(L1D) ] = {
198 [ C(OP_READ) ] = {
199 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
200 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
201 },
202 [ C(OP_WRITE) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
204 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
205 },
206 [ C(OP_PREFETCH) ] = {
207 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
208 [ C(RESULT_MISS) ] = 0,
209 },
210 },
211 [ C(L1I ) ] = {
212 [ C(OP_READ) ] = {
213 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
214 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
215 },
216 [ C(OP_WRITE) ] = {
217 [ C(RESULT_ACCESS) ] = -1,
218 [ C(RESULT_MISS) ] = -1,
219 },
220 [ C(OP_PREFETCH) ] = {
221 [ C(RESULT_ACCESS) ] = 0,
222 [ C(RESULT_MISS) ] = 0,
223 },
224 },
225 [ C(LL ) ] = {
226 [ C(OP_READ) ] = {
227 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
228 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
229 },
230 [ C(OP_WRITE) ] = {
231 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
232 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
233 },
234 [ C(OP_PREFETCH) ] = {
235 [ C(RESULT_ACCESS) ] = 0,
236 [ C(RESULT_MISS) ] = 0,
237 },
238 },
239 [ C(DTLB) ] = {
240 [ C(OP_READ) ] = {
241 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
242 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
243 },
244 [ C(OP_WRITE) ] = {
245 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
246 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
247 },
248 [ C(OP_PREFETCH) ] = {
249 [ C(RESULT_ACCESS) ] = 0,
250 [ C(RESULT_MISS) ] = 0,
251 },
252 },
253 [ C(ITLB) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
256 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = -1,
260 [ C(RESULT_MISS) ] = -1,
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 },
267 [ C(BPU ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
270 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = -1,
278 [ C(RESULT_MISS) ] = -1,
279 },
280 },
281};
282
/*
 * Generic hw-cache event table for Atom; intel_pmu_init() copies it into
 * hw_cache_event_ids for CPU model 28.
 *
 * Indexed [cache type][operation][result].  set_ext_hw_attr() ORs the
 * entry into hwc->config; an entry of 0 makes it return -ENOENT and an
 * entry of -1 makes it return -EINVAL.
 */
283static const u64 atom_hw_cache_event_ids
284 [PERF_COUNT_HW_CACHE_MAX]
285 [PERF_COUNT_HW_CACHE_OP_MAX]
286 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
287{
288 [ C(L1D) ] = {
289 [ C(OP_READ) ] = {
290 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
291 [ C(RESULT_MISS) ] = 0,
292 },
293 [ C(OP_WRITE) ] = {
294 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
295 [ C(RESULT_MISS) ] = 0,
296 },
297 [ C(OP_PREFETCH) ] = {
298 [ C(RESULT_ACCESS) ] = 0x0,
299 [ C(RESULT_MISS) ] = 0,
300 },
301 },
302 [ C(L1I ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
305 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
306 },
307 [ C(OP_WRITE) ] = {
308 [ C(RESULT_ACCESS) ] = -1,
309 [ C(RESULT_MISS) ] = -1,
310 },
311 [ C(OP_PREFETCH) ] = {
312 [ C(RESULT_ACCESS) ] = 0,
313 [ C(RESULT_MISS) ] = 0,
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
319 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
320 },
321 [ C(OP_WRITE) ] = {
322 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
323 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
324 },
325 [ C(OP_PREFETCH) ] = {
326 [ C(RESULT_ACCESS) ] = 0,
327 [ C(RESULT_MISS) ] = 0,
328 },
329 },
330 [ C(DTLB) ] = {
331 [ C(OP_READ) ] = {
332 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
333 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
334 },
335 [ C(OP_WRITE) ] = {
336 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
337 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
338 },
339 [ C(OP_PREFETCH) ] = {
340 [ C(RESULT_ACCESS) ] = 0,
341 [ C(RESULT_MISS) ] = 0,
342 },
343 },
344 [ C(ITLB) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
347 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = -1,
351 [ C(RESULT_MISS) ] = -1,
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 },
358 [ C(BPU ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
361 [ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = -1,
369 [ C(RESULT_MISS) ] = -1,
370 },
371 },
372};
373
374static u64 intel_pmu_raw_event(u64 event)
375{
376#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
377#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
378#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
379#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
380#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
381
382#define CORE_EVNTSEL_MASK \
383 (CORE_EVNTSEL_EVENT_MASK | \
384 CORE_EVNTSEL_UNIT_MASK | \
385 CORE_EVNTSEL_EDGE_MASK | \
386 CORE_EVNTSEL_INV_MASK | \
387 CORE_EVNTSEL_COUNTER_MASK)
388
389 return event & CORE_EVNTSEL_MASK;
390}
391
/*
 * Generic hw-cache event table for AMD; amd_pmu_init() copies it into
 * hw_cache_event_ids for CPU families 0x0f, 0x10 and 0x11.
 *
 * Indexed [cache type][operation][result].  set_ext_hw_attr() ORs the
 * entry into hwc->config; an entry of 0 makes it return -ENOENT and an
 * entry of -1 makes it return -EINVAL.
 */
392static const u64 amd_0f_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{
397 [ C(L1D) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0,
400 [ C(RESULT_MISS) ] = 0,
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0,
404 [ C(RESULT_MISS) ] = 0,
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(L1I ) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
414 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = -1,
418 [ C(RESULT_MISS) ] = -1,
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(LL ) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0,
428 [ C(RESULT_MISS) ] = 0,
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0,
432 [ C(RESULT_MISS) ] = 0,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = 0,
436 [ C(RESULT_MISS) ] = 0,
437 },
438 },
439 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0,
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = 0,
450 [ C(RESULT_MISS) ] = 0,
451 },
452 },
453 [ C(ITLB) ] = {
454 [ C(OP_READ) ] = {
455 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
456 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
457 },
458 [ C(OP_WRITE) ] = {
459 [ C(RESULT_ACCESS) ] = -1,
460 [ C(RESULT_MISS) ] = -1,
461 },
462 [ C(OP_PREFETCH) ] = {
463 [ C(RESULT_ACCESS) ] = -1,
464 [ C(RESULT_MISS) ] = -1,
465 },
466 },
467 [ C(BPU ) ] = {
468 [ C(OP_READ) ] = {
469 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
470 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
471 },
472 [ C(OP_WRITE) ] = {
473 [ C(RESULT_ACCESS) ] = -1,
474 [ C(RESULT_MISS) ] = -1,
475 },
476 [ C(OP_PREFETCH) ] = {
477 [ C(RESULT_ACCESS) ] = -1,
478 [ C(RESULT_MISS) ] = -1,
479 },
480 },
481};
482
483/*
484 * AMD Performance Monitor K7 and later.
485 */
486static const u64 amd_perfmon_event_map[] =
487{
488 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
489 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
490 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
491 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
492 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
493 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
494};
495
496static u64 amd_pmu_event_map(int event)
497{
498 return amd_perfmon_event_map[event];
499}
500
501static u64 amd_pmu_raw_event(u64 event)
502{
503#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
504#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
505#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
506#define K7_EVNTSEL_INV_MASK 0x000800000ULL
507#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
508
509#define K7_EVNTSEL_MASK \
510 (K7_EVNTSEL_EVENT_MASK | \
511 K7_EVNTSEL_UNIT_MASK | \
512 K7_EVNTSEL_EDGE_MASK | \
513 K7_EVNTSEL_INV_MASK | \
514 K7_EVNTSEL_COUNTER_MASK)
515
516 return event & K7_EVNTSEL_MASK;
517}
518
519/*
520 * Propagate counter elapsed time into the generic counter.
521 * Can only be executed on the CPU where the counter is active.
522 * Returns the new raw hardware count that was just read (not the
 * delta): amd_pmu_handle_irq() tests the top bit of this value to
 * detect an overflow.
523 */
524static u64
525x86_perf_counter_update(struct perf_counter *counter,
526 struct hw_perf_counter *hwc, int idx)
527{
 /* number of unused high bits in a 64-bit value holding the raw count */
528 int shift = 64 - x86_pmu.counter_bits;
529 u64 prev_raw_count, new_raw_count;
530 s64 delta;
531
532 /*
533 * Careful: an NMI might modify the previous counter value.
534 *
535 * Our tactic to handle this is to first atomically read and
536 * exchange a new raw count - then add that new-prev delta
537 * count to the generic counter atomically:
538 */
539again:
540 prev_raw_count = atomic64_read(&hwc->prev_count);
541 rdmsrl(hwc->counter_base + idx, new_raw_count);
542
 /* retry if prev_count changed between the read above and the exchange */
543 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
544 new_raw_count) != prev_raw_count)
545 goto again;
546
547 /*
548 * Now we have the new raw value and have updated the prev
549 * timestamp already. We can now calculate the elapsed delta
550 * (counter-)time and add that to the generic counter.
551 *
552 * Careful, not all hw sign-extends above the physical width
553 * of the count.
554 */
 /*
  * Shift both counts up so the counter's top bit lands in bit 63,
  * subtract, then shift the signed result back down.
  */
555 delta = (new_raw_count << shift) - (prev_raw_count << shift);
556 delta >>= shift;
557
 /* account the events: add to the total, subtract from the period budget */
558 atomic64_add(delta, &counter->count);
559 atomic64_sub(delta, &hwc->period_left);
560
561 return new_raw_count;
562}
563
/*
 * Count of initialized hardware counters.  __hw_perf_counter_init()
 * reserves the PMC hardware when this goes 0 -> 1 and
 * hw_perf_counter_destroy() releases it when it drops back to 0.
 */
564static atomic_t active_counters;
/* Held around reserve_pmc_hardware() / release_pmc_hardware(). */
565static DEFINE_MUTEX(pmc_reserve_mutex);
566
567static bool reserve_pmc_hardware(void)
568{
569 int i;
570
571 if (nmi_watchdog == NMI_LOCAL_APIC)
572 disable_lapic_nmi_watchdog();
573
574 for (i = 0; i < x86_pmu.num_counters; i++) {
575 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
576 goto perfctr_fail;
577 }
578
579 for (i = 0; i < x86_pmu.num_counters; i++) {
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail;
582 }
583
584 return true;
585
586eventsel_fail:
587 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i);
589
590 i = x86_pmu.num_counters;
591
592perfctr_fail:
593 for (i--; i >= 0; i--)
594 release_perfctr_nmi(x86_pmu.perfctr + i);
595
596 if (nmi_watchdog == NMI_LOCAL_APIC)
597 enable_lapic_nmi_watchdog();
598
599 return false;
600}
601
602static void release_pmc_hardware(void)
603{
604 int i;
605
606 for (i = 0; i < x86_pmu.num_counters; i++) {
607 release_perfctr_nmi(x86_pmu.perfctr + i);
608 release_evntsel_nmi(x86_pmu.eventsel + i);
609 }
610
611 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog();
613}
614
615static void hw_perf_counter_destroy(struct perf_counter *counter)
616{
617 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
618 release_pmc_hardware();
619 mutex_unlock(&pmc_reserve_mutex);
620 }
621}
622
623static inline int x86_pmu_initialized(void)
624{
625 return x86_pmu.handle_irq != NULL;
626}
627
628static inline int
629set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
630{
631 unsigned int cache_type, cache_op, cache_result;
632 u64 config, val;
633
634 config = attr->config;
635
636 cache_type = (config >> 0) & 0xff;
637 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
638 return -EINVAL;
639
640 cache_op = (config >> 8) & 0xff;
641 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
642 return -EINVAL;
643
644 cache_result = (config >> 16) & 0xff;
645 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
646 return -EINVAL;
647
648 val = hw_cache_event_ids[cache_type][cache_op][cache_result];
649
650 if (val == 0)
651 return -ENOENT;
652
653 if (val == -1)
654 return -EINVAL;
655
656 hwc->config |= val;
657
658 return 0;
659}
660
661/*
662 * Setup the hardware configuration for a given attr_type
663 */
664static int __hw_perf_counter_init(struct perf_counter *counter)
665{
666 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw;
668 int err;
669
670 if (!x86_pmu_initialized())
671 return -ENODEV;
672
673 err = 0;
674 if (!atomic_inc_not_zero(&active_counters)) {
675 mutex_lock(&pmc_reserve_mutex);
676 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
677 err = -EBUSY;
678 else
679 atomic_inc(&active_counters);
680 mutex_unlock(&pmc_reserve_mutex);
681 }
682 if (err)
683 return err;
684
685 /*
686 * Generate PMC IRQs:
687 * (keep 'enabled' bit clear for now)
688 */
689 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
690
691 /*
692 * Count user and OS events unless requested not to.
693 */
694 if (!attr->exclude_user)
695 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
696 if (!attr->exclude_kernel)
697 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
698
699 if (!hwc->sample_period) {
700 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period);
703 }
704
705 counter->destroy = hw_perf_counter_destroy;
706
707 /*
708 * Raw event type provide the config in the event structure
709 */
710 if (attr->type == PERF_TYPE_RAW) {
711 hwc->config |= x86_pmu.raw_event(attr->config);
712 return 0;
713 }
714
715 if (attr->type == PERF_TYPE_HW_CACHE)
716 return set_ext_hw_attr(hwc, attr);
717
718 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL;
720 /*
721 * The generic map:
722 */
723 hwc->config |= x86_pmu.event_map(attr->config);
724
725 return 0;
726}
727
728static void intel_pmu_disable_all(void)
729{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
731}
732
733static void amd_pmu_disable_all(void)
734{
735 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
736 int idx;
737
738 if (!cpuc->enabled)
739 return;
740
741 cpuc->enabled = 0;
742 /*
743 * ensure we write the disable before we start disabling the
744 * counters proper, so that amd_pmu_enable_counter() does the
745 * right thing.
746 */
747 barrier();
748
749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
750 u64 val;
751
752 if (!test_bit(idx, cpuc->active_mask))
753 continue;
754 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
755 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
756 continue;
757 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
758 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
759 }
760}
761
762void hw_perf_disable(void)
763{
764 if (!x86_pmu_initialized())
765 return;
766 return x86_pmu.disable_all();
767}
768
769static void intel_pmu_enable_all(void)
770{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
772}
773
774static void amd_pmu_enable_all(void)
775{
776 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
777 int idx;
778
779 if (cpuc->enabled)
780 return;
781
782 cpuc->enabled = 1;
783 barrier();
784
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
786 u64 val;
787
788 if (!test_bit(idx, cpuc->active_mask))
789 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 }
796}
797
798void hw_perf_enable(void)
799{
800 if (!x86_pmu_initialized())
801 return;
802 x86_pmu.enable_all();
803}
804
805static inline u64 intel_pmu_get_status(void)
806{
807 u64 status;
808
809 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
810
811 return status;
812}
813
814static inline void intel_pmu_ack_status(u64 ack)
815{
816 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
817}
818
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{
821 int err;
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824}
825
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{
828 int err;
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831}
832
833static inline void
834intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{
836 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask;
838 int err;
839
840 mask = 0xfULL << (idx * 4);
841
842 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val);
845}
846
847static inline void
848intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
849{
850 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
851 intel_pmu_disable_fixed(hwc, idx);
852 return;
853 }
854
855 x86_pmu_disable_counter(hwc, idx);
856}
857
/* AMD has no special cases here: stop the counter the generic way. */
static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}
863
/*
 * Per-CPU record of the period last programmed into each counter by
 * x86_perf_counter_set_period(); only read back by
 * perf_counter_print_debug().
 */
864static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
865
866/*
867 * Set the next IRQ period, based on the hwc->period_left value.
868 * To be called with the counter disabled in hw:
869 */
870static int
871x86_perf_counter_set_period(struct perf_counter *counter,
872 struct hw_perf_counter *hwc, int idx)
873{
874 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period;
876 int err, ret = 0;
877
878 /*
879 * If we are way outside a reasoable range then just skip forward:
880 */
881 if (unlikely(left <= -period)) {
882 left = period;
883 atomic64_set(&hwc->period_left, left);
884 hwc->last_period = period;
885 ret = 1;
886 }
887
888 if (unlikely(left <= 0)) {
889 left += period;
890 atomic64_set(&hwc->period_left, left);
891 hwc->last_period = period;
892 ret = 1;
893 }
894 /*
895 * Quirk: certain CPUs dont like it if just 1 event is left:
896 */
897 if (unlikely(left < 2))
898 left = 2;
899
900 if (left > x86_pmu.max_period)
901 left = x86_pmu.max_period;
902
903 per_cpu(prev_left[idx], smp_processor_id()) = left;
904
905 /*
906 * The hw counter starts counting from this counter offset,
907 * mark it to be able to extra future deltas:
908 */
909 atomic64_set(&hwc->prev_count, (u64)-left);
910
911 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask);
913
914 return ret;
915}
916
917static inline void
918intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
919{
920 int idx = __idx - X86_PMC_IDX_FIXED;
921 u64 ctrl_val, bits, mask;
922 int err;
923
924 /*
925 * Enable IRQ generation (0x8),
926 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
927 * if requested:
928 */
929 bits = 0x8ULL;
930 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
931 bits |= 0x2;
932 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
933 bits |= 0x1;
934 bits <<= (idx * 4);
935 mask = 0xfULL << (idx * 4);
936
937 rdmsrl(hwc->config_base, ctrl_val);
938 ctrl_val &= ~mask;
939 ctrl_val |= bits;
940 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941}
942
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
946 intel_pmu_enable_fixed(hwc, idx);
947 return;
948 }
949
950 x86_pmu_enable_counter(hwc, idx);
951}
952
953static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
954{
955 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
956
957 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961}
962
963static int
964fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
965{
966 unsigned int event;
967
968 if (!x86_pmu.num_counters_fixed)
969 return -1;
970
971 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
972
973 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
974 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
975 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
976 return X86_PMC_IDX_FIXED_CPU_CYCLES;
977 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
978 return X86_PMC_IDX_FIXED_BUS_CYCLES;
979
980 return -1;
981}
982
983/*
984 * Find a PMC slot for the freshly enabled / scheduled in counter:
985 */
986static int x86_pmu_enable(struct perf_counter *counter)
987{
988 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
989 struct hw_perf_counter *hwc = &counter->hw;
990 int idx;
991
992 idx = fixed_mode_idx(counter, hwc);
993 if (idx >= 0) {
994 /*
995 * Try to get the fixed counter, if that is already taken
996 * then try to get a generic counter:
997 */
998 if (test_and_set_bit(idx, cpuc->used_mask))
999 goto try_generic;
1000
1001 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1002 /*
1003 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
1004 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1005 */
1006 hwc->counter_base =
1007 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1008 hwc->idx = idx;
1009 } else {
1010 idx = hwc->idx;
1011 /* Try to get the previous generic counter again */
1012 if (test_and_set_bit(idx, cpuc->used_mask)) {
1013try_generic:
1014 idx = find_first_zero_bit(cpuc->used_mask,
1015 x86_pmu.num_counters);
1016 if (idx == x86_pmu.num_counters)
1017 return -EAGAIN;
1018
1019 set_bit(idx, cpuc->used_mask);
1020 hwc->idx = idx;
1021 }
1022 hwc->config_base = x86_pmu.eventsel;
1023 hwc->counter_base = x86_pmu.perfctr;
1024 }
1025
1026 perf_counters_lapic_init();
1027
1028 x86_pmu.disable(hwc, idx);
1029
1030 cpuc->counters[idx] = counter;
1031 set_bit(idx, cpuc->active_mask);
1032
1033 x86_perf_counter_set_period(counter, hwc, idx);
1034 x86_pmu.enable(hwc, idx);
1035
1036 return 0;
1037}
1038
1039static void x86_pmu_unthrottle(struct perf_counter *counter)
1040{
1041 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1042 struct hw_perf_counter *hwc = &counter->hw;
1043
1044 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1045 cpuc->counters[hwc->idx] != counter))
1046 return;
1047
1048 x86_pmu.enable(hwc, hwc->idx);
1049}
1050
1051void perf_counter_print_debug(void)
1052{
1053 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1054 struct cpu_hw_counters *cpuc;
1055 unsigned long flags;
1056 int cpu, idx;
1057
1058 if (!x86_pmu.num_counters)
1059 return;
1060
1061 local_irq_save(flags);
1062
1063 cpu = smp_processor_id();
1064 cpuc = &per_cpu(cpu_hw_counters, cpu);
1065
1066 if (x86_pmu.version >= 2) {
1067 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1068 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1069 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1070 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1071
1072 pr_info("\n");
1073 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1074 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1075 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1076 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1077 }
1078 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1079
1080 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1081 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1082 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1083
1084 prev_left = per_cpu(prev_left[idx], cpu);
1085
1086 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1087 cpu, idx, pmc_ctrl);
1088 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1089 cpu, idx, pmc_count);
1090 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1091 cpu, idx, prev_left);
1092 }
1093 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1094 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1095
1096 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1097 cpu, idx, pmc_count);
1098 }
1099 local_irq_restore(flags);
1100}
1101
1102static void x86_pmu_disable(struct perf_counter *counter)
1103{
1104 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1105 struct hw_perf_counter *hwc = &counter->hw;
1106 int idx = hwc->idx;
1107
1108 /*
1109 * Must be done before we disable, otherwise the nmi handler
1110 * could reenable again:
1111 */
1112 clear_bit(idx, cpuc->active_mask);
1113 x86_pmu.disable(hwc, idx);
1114
1115 /*
1116 * Make sure the cleared pointer becomes visible before we
1117 * (potentially) free the counter:
1118 */
1119 barrier();
1120
1121 /*
1122 * Drain the remaining delta count out of a counter
1123 * that we are disabling:
1124 */
1125 x86_perf_counter_update(counter, hwc, idx);
1126 cpuc->counters[idx] = NULL;
1127 clear_bit(idx, cpuc->used_mask);
1128}
1129
1130/*
1131 * Save and restart an expired counter. Called by NMI contexts,
1132 * so it has to be careful about preempting normal counter ops:
1133 */
1134static int intel_pmu_save_and_restart(struct perf_counter *counter)
1135{
1136 struct hw_perf_counter *hwc = &counter->hw;
1137 int idx = hwc->idx;
1138 int ret;
1139
1140 x86_perf_counter_update(counter, hwc, idx);
1141 ret = x86_perf_counter_set_period(counter, hwc, idx);
1142
1143 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1144 intel_pmu_enable_counter(hwc, idx);
1145
1146 return ret;
1147}
1148
1149static void intel_pmu_reset(void)
1150{
1151 unsigned long flags;
1152 int idx;
1153
1154 if (!x86_pmu.num_counters)
1155 return;
1156
1157 local_irq_save(flags);
1158
1159 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1160
1161 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1162 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1163 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1164 }
1165 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1166 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1167 }
1168
1169 local_irq_restore(flags);
1170}
1171
1172
1173/*
1174 * This handler is triggered by the local APIC, so the APIC IRQ handling
1175 * rules apply:
1176 */
1177static int intel_pmu_handle_irq(struct pt_regs *regs)
1178{
1179 struct perf_sample_data data;
1180 struct cpu_hw_counters *cpuc;
1181 int bit, cpu, loops;
1182 u64 ack, status;
1183
1184 data.regs = regs;
1185 data.addr = 0;
1186
1187 cpu = smp_processor_id();
1188 cpuc = &per_cpu(cpu_hw_counters, cpu);
1189
1190 perf_disable();
1191 status = intel_pmu_get_status();
1192 if (!status) {
1193 perf_enable();
1194 return 0;
1195 }
1196
1197 loops = 0;
1198again:
1199 if (++loops > 100) {
1200 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
1201 perf_counter_print_debug();
1202 intel_pmu_reset();
1203 perf_enable();
1204 return 1;
1205 }
1206
1207 inc_irq_stat(apic_perf_irqs);
1208 ack = status;
1209 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1210 struct perf_counter *counter = cpuc->counters[bit];
1211
1212 clear_bit(bit, (unsigned long *) &status);
1213 if (!test_bit(bit, cpuc->active_mask))
1214 continue;
1215
1216 if (!intel_pmu_save_and_restart(counter))
1217 continue;
1218
1219 if (perf_counter_overflow(counter, 1, &data))
1220 intel_pmu_disable_counter(&counter->hw, bit);
1221 }
1222
1223 intel_pmu_ack_status(ack);
1224
1225 /*
1226 * Repeat if there is more work to be done:
1227 */
1228 status = intel_pmu_get_status();
1229 if (status)
1230 goto again;
1231
1232 perf_enable();
1233
1234 return 1;
1235}
1236
1237static int amd_pmu_handle_irq(struct pt_regs *regs)
1238{
1239 struct perf_sample_data data;
1240 struct cpu_hw_counters *cpuc;
1241 struct perf_counter *counter;
1242 struct hw_perf_counter *hwc;
1243 int cpu, idx, handled = 0;
1244 u64 val;
1245
1246 data.regs = regs;
1247 data.addr = 0;
1248
1249 cpu = smp_processor_id();
1250 cpuc = &per_cpu(cpu_hw_counters, cpu);
1251
1252 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1253 if (!test_bit(idx, cpuc->active_mask))
1254 continue;
1255
1256 counter = cpuc->counters[idx];
1257 hwc = &counter->hw;
1258
1259 val = x86_perf_counter_update(counter, hwc, idx);
1260 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1261 continue;
1262
1263 /*
1264 * counter overflow
1265 */
1266 handled = 1;
1267 data.period = counter->hw.last_period;
1268
1269 if (!x86_perf_counter_set_period(counter, hwc, idx))
1270 continue;
1271
1272 if (perf_counter_overflow(counter, 1, &data))
1273 amd_pmu_disable_counter(hwc, idx);
1274 }
1275
1276 if (handled)
1277 inc_irq_stat(apic_perf_irqs);
1278
1279 return handled;
1280}
1281
1282void smp_perf_pending_interrupt(struct pt_regs *regs)
1283{
1284 irq_enter();
1285 ack_APIC_irq();
1286 inc_irq_stat(apic_pending_irqs);
1287 perf_counter_do_pending();
1288 irq_exit();
1289}
1290
1291void set_perf_counter_pending(void)
1292{
1293 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1294}
1295
1296void perf_counters_lapic_init(void)
1297{
1298 if (!x86_pmu_initialized())
1299 return;
1300
1301 /*
1302 * Always use NMI for PMU
1303 */
1304 apic_write(APIC_LVTPC, APIC_DM_NMI);
1305}
1306
1307static int __kprobes
1308perf_counter_nmi_handler(struct notifier_block *self,
1309 unsigned long cmd, void *__args)
1310{
1311 struct die_args *args = __args;
1312 struct pt_regs *regs;
1313
1314 if (!atomic_read(&active_counters))
1315 return NOTIFY_DONE;
1316
1317 switch (cmd) {
1318 case DIE_NMI:
1319 case DIE_NMI_IPI:
1320 break;
1321
1322 default:
1323 return NOTIFY_DONE;
1324 }
1325
1326 regs = args->regs;
1327
1328 apic_write(APIC_LVTPC, APIC_DM_NMI);
1329 /*
1330 * Can't rely on the handled return value to say it was our NMI, two
1331 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
1332 *
1333 * If the first NMI handles both, the latter will be empty and daze
1334 * the CPU.
1335 */
1336 x86_pmu.handle_irq(regs);
1337
1338 return NOTIFY_STOP;
1339}
1340
1341static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1342 .notifier_call = perf_counter_nmi_handler,
1343 .next = NULL,
1344 .priority = 1
1345};
1346
1347static struct x86_pmu intel_pmu = {
1348 .name = "Intel",
1349 .handle_irq = intel_pmu_handle_irq,
1350 .disable_all = intel_pmu_disable_all,
1351 .enable_all = intel_pmu_enable_all,
1352 .enable = intel_pmu_enable_counter,
1353 .disable = intel_pmu_disable_counter,
1354 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1355 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1356 .event_map = intel_pmu_event_map,
1357 .raw_event = intel_pmu_raw_event,
1358 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1359 /*
1360 * Intel PMCs cannot be accessed sanely above 32 bit width,
1361 * so we install an artificial 1<<31 period regardless of
1362 * the generic counter period:
1363 */
1364 .max_period = (1ULL << 31) - 1,
1365};
1366
1367static struct x86_pmu amd_pmu = {
1368 .name = "AMD",
1369 .handle_irq = amd_pmu_handle_irq,
1370 .disable_all = amd_pmu_disable_all,
1371 .enable_all = amd_pmu_enable_all,
1372 .enable = amd_pmu_enable_counter,
1373 .disable = amd_pmu_disable_counter,
1374 .eventsel = MSR_K7_EVNTSEL0,
1375 .perfctr = MSR_K7_PERFCTR0,
1376 .event_map = amd_pmu_event_map,
1377 .raw_event = amd_pmu_raw_event,
1378 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1379 .num_counters = 4,
1380 .counter_bits = 48,
1381 .counter_mask = (1ULL << 48) - 1,
1382 /* use highest bit to detect overflow */
1383 .max_period = (1ULL << 47) - 1,
1384};
1385
1386static int intel_pmu_init(void)
1387{
1388 union cpuid10_edx edx;
1389 union cpuid10_eax eax;
1390 unsigned int unused;
1391 unsigned int ebx;
1392 int version;
1393
1394 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
1395 return -ENODEV;
1396
1397 /*
1398 * Check whether the Architectural PerfMon supports
1399 * Branch Misses Retired Event or not.
1400 */
1401 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1402 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1403 return -ENODEV;
1404
1405 version = eax.split.version_id;
1406 if (version < 2)
1407 return -ENODEV;
1408
1409 x86_pmu = intel_pmu;
1410 x86_pmu.version = version;
1411 x86_pmu.num_counters = eax.split.num_counters;
1412 x86_pmu.counter_bits = eax.split.bit_width;
1413 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1414
1415 /*
1416 * Quirk: v2 perfmon does not report fixed-purpose counters, so
1417 * assume at least 3 counters:
1418 */
1419 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1420
1421 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1422
1423 /*
1424 * Install the hw-cache-events table:
1425 */
1426 switch (boot_cpu_data.x86_model) {
1427 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1428 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1429 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1430 case 29: /* six-core 45 nm xeon "Dunnington" */
1431 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
1432 sizeof(hw_cache_event_ids));
1433
1434 pr_cont("Core2 events, ");
1435 break;
1436 default:
1437 case 26:
1438 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1439 sizeof(hw_cache_event_ids));
1440
1441 pr_cont("Nehalem/Corei7 events, ");
1442 break;
1443 case 28:
1444 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 pr_cont("Atom events, ");
1448 break;
1449 }
1450 return 0;
1451}
1452
1453static int amd_pmu_init(void)
1454{
1455 x86_pmu = amd_pmu;
1456
1457 switch (boot_cpu_data.x86) {
1458 case 0x0f:
1459 case 0x10:
1460 case 0x11:
1461 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1462 sizeof(hw_cache_event_ids));
1463
1464 pr_cont("AMD Family 0f/10/11 events, ");
1465 break;
1466 }
1467 return 0;
1468}
1469
1470void __init init_hw_perf_counters(void)
1471{
1472 int err;
1473
1474 pr_info("Performance Counters: ");
1475
1476 switch (boot_cpu_data.x86_vendor) {
1477 case X86_VENDOR_INTEL:
1478 err = intel_pmu_init();
1479 break;
1480 case X86_VENDOR_AMD:
1481 err = amd_pmu_init();
1482 break;
1483 default:
1484 return;
1485 }
1486 if (err != 0) {
1487 pr_cont("no PMU driver, software counters only.\n");
1488 return;
1489 }
1490
1491 pr_cont("%s PMU driver.\n", x86_pmu.name);
1492
1493 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1494 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1495 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1496 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1497 }
1498 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1499 perf_max_counters = x86_pmu.num_counters;
1500
1501 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1502 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1503 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1504 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1505 }
1506
1507 perf_counter_mask |=
1508 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1509
1510 perf_counters_lapic_init();
1511 register_die_notifier(&perf_counter_nmi_notifier);
1512
1513 pr_info("... version: %d\n", x86_pmu.version);
1514 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1515 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1516 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1517 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1518 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1519 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1520}
1521
1522static inline void x86_pmu_read(struct perf_counter *counter)
1523{
1524 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1525}
1526
1527static const struct pmu pmu = {
1528 .enable = x86_pmu_enable,
1529 .disable = x86_pmu_disable,
1530 .read = x86_pmu_read,
1531 .unthrottle = x86_pmu_unthrottle,
1532};
1533
1534const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1535{
1536 int err;
1537
1538 err = __hw_perf_counter_init(counter);
1539 if (err)
1540 return ERR_PTR(err);
1541
1542 return &pmu;
1543}
1544
1545/*
1546 * callchain support
1547 */
1548
1549static inline
1550void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1551{
1552 if (entry->nr < MAX_STACK_DEPTH)
1553 entry->ip[entry->nr++] = ip;
1554}
1555
/*
 * Per-CPU callchain buffers.  perf_callchain() uses nmi_entry when
 * called in NMI context and irq_entry otherwise.
 */
1556static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1557static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1558
1559
/* stacktrace_ops.warning_symbol: warnings are deliberately dropped. */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}
1565
/* stacktrace_ops.warning: warnings are deliberately dropped. */
static void backtrace_warning(void *data, char *msg)
{
}
1570
/*
 * stacktrace_ops.stack: always returns -1.  IRQ stacks are not
 * followed for now.
 */
static int backtrace_stack(void *data, char *name)
{
	const int skip_this_stack = -1;

	return skip_this_stack;
}
1576
/*
 * stacktrace_ops.address: record @addr in the callchain passed as
 * @data, but only if the unwinder marks it reliable.
 */
static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (!reliable)
		return;

	callchain_store(entry, addr);
}
1584
1585static const struct stacktrace_ops backtrace_ops = {
1586 .warning = backtrace_warning,
1587 .warning_symbol = backtrace_warning_symbol,
1588 .stack = backtrace_stack,
1589 .address = backtrace_address,
1590};
1591
1592static void
1593perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1594{
1595 unsigned long bp;
1596 char *stack;
1597 int nr = entry->nr;
1598
1599 callchain_store(entry, instruction_pointer(regs));
1600
1601 stack = ((char *)regs + sizeof(struct pt_regs));
1602#ifdef CONFIG_FRAME_POINTER
1603 bp = frame_pointer(regs);
1604#else
1605 bp = 0;
1606#endif
1607
1608 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1609
1610 entry->kernel = entry->nr - nr;
1611}
1612
1613
/*
 * Layout read from user memory by copy_stack_frame() at each frame
 * pointer while perf_callchain_user() walks the user stack.
 */
1614struct stack_frame {
 /* frame pointer of the next frame to visit */
1615 const void __user *next_fp;
 /* address stored into the callchain for this frame */
1616 unsigned long return_address;
1617};
1618
1619static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1620{
1621 int ret;
1622
1623 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1624 return 0;
1625
1626 ret = 1;
1627 pagefault_disable();
1628 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1629 ret = 0;
1630 pagefault_enable();
1631
1632 return ret;
1633}
1634
1635static void
1636perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1637{
1638 struct stack_frame frame;
1639 const void __user *fp;
1640 int nr = entry->nr;
1641
1642 regs = (struct pt_regs *)current->thread.sp0 - 1;
1643 fp = (void __user *)regs->bp;
1644
1645 callchain_store(entry, regs->ip);
1646
1647 while (entry->nr < MAX_STACK_DEPTH) {
1648 frame.next_fp = NULL;
1649 frame.return_address = 0;
1650
1651 if (!copy_stack_frame(fp, &frame))
1652 break;
1653
1654 if ((unsigned long)fp < user_stack_pointer(regs))
1655 break;
1656
1657 callchain_store(entry, frame.return_address);
1658 fp = frame.next_fp;
1659 }
1660
1661 entry->user = entry->nr - nr;
1662}
1663
1664static void
1665perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1666{
1667 int is_user;
1668
1669 if (!regs)
1670 return;
1671
1672 is_user = user_mode(regs);
1673
1674 if (!current || current->pid == 0)
1675 return;
1676
1677 if (is_user && current->state != TASK_RUNNING)
1678 return;
1679
1680 if (!is_user)
1681 perf_callchain_kernel(regs, entry);
1682
1683 if (current->mm)
1684 perf_callchain_user(regs, entry);
1685}
1686
1687struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1688{
1689 struct perf_callchain_entry *entry;
1690
1691 if (in_nmi())
1692 entry = &__get_cpu_var(nmi_entry);
1693 else
1694 entry = &__get_cpu_var(irq_entry);
1695
1696 entry->nr = 0;
1697 entry->hv = 0;
1698 entry->kernel = 0;
1699 entry->user = 0;
1700
1701 perf_do_callchain(regs, entry);
1702
1703 return entry;
1704}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1c17d7c751a4..a4742a340d8d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1012,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1012apicinterrupt SPURIOUS_APIC_VECTOR \ 1012apicinterrupt SPURIOUS_APIC_VECTOR \
1013 spurious_interrupt smp_spurious_interrupt 1013 spurious_interrupt smp_spurious_interrupt
1014 1014
1015#ifdef CONFIG_PERF_COUNTERS
1016apicinterrupt LOCAL_PENDING_VECTOR \
1017 perf_pending_interrupt smp_perf_pending_interrupt
1018#endif
1019
1015/* 1020/*
1016 * Exception entry points. 1021 * Exception entry points.
1017 */ 1022 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9a391bbb8ba8..38287b5f116e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -62,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
62 for_each_online_cpu(j) 62 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 63 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
64 seq_printf(p, " Spurious interrupts\n"); 64 seq_printf(p, " Spurious interrupts\n");
65 seq_printf(p, "%*s: ", prec, "CNT");
66 for_each_online_cpu(j)
67 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
68 seq_printf(p, " Performance counter interrupts\n");
69 seq_printf(p, "%*s: ", prec, "PND");
70 for_each_online_cpu(j)
71 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
72 seq_printf(p, " Performance pending work\n");
65#endif 73#endif
66 if (generic_interrupt_extension) { 74 if (generic_interrupt_extension) {
67 seq_printf(p, "%*s: ", prec, "PLT"); 75 seq_printf(p, "%*s: ", prec, "PLT");
@@ -165,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
165#ifdef CONFIG_X86_LOCAL_APIC 173#ifdef CONFIG_X86_LOCAL_APIC
166 sum += irq_stats(cpu)->apic_timer_irqs; 174 sum += irq_stats(cpu)->apic_timer_irqs;
167 sum += irq_stats(cpu)->irq_spurious_count; 175 sum += irq_stats(cpu)->irq_spurious_count;
176 sum += irq_stats(cpu)->apic_perf_irqs;
177 sum += irq_stats(cpu)->apic_pending_irqs;
168#endif 178#endif
169 if (generic_interrupt_extension) 179 if (generic_interrupt_extension)
170 sum += irq_stats(cpu)->generic_irqs; 180 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 2e08b10ad51a..267c6624c77f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -181,10 +181,15 @@ static void __init apic_intr_init(void)
181{ 181{
182 smp_intr_init(); 182 smp_intr_init();
183 183
184#ifdef CONFIG_X86_64 184#ifdef CONFIG_X86_THERMAL_VECTOR
185 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 185 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
186#endif
187#ifdef CONFIG_X86_THRESHOLD
186 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 188 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
187#endif 189#endif
190#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
191 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
192#endif
188 193
189#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 194#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
190 /* self generated IPI for local APIC timer */ 195 /* self generated IPI for local APIC timer */
@@ -199,18 +204,10 @@ static void __init apic_intr_init(void)
199 204
200 /* Performance monitoring interrupts: */ 205 /* Performance monitoring interrupts: */
201# ifdef CONFIG_PERF_COUNTERS 206# ifdef CONFIG_PERF_COUNTERS
202 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
203 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 207 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
204# endif 208# endif
205 209
206#endif 210#endif
207
208#ifdef CONFIG_X86_32
209#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
210 /* thermal monitor LVT interrupt */
211 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
212#endif
213#endif
214} 211}
215 212
216/** 213/**
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 734f92c02dde..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,3 +335,4 @@ ENTRY(sys_call_table)
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ede024531f8f..07d60c870ce2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -942,8 +942,13 @@ void __init trap_init(void)
942#endif 942#endif
943 set_intr_gate(19, &simd_coprocessor_error); 943 set_intr_gate(19, &simd_coprocessor_error);
944 944
945 /* Reserve all the builtin and the syscall vector: */
946 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
947 set_bit(i, used_vectors);
948
945#ifdef CONFIG_IA32_EMULATION 949#ifdef CONFIG_IA32_EMULATION
946 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 950 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
951 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
947#endif 952#endif
948 953
949#ifdef CONFIG_X86_32 954#ifdef CONFIG_X86_32
@@ -960,14 +965,9 @@ void __init trap_init(void)
960 } 965 }
961 966
962 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 967 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
968 set_bit(SYSCALL_VECTOR, used_vectors);
963#endif 969#endif
964 970
965 /* Reserve all the builtin and the syscall vector: */
966 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
967 set_bit(i, used_vectors);
968
969 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
970
971 /* 971 /*
972 * Should be a barrier for any external CPU state: 972 * Should be a barrier for any external CPU state:
973 */ 973 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5ec7ae366615..c6acc6326374 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/bootmem.h> /* max_low_pfn */ 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_counter.h> /* perf_swcounter_event */
13 14
14#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
15#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1013,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1013 if (unlikely(error_code & PF_RSVD)) 1014 if (unlikely(error_code & PF_RSVD))
1014 pgtable_bad(regs, error_code, address); 1015 pgtable_bad(regs, error_code, address);
1015 1016
1017 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1018
1016 /* 1019 /*
1017 * If we're in an interrupt, have no user context or are running 1020 * If we're in an interrupt, have no user context or are running
1018 * in an atomic region then we must not take the fault: 1021 * in an atomic region then we must not take the fault:
@@ -1106,10 +1109,15 @@ good_area:
1106 return; 1109 return;
1107 } 1110 }
1108 1111
1109 if (fault & VM_FAULT_MAJOR) 1112 if (fault & VM_FAULT_MAJOR) {
1110 tsk->maj_flt++; 1113 tsk->maj_flt++;
1111 else 1114 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1115 regs, address);
1116 } else {
1112 tsk->min_flt++; 1117 tsk->min_flt++;
1118 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1119 regs, address);
1120 }
1113 1121
1114 check_v8086_mode(regs, address, tsk); 1122 check_v8086_mode(regs, address, tsk);
1115 1123
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c0bedcd10f97..18d244f70205 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 *p; 43 u64 *p, *start, *end;
44 void *start, *end;
45 u64 start_bad, last_bad; 44 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 45 u64 start_phys_aligned;
47 size_t incr; 46 const size_t incr = sizeof(pattern);
48 47
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
51 start = __va(start_phys_aligned); 49 start = __va(start_phys_aligned);
52 end = start + size - (start_phys_aligned - start_phys); 50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
53 start_bad = 0; 51 start_bad = 0;
54 last_bad = 0; 52 last_bad = 0;
55 53
56 for (p = start; p < end; p++) 54 for (p = start; p < end; p++)
57 *p = pattern; 55 *p = pattern;
56
58 for (p = start; p < end; p++, start_phys_aligned += incr) { 57 for (p = start; p < end; p++, start_phys_aligned += incr) {
59 if (*p == pattern) 58 if (*p == pattern)
60 continue; 59 continue;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3b285e656e27..b07dd8d0b321 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab56..58bc00f68b12 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
338 } 338 }
339 } 339 }
340 340
341 current->mm->context.vdso = (void *)addr;
342
341 if (compat_uses_vma || !compat) { 343 if (compat_uses_vma || !compat) {
342 /* 344 /*
343 * MAYWRITE to allow gdb to COW and set breakpoints 345 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
358 goto up_fail; 360 goto up_fail;
359 } 361 }
360 362
361 current->mm->context.vdso = (void *)addr;
362 current_thread_info()->sysenter_return = 363 current_thread_info()->sysenter_return =
363 VDSO32_SYMBOL(addr, SYSENTER_RETURN); 364 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
364 365
365 up_fail: 366 up_fail:
367 if (ret)
368 current->mm->context.vdso = NULL;
369
366 up_write(&mm->mmap_sem); 370 up_write(&mm->mmap_sem);
367 371
368 return ret; 372 return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index cac083386e03..21e1aeb9f3ea 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -116,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
116 goto up_fail; 116 goto up_fail;
117 } 117 }
118 118
119 current->mm->context.vdso = (void *)addr;
120
119 ret = install_special_mapping(mm, addr, vdso_size, 121 ret = install_special_mapping(mm, addr, vdso_size,
120 VM_READ|VM_EXEC| 122 VM_READ|VM_EXEC|
121 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 123 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
122 VM_ALWAYSDUMP, 124 VM_ALWAYSDUMP,
123 vdso_pages); 125 vdso_pages);
124 if (ret) 126 if (ret) {
127 current->mm->context.vdso = NULL;
125 goto up_fail; 128 goto up_fail;
129 }
126 130
127 current->mm->context.vdso = (void *)addr;
128up_fail: 131up_fail:
129 up_write(&mm->mmap_sem); 132 up_write(&mm->mmap_sem);
130 return ret; 133 return ret;
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 6b91c26a4635..15a23031833f 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -77,8 +77,6 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
77 size_t size); 77 size_t size);
78static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state, 78static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
79 ssize_t size); 79 ssize_t size);
80#define MAX_SLOTS 8
81#define MAX_RETRY 15
82 80
83enum { 81enum {
84 AHCI_PCI_BAR = 5, 82 AHCI_PCI_BAR = 5,
@@ -231,6 +229,10 @@ enum {
231 229
232 ICH_MAP = 0x90, /* ICH MAP register */ 230 ICH_MAP = 0x90, /* ICH MAP register */
233 231
232 /* em constants */
233 EM_MAX_SLOTS = 8,
234 EM_MAX_RETRY = 5,
235
234 /* em_ctl bits */ 236 /* em_ctl bits */
235 EM_CTL_RST = (1 << 9), /* Reset */ 237 EM_CTL_RST = (1 << 9), /* Reset */
236 EM_CTL_TM = (1 << 8), /* Transmit Message */ 238 EM_CTL_TM = (1 << 8), /* Transmit Message */
@@ -282,8 +284,8 @@ struct ahci_port_priv {
282 unsigned int ncq_saw_dmas:1; 284 unsigned int ncq_saw_dmas:1;
283 unsigned int ncq_saw_sdb:1; 285 unsigned int ncq_saw_sdb:1;
284 u32 intr_mask; /* interrupts to enable */ 286 u32 intr_mask; /* interrupts to enable */
285 struct ahci_em_priv em_priv[MAX_SLOTS];/* enclosure management info 287 /* enclosure management info per PM slot */
286 * per PM slot */ 288 struct ahci_em_priv em_priv[EM_MAX_SLOTS];
287}; 289};
288 290
289static int ahci_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val); 291static int ahci_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
@@ -313,7 +315,6 @@ static void ahci_error_handler(struct ata_port *ap);
313static void ahci_post_internal_cmd(struct ata_queued_cmd *qc); 315static void ahci_post_internal_cmd(struct ata_queued_cmd *qc);
314static int ahci_port_resume(struct ata_port *ap); 316static int ahci_port_resume(struct ata_port *ap);
315static void ahci_dev_config(struct ata_device *dev); 317static void ahci_dev_config(struct ata_device *dev);
316static unsigned int ahci_fill_sg(struct ata_queued_cmd *qc, void *cmd_tbl);
317static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag, 318static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag,
318 u32 opts); 319 u32 opts);
319#ifdef CONFIG_PM 320#ifdef CONFIG_PM
@@ -404,14 +405,14 @@ static struct ata_port_operations ahci_sb600_ops = {
404#define AHCI_HFLAGS(flags) .private_data = (void *)(flags) 405#define AHCI_HFLAGS(flags) .private_data = (void *)(flags)
405 406
406static const struct ata_port_info ahci_port_info[] = { 407static const struct ata_port_info ahci_port_info[] = {
407 /* board_ahci */ 408 [board_ahci] =
408 { 409 {
409 .flags = AHCI_FLAG_COMMON, 410 .flags = AHCI_FLAG_COMMON,
410 .pio_mask = ATA_PIO4, 411 .pio_mask = ATA_PIO4,
411 .udma_mask = ATA_UDMA6, 412 .udma_mask = ATA_UDMA6,
412 .port_ops = &ahci_ops, 413 .port_ops = &ahci_ops,
413 }, 414 },
414 /* board_ahci_vt8251 */ 415 [board_ahci_vt8251] =
415 { 416 {
416 AHCI_HFLAGS (AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_PMP), 417 AHCI_HFLAGS (AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_PMP),
417 .flags = AHCI_FLAG_COMMON, 418 .flags = AHCI_FLAG_COMMON,
@@ -419,7 +420,7 @@ static const struct ata_port_info ahci_port_info[] = {
419 .udma_mask = ATA_UDMA6, 420 .udma_mask = ATA_UDMA6,
420 .port_ops = &ahci_vt8251_ops, 421 .port_ops = &ahci_vt8251_ops,
421 }, 422 },
422 /* board_ahci_ign_iferr */ 423 [board_ahci_ign_iferr] =
423 { 424 {
424 AHCI_HFLAGS (AHCI_HFLAG_IGN_IRQ_IF_ERR), 425 AHCI_HFLAGS (AHCI_HFLAG_IGN_IRQ_IF_ERR),
425 .flags = AHCI_FLAG_COMMON, 426 .flags = AHCI_FLAG_COMMON,
@@ -427,17 +428,16 @@ static const struct ata_port_info ahci_port_info[] = {
427 .udma_mask = ATA_UDMA6, 428 .udma_mask = ATA_UDMA6,
428 .port_ops = &ahci_ops, 429 .port_ops = &ahci_ops,
429 }, 430 },
430 /* board_ahci_sb600 */ 431 [board_ahci_sb600] =
431 { 432 {
432 AHCI_HFLAGS (AHCI_HFLAG_IGN_SERR_INTERNAL | 433 AHCI_HFLAGS (AHCI_HFLAG_IGN_SERR_INTERNAL |
433 AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI | 434 AHCI_HFLAG_NO_MSI | AHCI_HFLAG_SECT255),
434 AHCI_HFLAG_SECT255),
435 .flags = AHCI_FLAG_COMMON, 435 .flags = AHCI_FLAG_COMMON,
436 .pio_mask = ATA_PIO4, 436 .pio_mask = ATA_PIO4,
437 .udma_mask = ATA_UDMA6, 437 .udma_mask = ATA_UDMA6,
438 .port_ops = &ahci_sb600_ops, 438 .port_ops = &ahci_sb600_ops,
439 }, 439 },
440 /* board_ahci_mv */ 440 [board_ahci_mv] =
441 { 441 {
442 AHCI_HFLAGS (AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_MSI | 442 AHCI_HFLAGS (AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_MSI |
443 AHCI_HFLAG_MV_PATA | AHCI_HFLAG_NO_PMP), 443 AHCI_HFLAG_MV_PATA | AHCI_HFLAG_NO_PMP),
@@ -447,7 +447,7 @@ static const struct ata_port_info ahci_port_info[] = {
447 .udma_mask = ATA_UDMA6, 447 .udma_mask = ATA_UDMA6,
448 .port_ops = &ahci_ops, 448 .port_ops = &ahci_ops,
449 }, 449 },
450 /* board_ahci_sb700, for SB700 and SB800 */ 450 [board_ahci_sb700] = /* for SB700 and SB800 */
451 { 451 {
452 AHCI_HFLAGS (AHCI_HFLAG_IGN_SERR_INTERNAL), 452 AHCI_HFLAGS (AHCI_HFLAG_IGN_SERR_INTERNAL),
453 .flags = AHCI_FLAG_COMMON, 453 .flags = AHCI_FLAG_COMMON,
@@ -455,7 +455,7 @@ static const struct ata_port_info ahci_port_info[] = {
455 .udma_mask = ATA_UDMA6, 455 .udma_mask = ATA_UDMA6,
456 .port_ops = &ahci_sb600_ops, 456 .port_ops = &ahci_sb600_ops,
457 }, 457 },
458 /* board_ahci_mcp65 */ 458 [board_ahci_mcp65] =
459 { 459 {
460 AHCI_HFLAGS (AHCI_HFLAG_YES_NCQ), 460 AHCI_HFLAGS (AHCI_HFLAG_YES_NCQ),
461 .flags = AHCI_FLAG_COMMON, 461 .flags = AHCI_FLAG_COMMON,
@@ -463,7 +463,7 @@ static const struct ata_port_info ahci_port_info[] = {
463 .udma_mask = ATA_UDMA6, 463 .udma_mask = ATA_UDMA6,
464 .port_ops = &ahci_ops, 464 .port_ops = &ahci_ops,
465 }, 465 },
466 /* board_ahci_nopmp */ 466 [board_ahci_nopmp] =
467 { 467 {
468 AHCI_HFLAGS (AHCI_HFLAG_NO_PMP), 468 AHCI_HFLAGS (AHCI_HFLAG_NO_PMP),
469 .flags = AHCI_FLAG_COMMON, 469 .flags = AHCI_FLAG_COMMON,
@@ -1141,12 +1141,12 @@ static void ahci_start_port(struct ata_port *ap)
1141 emp = &pp->em_priv[link->pmp]; 1141 emp = &pp->em_priv[link->pmp];
1142 1142
1143 /* EM Transmit bit maybe busy during init */ 1143 /* EM Transmit bit maybe busy during init */
1144 for (i = 0; i < MAX_RETRY; i++) { 1144 for (i = 0; i < EM_MAX_RETRY; i++) {
1145 rc = ahci_transmit_led_message(ap, 1145 rc = ahci_transmit_led_message(ap,
1146 emp->led_state, 1146 emp->led_state,
1147 4); 1147 4);
1148 if (rc == -EBUSY) 1148 if (rc == -EBUSY)
1149 udelay(100); 1149 msleep(1);
1150 else 1150 else
1151 break; 1151 break;
1152 } 1152 }
@@ -1340,7 +1340,7 @@ static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
1340 1340
1341 /* get the slot number from the message */ 1341 /* get the slot number from the message */
1342 pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8; 1342 pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
1343 if (pmp < MAX_SLOTS) 1343 if (pmp < EM_MAX_SLOTS)
1344 emp = &pp->em_priv[pmp]; 1344 emp = &pp->em_priv[pmp];
1345 else 1345 else
1346 return -EINVAL; 1346 return -EINVAL;
@@ -1408,7 +1408,7 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
1408 1408
1409 /* get the slot number from the message */ 1409 /* get the slot number from the message */
1410 pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8; 1410 pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
1411 if (pmp < MAX_SLOTS) 1411 if (pmp < EM_MAX_SLOTS)
1412 emp = &pp->em_priv[pmp]; 1412 emp = &pp->em_priv[pmp];
1413 else 1413 else
1414 return -EINVAL; 1414 return -EINVAL;
@@ -2584,6 +2584,51 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
2584 } 2584 }
2585} 2585}
2586 2586
2587/*
2588 * SB600 ahci controller on ASUS M2A-VM can't do 64bit DMA with older
2589 * BIOS. The oldest version known to be broken is 0901 and working is
2590 * 1501 which was released on 2007-10-26. Force 32bit DMA on anything
2591 * older than 1501. Please read bko#9412 for more info.
2592 */
/*
 * Returns true when @pdev should be forced to 32bit DMA, false otherwise.
 * A warning is logged on the true path.
 */
2593static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
2594{
/* DMI match table: board vendor and board name of the affected board */
2595 static const struct dmi_system_id sysids[] = {
2596 {
2597 .ident = "ASUS M2A-VM",
2598 .matches = {
2599 DMI_MATCH(DMI_BOARD_VENDOR,
2600 "ASUSTeK Computer INC."),
2601 DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"),
2602 },
2603 },
2604 { }
2605 };
/* month/day prefix compared against the first five characters of the BIOS date */
2606 const char *cutoff_mmdd = "10/26";
2607 const char *date;
2608 int year;
2609
/* quirk applies only to the device at bus 0, devfn PCI_DEVFN(0x12, 0) on a matching board */
2610 if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) ||
2611 !dmi_check_system(sysids))
2612 return false;
2613
2614 /*
2615 * Argh.... both version and date are free form strings.
2616 * Let's hope they're using the same date format across
2617 * different versions.
2618 */
2619 date = dmi_get_system_info(DMI_BIOS_DATE);
2620 year = dmi_get_year(DMI_BIOS_DATE);
/*
 * The BIOS counts as new enough (no quirk) only if the date string is at
 * least 10 characters with '/' at offsets 2 and 5, and either the year
 * is after 2007 or it is 2007 and the first five characters compare
 * >= "10/26".  A missing or differently shaped date falls through to
 * the quirk.  The string comparison presumably relies on a zero-padded
 * MM/DD/YYYY layout -- confirm.
 */
2621 if (date && strlen(date) >= 10 && date[2] == '/' && date[5] == '/' &&
2622 (year > 2007 ||
2623 (year == 2007 && strncmp(date, cutoff_mmdd, 5) >= 0)))
2624 return false;
2625
2626 dev_printk(KERN_WARNING, &pdev->dev, "ASUS M2A-VM: BIOS too old, "
2627 "forcing 32bit DMA, update BIOS\n");
2628
2629 return true;
2630}
2631
2587static bool ahci_broken_system_poweroff(struct pci_dev *pdev) 2632static bool ahci_broken_system_poweroff(struct pci_dev *pdev)
2588{ 2633{
2589 static const struct dmi_system_id broken_systems[] = { 2634 static const struct dmi_system_id broken_systems[] = {
@@ -2744,6 +2789,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
2744 if (board_id == board_ahci_sb700 && pdev->revision >= 0x40) 2789 if (board_id == board_ahci_sb700 && pdev->revision >= 0x40)
2745 hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL; 2790 hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL;
2746 2791
2792 /* apply ASUS M2A_VM quirk */
2793 if (ahci_asus_m2a_vm_32bit_only(pdev))
2794 hpriv->flags |= AHCI_HFLAG_32BIT_ONLY;
2795
2747 if (!(hpriv->flags & AHCI_HFLAG_NO_MSI)) 2796 if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
2748 pci_enable_msi(pdev); 2797 pci_enable_msi(pdev);
2749 2798
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 1aeb7082b0c4..d0a14cf2bd74 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -223,10 +223,8 @@ static const struct pci_device_id piix_pci_tbl[] = {
223 /* ICH8 Mobile PATA Controller */ 223 /* ICH8 Mobile PATA Controller */
224 { 0x8086, 0x2850, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich_pata_100 }, 224 { 0x8086, 0x2850, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich_pata_100 },
225 225
226 /* NOTE: The following PCI ids must be kept in sync with the 226 /* SATA ports */
227 * list in drivers/pci/quirks.c. 227
228 */
229
230 /* 82801EB (ICH5) */ 228 /* 82801EB (ICH5) */
231 { 0x8086, 0x24d1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich5_sata }, 229 { 0x8086, 0x24d1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich5_sata },
232 /* 82801EB (ICH5) */ 230 /* 82801EB (ICH5) */
@@ -1509,8 +1507,8 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
1509 dev_printk(KERN_DEBUG, &pdev->dev, 1507 dev_printk(KERN_DEBUG, &pdev->dev,
1510 "version " DRV_VERSION "\n"); 1508 "version " DRV_VERSION "\n");
1511 1509
1512 /* no hotplugging support (FIXME) */ 1510 /* no hotplugging support for later devices (FIXME) */
1513 if (!in_module_init) 1511 if (!in_module_init && ent->driver_data >= ich5_sata)
1514 return -ENODEV; 1512 return -ENODEV;
1515 1513
1516 if (piix_broken_system_poweroff(pdev)) { 1514 if (piix_broken_system_poweroff(pdev)) {
@@ -1591,6 +1589,7 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
1591 host->ports[1]->mwdma_mask = 0; 1589 host->ports[1]->mwdma_mask = 0;
1592 host->ports[1]->udma_mask = 0; 1590 host->ports[1]->udma_mask = 0;
1593 } 1591 }
1592 host->flags |= ATA_HOST_PARALLEL_SCAN;
1594 1593
1595 pci_set_master(pdev); 1594 pci_set_master(pdev);
1596 return ata_pci_sff_activate_host(host, ata_sff_interrupt, &piix_sht); 1595 return ata_pci_sff_activate_host(host, ata_sff_interrupt, &piix_sht);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index c9242301cfa1..ca4d208ddf3b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5031,7 +5031,6 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
5031{ 5031{
5032 int nr_done = 0; 5032 int nr_done = 0;
5033 u32 done_mask; 5033 u32 done_mask;
5034 int i;
5035 5034
5036 done_mask = ap->qc_active ^ qc_active; 5035 done_mask = ap->qc_active ^ qc_active;
5037 5036
@@ -5041,16 +5040,16 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
5041 return -EINVAL; 5040 return -EINVAL;
5042 } 5041 }
5043 5042
5044 for (i = 0; i < ATA_MAX_QUEUE; i++) { 5043 while (done_mask) {
5045 struct ata_queued_cmd *qc; 5044 struct ata_queued_cmd *qc;
5045 unsigned int tag = __ffs(done_mask);
5046 5046
5047 if (!(done_mask & (1 << i))) 5047 qc = ata_qc_from_tag(ap, tag);
5048 continue; 5048 if (qc) {
5049
5050 if ((qc = ata_qc_from_tag(ap, i))) {
5051 ata_qc_complete(qc); 5049 ata_qc_complete(qc);
5052 nr_done++; 5050 nr_done++;
5053 } 5051 }
5052 done_mask &= ~(1 << tag);
5054 } 5053 }
5055 5054
5056 return nr_done; 5055 return nr_done;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bb18415d3d63..bbbb1fab1755 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -727,17 +727,23 @@ unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf,
727 else 727 else
728 iowrite16_rep(data_addr, buf, words); 728 iowrite16_rep(data_addr, buf, words);
729 729
730 /* Transfer trailing 1 byte, if any. */ 730 /* Transfer trailing byte, if any. */
731 if (unlikely(buflen & 0x01)) { 731 if (unlikely(buflen & 0x01)) {
732 __le16 align_buf[1] = { 0 }; 732 unsigned char pad[2];
733 unsigned char *trailing_buf = buf + buflen - 1;
734 733
734 /* Point buf to the tail of buffer */
735 buf += buflen - 1;
736
737 /*
738 * Use io*16_rep() accessors here as well to avoid pointlessly
739 * swapping bytes to and fro on the big endian machines...
740 */
735 if (rw == READ) { 741 if (rw == READ) {
736 align_buf[0] = cpu_to_le16(ioread16(data_addr)); 742 ioread16_rep(data_addr, pad, 1);
737 memcpy(trailing_buf, align_buf, 1); 743 *buf = pad[0];
738 } else { 744 } else {
739 memcpy(align_buf, trailing_buf, 1); 745 pad[0] = *buf;
740 iowrite16(le16_to_cpu(align_buf[0]), data_addr); 746 iowrite16_rep(data_addr, pad, 1);
741 } 747 }
742 words++; 748 words++;
743 } 749 }
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 6cda12ba8122..b2d11f300c39 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -305,8 +305,8 @@ static irqreturn_t nv_ck804_interrupt(int irq, void *dev_instance);
305static int nv_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val); 305static int nv_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
306static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val); 306static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val);
307 307
308static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class, 308static int nv_hardreset(struct ata_link *link, unsigned int *class,
309 unsigned long deadline); 309 unsigned long deadline);
310static void nv_nf2_freeze(struct ata_port *ap); 310static void nv_nf2_freeze(struct ata_port *ap);
311static void nv_nf2_thaw(struct ata_port *ap); 311static void nv_nf2_thaw(struct ata_port *ap);
312static void nv_ck804_freeze(struct ata_port *ap); 312static void nv_ck804_freeze(struct ata_port *ap);
@@ -406,49 +406,82 @@ static struct scsi_host_template nv_swncq_sht = {
406 .slave_configure = nv_swncq_slave_config, 406 .slave_configure = nv_swncq_slave_config,
407}; 407};
408 408
409static struct ata_port_operations nv_common_ops = { 409/*
410 * NV SATA controllers have various different problems with hardreset
411 * protocol depending on the specific controller and device.
412 *
413 * GENERIC:
414 *
415 * bko11195 reports that link doesn't come online after hardreset on
416 * generic nv's and there have been several other similar reports on
417 * linux-ide.
418 *
419 * bko12351#c23 reports that warmplug on MCP61 doesn't work with
420 * softreset.
421 *
422 * NF2/3:
423 *
424 * bko3352 reports nf2/3 controllers can't determine device signature
425 * reliably after hardreset. The following thread reports detection
426 * failure on cold boot with the standard debouncing timing.
427 *
428 * http://thread.gmane.org/gmane.linux.ide/34098
429 *
430 * bko12176 reports that hardreset fails to bring up the link during
431 * boot on nf2.
432 *
433 * CK804:
434 *
435 * For initial probing after boot and hot plugging, hardreset mostly
436 * works fine on CK804 but curiously, reprobing on the initial port
437 * by rescanning or rmmod/insmod fails to acquire the initial D2H Reg
438 * FIS in somewhat undeterministic way.
439 *
440 * SWNCQ:
441 *
442 * bko12351 reports that when SWNCQ is enabled, for hotplug to work,
443 * hardreset should be used and hardreset can't report proper
444 * signature, which suggests that mcp5x is closer to nf2 as long as
445 * reset quirkiness is concerned.
446 *
447 * bko12703 reports that boot probing fails for intel SSD with
448 * hardreset. Link fails to come online. Softreset works fine.
449 *
450 * The failures are varied but the following patterns seem true for
451 * all flavors.
452 *
453 * - Softreset during boot always works.
454 *
455 * - Hardreset during boot sometimes fails to bring up the link on
456 * certain comibnations and device signature acquisition is
457 * unreliable.
458 *
459 * - Hardreset is often necessary after hotplug.
460 *
461 * So, preferring softreset for boot probing and error handling (as
462 * hardreset might bring down the link) but using hardreset for
463 * post-boot probing should work around the above issues in most
464 * cases. Define nv_hardreset() which only kicks in for post-boot
465 * probing and use it for all variants.
466 */
467static struct ata_port_operations nv_generic_ops = {
410 .inherits = &ata_bmdma_port_ops, 468 .inherits = &ata_bmdma_port_ops,
411 .lost_interrupt = ATA_OP_NULL, 469 .lost_interrupt = ATA_OP_NULL,
412 .scr_read = nv_scr_read, 470 .scr_read = nv_scr_read,
413 .scr_write = nv_scr_write, 471 .scr_write = nv_scr_write,
472 .hardreset = nv_hardreset,
414}; 473};
415 474
416/* OSDL bz11195 reports that link doesn't come online after hardreset
417 * on generic nv's and there have been several other similar reports
418 * on linux-ide. Disable hardreset for generic nv's.
419 */
420static struct ata_port_operations nv_generic_ops = {
421 .inherits = &nv_common_ops,
422 .hardreset = ATA_OP_NULL,
423};
424
425/* nf2 is ripe with hardreset related problems.
426 *
427 * kernel bz#3352 reports nf2/3 controllers can't determine device
428 * signature reliably. The following thread reports detection failure
429 * on cold boot with the standard debouncing timing.
430 *
431 * http://thread.gmane.org/gmane.linux.ide/34098
432 *
433 * And bz#12176 reports that hardreset simply doesn't work on nf2.
434 * Give up on it and just don't do hardreset.
435 */
436static struct ata_port_operations nv_nf2_ops = { 475static struct ata_port_operations nv_nf2_ops = {
437 .inherits = &nv_generic_ops, 476 .inherits = &nv_generic_ops,
438 .freeze = nv_nf2_freeze, 477 .freeze = nv_nf2_freeze,
439 .thaw = nv_nf2_thaw, 478 .thaw = nv_nf2_thaw,
440}; 479};
441 480
442/* For initial probing after boot and hot plugging, hardreset mostly
443 * works fine on CK804 but curiously, reprobing on the initial port by
444 * rescanning or rmmod/insmod fails to acquire the initial D2H Reg FIS
445 * in somewhat undeterministic way. Use noclassify hardreset.
446 */
447static struct ata_port_operations nv_ck804_ops = { 481static struct ata_port_operations nv_ck804_ops = {
448 .inherits = &nv_common_ops, 482 .inherits = &nv_generic_ops,
449 .freeze = nv_ck804_freeze, 483 .freeze = nv_ck804_freeze,
450 .thaw = nv_ck804_thaw, 484 .thaw = nv_ck804_thaw,
451 .hardreset = nv_noclassify_hardreset,
452 .host_stop = nv_ck804_host_stop, 485 .host_stop = nv_ck804_host_stop,
453}; 486};
454 487
@@ -476,19 +509,8 @@ static struct ata_port_operations nv_adma_ops = {
476 .host_stop = nv_adma_host_stop, 509 .host_stop = nv_adma_host_stop,
477}; 510};
478 511
479/* Kernel bz#12351 reports that when SWNCQ is enabled, for hotplug to
480 * work, hardreset should be used and hardreset can't report proper
481 * signature, which suggests that mcp5x is closer to nf2 as long as
482 * reset quirkiness is concerned. Define separate ops for mcp5x with
483 * nv_noclassify_hardreset().
484 */
485static struct ata_port_operations nv_mcp5x_ops = {
486 .inherits = &nv_common_ops,
487 .hardreset = nv_noclassify_hardreset,
488};
489
490static struct ata_port_operations nv_swncq_ops = { 512static struct ata_port_operations nv_swncq_ops = {
491 .inherits = &nv_mcp5x_ops, 513 .inherits = &nv_generic_ops,
492 514
493 .qc_defer = ata_std_qc_defer, 515 .qc_defer = ata_std_qc_defer,
494 .qc_prep = nv_swncq_qc_prep, 516 .qc_prep = nv_swncq_qc_prep,
@@ -557,7 +579,7 @@ static const struct ata_port_info nv_port_info[] = {
557 .pio_mask = NV_PIO_MASK, 579 .pio_mask = NV_PIO_MASK,
558 .mwdma_mask = NV_MWDMA_MASK, 580 .mwdma_mask = NV_MWDMA_MASK,
559 .udma_mask = NV_UDMA_MASK, 581 .udma_mask = NV_UDMA_MASK,
560 .port_ops = &nv_mcp5x_ops, 582 .port_ops = &nv_generic_ops,
561 .private_data = NV_PI_PRIV(nv_generic_interrupt, &nv_sht), 583 .private_data = NV_PI_PRIV(nv_generic_interrupt, &nv_sht),
562 }, 584 },
563 /* SWNCQ */ 585 /* SWNCQ */
@@ -1559,15 +1581,24 @@ static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val)
1559 return 0; 1581 return 0;
1560} 1582}
1561 1583
1562static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class, 1584static int nv_hardreset(struct ata_link *link, unsigned int *class,
1563 unsigned long deadline) 1585 unsigned long deadline)
1564{ 1586{
1565 bool online; 1587 struct ata_eh_context *ehc = &link->eh_context;
1566 int rc;
1567 1588
1568 rc = sata_link_hardreset(link, sata_deb_timing_hotplug, deadline, 1589 /* Do hardreset iff it's post-boot probing, please read the
1569 &online, NULL); 1590 * comment above port ops for details.
1570 return online ? -EAGAIN : rc; 1591 */
1592 if (!(link->ap->pflags & ATA_PFLAG_LOADING) &&
1593 !ata_dev_enabled(link->device))
1594 sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
1595 NULL, NULL);
1596 else if (!(ehc->i.flags & ATA_EHI_QUIET))
1597 ata_link_printk(link, KERN_INFO,
1598 "nv: skipping hardreset on occupied port\n");
1599
1600 /* device signature acquisition is unreliable */
1601 return -EAGAIN;
1571} 1602}
1572 1603
1573static void nv_nf2_freeze(struct ata_port *ap) 1604static void nv_nf2_freeze(struct ata_port *ap)
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index e67ce8e5caa5..030ec079b184 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -183,7 +183,7 @@ static struct scsi_host_template sil_sht = {
183}; 183};
184 184
185static struct ata_port_operations sil_ops = { 185static struct ata_port_operations sil_ops = {
186 .inherits = &ata_bmdma_port_ops, 186 .inherits = &ata_bmdma32_port_ops,
187 .dev_config = sil_dev_config, 187 .dev_config = sil_dev_config,
188 .set_mode = sil_set_mode, 188 .set_mode = sil_set_mode,
189 .bmdma_setup = sil_bmdma_setup, 189 .bmdma_setup = sil_bmdma_setup,
diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
index eb05a3c82a9e..bbcf970068ad 100644
--- a/drivers/ata/sata_sx4.c
+++ b/drivers/ata/sata_sx4.c
@@ -193,6 +193,7 @@ enum {
193 PDC_TIMER_MASK_INT, 193 PDC_TIMER_MASK_INT,
194}; 194};
195 195
196#define ECC_ERASE_BUF_SZ (128 * 1024)
196 197
197struct pdc_port_priv { 198struct pdc_port_priv {
198 u8 dimm_buf[(ATA_PRD_SZ * ATA_MAX_PRD) + 512]; 199 u8 dimm_buf[(ATA_PRD_SZ * ATA_MAX_PRD) + 512];
@@ -1280,7 +1281,6 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
1280{ 1281{
1281 int speed, size, length; 1282 int speed, size, length;
1282 u32 addr, spd0, pci_status; 1283 u32 addr, spd0, pci_status;
1283 u32 tmp = 0;
1284 u32 time_period = 0; 1284 u32 time_period = 0;
1285 u32 tcount = 0; 1285 u32 tcount = 0;
1286 u32 ticks = 0; 1286 u32 ticks = 0;
@@ -1395,14 +1395,17 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
1395 pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS, 1395 pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
1396 PDC_DIMM_SPD_TYPE, &spd0); 1396 PDC_DIMM_SPD_TYPE, &spd0);
1397 if (spd0 == 0x02) { 1397 if (spd0 == 0x02) {
1398 void *buf;
1398 VPRINTK("Start ECC initialization\n"); 1399 VPRINTK("Start ECC initialization\n");
1399 addr = 0; 1400 addr = 0;
1400 length = size * 1024 * 1024; 1401 length = size * 1024 * 1024;
1402 buf = kzalloc(ECC_ERASE_BUF_SZ, GFP_KERNEL);
1401 while (addr < length) { 1403 while (addr < length) {
1402 pdc20621_put_to_dimm(host, (void *) &tmp, addr, 1404 pdc20621_put_to_dimm(host, buf, addr,
1403 sizeof(u32)); 1405 ECC_ERASE_BUF_SZ);
1404 addr += sizeof(u32); 1406 addr += ECC_ERASE_BUF_SZ;
1405 } 1407 }
1408 kfree(buf);
1406 VPRINTK("Finish ECC initialization\n"); 1409 VPRINTK("Finish ECC initialization\n");
1407 } 1410 }
1408 return 0; 1411 return 0;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index d6a807f4077d..39a05b5fa9cb 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
243 struct pt_regs *regs = get_irq_regs(); 244 struct pt_regs *regs = get_irq_regs();
244 if (regs) 245 if (regs)
245 show_regs(regs); 246 show_regs(regs);
247 perf_counter_print_debug();
246} 248}
247static struct sysrq_key_op sysrq_showregs_op = { 249static struct sysrq_key_op sysrq_showregs_op = {
248 .handler = sysrq_handle_showregs, 250 .handler = sysrq_handle_showregs,
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4de489..de9ebee8657b 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
95#include <linux/timer.h> 95#include <linux/timer.h>
96#include <linux/interrupt.h> 96#include <linux/interrupt.h>
97#include <linux/workqueue.h> 97#include <linux/workqueue.h>
98#include <linux/bootmem.h>
99#include <linux/pm.h> 98#include <linux/pm.h>
100#include <linux/font.h> 99#include <linux/font.h>
101#include <linux/bitops.h> 100#include <linux/bitops.h>
@@ -104,6 +103,7 @@
104#include <linux/io.h> 103#include <linux/io.h>
105#include <asm/system.h> 104#include <asm/system.h>
106#include <linux/uaccess.h> 105#include <linux/uaccess.h>
106#include <linux/kmemleak.h>
107 107
108#define MAX_NR_CON_DRIVER 16 108#define MAX_NR_CON_DRIVER 16
109 109
@@ -2875,14 +2875,11 @@ static int __init con_init(void)
2875 mod_timer(&console_timer, jiffies + blankinterval); 2875 mod_timer(&console_timer, jiffies + blankinterval);
2876 } 2876 }
2877 2877
2878 /*
2879 * kmalloc is not running yet - we use the bootmem allocator.
2880 */
2881 for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { 2878 for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
2882 vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data)); 2879 vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
2883 INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); 2880 INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
2884 visual_init(vc, currcons, 1); 2881 visual_init(vc, currcons, 1);
2885 vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size); 2882 vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
2886 vc->vc_kmalloced = 0; 2883 vc->vc_kmalloced = 0;
2887 vc_init(vc, vc->vc_rows, vc->vc_cols, 2884 vc_init(vc, vc->vc_rows, vc->vc_cols,
2888 currcons || !vc->vc_sw->con_save_screen); 2885 currcons || !vc->vc_sw->con_save_screen);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 5f1b5400d96a..24c84ae81527 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -596,6 +596,7 @@ int dmi_get_year(int field)
596 596
597 return year; 597 return year;
598} 598}
599EXPORT_SYMBOL(dmi_get_year);
599 600
600/** 601/**
601 * dmi_walk - Walk the DMI table and get called back for every record 602 * dmi_walk - Walk the DMI table and get called back for every record
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 38e86b84dce0..59d7d5ec17a4 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -180,7 +180,7 @@ static inline void vga_set_mem_top(struct vc_data *c)
180} 180}
181 181
182#ifdef CONFIG_VGACON_SOFT_SCROLLBACK 182#ifdef CONFIG_VGACON_SOFT_SCROLLBACK
183#include <linux/bootmem.h> 183#include <linux/slab.h>
184/* software scrollback */ 184/* software scrollback */
185static void *vgacon_scrollback; 185static void *vgacon_scrollback;
186static int vgacon_scrollback_tail; 186static int vgacon_scrollback_tail;
@@ -210,8 +210,7 @@ static void vgacon_scrollback_init(int pitch)
210 */ 210 */
211static void __init_refok vgacon_scrollback_startup(void) 211static void __init_refok vgacon_scrollback_startup(void)
212{ 212{
213 vgacon_scrollback = alloc_bootmem(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE 213 vgacon_scrollback = kcalloc(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE, 1024, GFP_NOWAIT);
214 * 1024);
215 vgacon_scrollback_init(vga_video_num_columns * 2); 214 vgacon_scrollback_init(vga_video_num_columns * 2);
216} 215}
217 216
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2dfc6cdcebbe..931f6b8c4b2f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include "internal.h" 30#include "internal.h"
30 31
@@ -492,6 +493,11 @@ void __init bdev_cache_init(void)
492 bd_mnt = kern_mount(&bd_type); 493 bd_mnt = kern_mount(&bd_type);
493 if (IS_ERR(bd_mnt)) 494 if (IS_ERR(bd_mnt))
494 panic("Cannot create bdev pseudo-fs"); 495 panic("Cannot create bdev pseudo-fs");
496 /*
497 * This vfsmount structure is only used to obtain the
498 * blockdev_superblock, so tell kmemleak not to report it.
499 */
500 kmemleak_not_leak(bd_mnt);
495 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 501 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
496} 502}
497 503
diff --git a/fs/exec.c b/fs/exec.c
index a7fcd975c6b2..e639957d7a57 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
922 task_lock(tsk); 923 task_lock(tsk);
923 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 924 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
924 task_unlock(tsk); 925 task_unlock(tsk);
926 perf_counter_comm(tsk);
925} 927}
926 928
927int flush_old_exec(struct linux_binprm * bprm) 929int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
990 992
991 current->personality &= ~bprm->per_clear; 993 current->personality &= ~bprm->per_clear;
992 994
995 /*
996 * Flush performance counters when crossing a
997 * security domain:
998 */
999 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current);
1001
993 /* An exec changes our domain. We are no longer part of the thread 1002 /* An exec changes our domain. We are no longer part of the thread
994 group */ 1003 group */
995 1004
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edc..0fc30407f039 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2571 2571
2572 txAbort(tid, 0); 2572 txAbort(tid, 0);
2573 txEnd(tid); 2573 txEnd(tid);
2574 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2574 2575
2575 /* release the inode map lock */ 2576 /* release the inode map lock */
2576 IWRITE_UNLOCK(ipimap); 2577 IWRITE_UNLOCK(ipimap);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479a..d9b0e92b3602 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
720 blk++; 720 blk++;
721 } 721 }
722out: 722out:
723 if (len == towrite) 723 if (len == towrite) {
724 mutex_unlock(&inode->i_mutex);
724 return err; 725 return err;
726 }
725 if (inode->i_size < off+len-towrite) 727 if (inode->i_size < off+len-towrite)
726 i_size_write(inode, off+len-towrite); 728 i_size_write(inode, off+len-towrite);
727 inode->i_version++; 729 inode->i_version++;
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 3673a13b6703..81d3be459efb 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
134#define atomic_long_cmpxchg(l, old, new) \ 134#define atomic_long_cmpxchg(l, old, new) \
135 (atomic64_cmpxchg((atomic64_t *)(l), (old), (new))) 135 (atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))
136#define atomic_long_xchg(v, new) \ 136#define atomic_long_xchg(v, new) \
137 (atomic64_xchg((atomic64_t *)(l), (new))) 137 (atomic64_xchg((atomic64_t *)(v), (new)))
138 138
139#else /* BITS_PER_LONG == 64 */ 139#else /* BITS_PER_LONG == 64 */
140 140
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6646bfc7b892..28b1f30601b5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,15 @@ extern struct group_info init_groups;
108 108
109extern struct cred init_cred; 109extern struct cred init_cred;
110 110
111#ifdef CONFIG_PERF_COUNTERS
112# define INIT_PERF_COUNTERS(tsk) \
113 .perf_counter_mutex = \
114 __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \
115 .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
116#else
117# define INIT_PERF_COUNTERS(tsk)
118#endif
119
111/* 120/*
112 * INIT_TASK is used to set up the first task table, touch at 121 * INIT_TASK is used to set up the first task table, touch at
113 * your own risk!. Base=0, limit=0x1fffff (=2MB) 122 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +180,7 @@ extern struct cred init_cred;
171 }, \ 180 }, \
172 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ 181 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \
173 INIT_IDS \ 182 INIT_IDS \
183 INIT_PERF_COUNTERS(tsk) \
174 INIT_TRACE_IRQFLAGS \ 184 INIT_TRACE_IRQFLAGS \
175 INIT_LOCKDEP \ 185 INIT_LOCKDEP \
176 INIT_FTRACE_GRAPH \ 186 INIT_FTRACE_GRAPH \
diff --git a/include/linux/irq.h b/include/linux/irq.h
index eedbb8e5e0cc..1e50c34f0062 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
430 * Returns true if successful (or not required). 430 * Returns true if successful (or not required).
431 */ 431 */
432static inline bool alloc_desc_masks(struct irq_desc *desc, int node, 432static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
433 bool boot) 433 bool boot)
434{ 434{
435#ifdef CONFIG_CPUMASK_OFFSTACK 435 gfp_t gfp = GFP_ATOMIC;
436 if (boot) {
437 alloc_bootmem_cpumask_var(&desc->affinity);
438 436
439#ifdef CONFIG_GENERIC_PENDING_IRQ 437 if (boot)
440 alloc_bootmem_cpumask_var(&desc->pending_mask); 438 gfp = GFP_NOWAIT;
441#endif
442 return true;
443 }
444 439
445 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) 440#ifdef CONFIG_CPUMASK_OFFSTACK
441 if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
446 return false; 442 return false;
447 443
448#ifdef CONFIG_GENERIC_PENDING_IRQ 444#ifdef CONFIG_GENERIC_PENDING_IRQ
449 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { 445 if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
450 free_cpumask_var(desc->affinity); 446 free_cpumask_var(desc->affinity);
451 return false; 447 return false;
452 } 448 }
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0c8b89f28a95..a77c6007dc99 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
81 return sum; 81 return sum;
82} 82}
83 83
84
85/*
86 * Lock/unlock the current runqueue - to extract task statistics:
87 */
84extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
85extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
86extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
87extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
new file mode 100644
index 000000000000..7796aed6cdd5
--- /dev/null
+++ b/include/linux/kmemleak.h
@@ -0,0 +1,96 @@
1/*
2 * include/linux/kmemleak.h
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#ifndef __KMEMLEAK_H
22#define __KMEMLEAK_H
23
24#ifdef CONFIG_DEBUG_KMEMLEAK
25
26extern void kmemleak_init(void);
27extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
28 gfp_t gfp);
29extern void kmemleak_free(const void *ptr);
30extern void kmemleak_padding(const void *ptr, unsigned long offset,
31 size_t size);
32extern void kmemleak_not_leak(const void *ptr);
33extern void kmemleak_ignore(const void *ptr);
34extern void kmemleak_scan_area(const void *ptr, unsigned long offset,
35 size_t length, gfp_t gfp);
36extern void kmemleak_no_scan(const void *ptr);
37
38static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
39 int min_count, unsigned long flags,
40 gfp_t gfp)
41{
42 if (!(flags & SLAB_NOLEAKTRACE))
43 kmemleak_alloc(ptr, size, min_count, gfp);
44}
45
46static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
47{
48 if (!(flags & SLAB_NOLEAKTRACE))
49 kmemleak_free(ptr);
50}
51
52static inline void kmemleak_erase(void **ptr)
53{
54 *ptr = NULL;
55}
56
57#else
58
59static inline void kmemleak_init(void)
60{
61}
62static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
63 gfp_t gfp)
64{
65}
66static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
67 int min_count, unsigned long flags,
68 gfp_t gfp)
69{
70}
71static inline void kmemleak_free(const void *ptr)
72{
73}
74static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
75{
76}
77static inline void kmemleak_not_leak(const void *ptr)
78{
79}
80static inline void kmemleak_ignore(const void *ptr)
81{
82}
83static inline void kmemleak_scan_area(const void *ptr, unsigned long offset,
84 size_t length, gfp_t gfp)
85{
86}
87static inline void kmemleak_erase(void **ptr)
88{
89}
90static inline void kmemleak_no_scan(const void *ptr)
91{
92}
93
94#endif /* CONFIG_DEBUG_KMEMLEAK */
95
96#endif /* __KMEMLEAK_H */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1581ff235c7e..26fd9d12f050 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -86,7 +86,12 @@ struct percpu_data {
86 void *ptrs[1]; 86 void *ptrs[1];
87}; 87};
88 88
89/* pointer disguising messes up the kmemleak objects tracking */
90#ifndef CONFIG_DEBUG_KMEMLEAK
89#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) 91#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
92#else
93#define __percpu_disguise(pdata) (struct percpu_data *)(pdata)
94#endif
90 95
91#define per_cpu_ptr(ptr, cpu) \ 96#define per_cpu_ptr(ptr, cpu) \
92({ \ 97({ \
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..6e133954e2e4
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,697 @@
1/*
2 * Performance counters:
3 *
4 * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
7 *
8 * Data type definitions, declarations, prototypes.
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#ifndef _LINUX_PERF_COUNTER_H
15#define _LINUX_PERF_COUNTER_H
16
17#include <linux/types.h>
18#include <linux/ioctl.h>
19#include <asm/byteorder.h>
20
21/*
22 * User-space ABI bits:
23 */
24
25/*
26 * attr.type
27 */
28enum perf_type_id {
29 PERF_TYPE_HARDWARE = 0,
30 PERF_TYPE_SOFTWARE = 1,
31 PERF_TYPE_TRACEPOINT = 2,
32 PERF_TYPE_HW_CACHE = 3,
33 PERF_TYPE_RAW = 4,
34
35 PERF_TYPE_MAX, /* non-ABI */
36};
37
38/*
39 * Generalized performance counter event types, used in the
40 * attr.config field of the sys_perf_counter_open()
41 * syscall:
42 */
43enum perf_hw_id {
44 /*
45 * Common hardware events, generalized by the kernel:
46 */
47 PERF_COUNT_HW_CPU_CYCLES = 0,
48 PERF_COUNT_HW_INSTRUCTIONS = 1,
49 PERF_COUNT_HW_CACHE_REFERENCES = 2,
50 PERF_COUNT_HW_CACHE_MISSES = 3,
51 PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4,
52 PERF_COUNT_HW_BRANCH_MISSES = 5,
53 PERF_COUNT_HW_BUS_CYCLES = 6,
54
55 PERF_COUNT_HW_MAX, /* non-ABI */
56};
57
58/*
59 * Generalized hardware cache counters:
60 *
61 * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
62 * { read, write, prefetch } x
63 * { accesses, misses }
64 */
65enum perf_hw_cache_id {
66 PERF_COUNT_HW_CACHE_L1D = 0,
67 PERF_COUNT_HW_CACHE_L1I = 1,
68 PERF_COUNT_HW_CACHE_LL = 2,
69 PERF_COUNT_HW_CACHE_DTLB = 3,
70 PERF_COUNT_HW_CACHE_ITLB = 4,
71 PERF_COUNT_HW_CACHE_BPU = 5,
72
73 PERF_COUNT_HW_CACHE_MAX, /* non-ABI */
74};
75
76enum perf_hw_cache_op_id {
77 PERF_COUNT_HW_CACHE_OP_READ = 0,
78 PERF_COUNT_HW_CACHE_OP_WRITE = 1,
79 PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,
80
81 PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */
82};
83
84enum perf_hw_cache_op_result_id {
85 PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0,
86 PERF_COUNT_HW_CACHE_RESULT_MISS = 1,
87
88 PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */
89};
90
91/*
92 * Special "software" counters provided by the kernel, even if the hardware
93 * does not support performance counters. These counters measure various
94 * physical and sw events of the kernel (and allow the profiling of them as
95 * well):
96 */
97enum perf_sw_ids {
98 PERF_COUNT_SW_CPU_CLOCK = 0,
99 PERF_COUNT_SW_TASK_CLOCK = 1,
100 PERF_COUNT_SW_PAGE_FAULTS = 2,
101 PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
102 PERF_COUNT_SW_CPU_MIGRATIONS = 4,
103 PERF_COUNT_SW_PAGE_FAULTS_MIN = 5,
104 PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6,
105
106 PERF_COUNT_SW_MAX, /* non-ABI */
107};
108
109/*
110 * Bits that can be set in attr.sample_type to request information
111 * in the overflow packets.
112 */
113enum perf_counter_sample_format {
114 PERF_SAMPLE_IP = 1U << 0,
115 PERF_SAMPLE_TID = 1U << 1,
116 PERF_SAMPLE_TIME = 1U << 2,
117 PERF_SAMPLE_ADDR = 1U << 3,
118 PERF_SAMPLE_GROUP = 1U << 4,
119 PERF_SAMPLE_CALLCHAIN = 1U << 5,
120 PERF_SAMPLE_ID = 1U << 6,
121 PERF_SAMPLE_CPU = 1U << 7,
122 PERF_SAMPLE_PERIOD = 1U << 8,
123};
124
125/*
126 * Bits that can be set in attr.read_format to request that
127 * reads on the counter should return the indicated quantities,
128 * in increasing order of bit value, after the counter value.
129 */
130enum perf_counter_read_format {
131 PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0,
132 PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
133 PERF_FORMAT_ID = 1U << 2,
134};
135
136/*
137 * Hardware event to monitor via a performance monitoring counter:
138 */
139struct perf_counter_attr {
140 /*
141 * Major type: hardware/software/tracepoint/etc.
142 */
143 __u32 type;
144 __u32 __reserved_1;
145
146 /*
147 * Type specific configuration information.
148 */
149 __u64 config;
150
151 union {
152 __u64 sample_period;
153 __u64 sample_freq;
154 };
155
156 __u64 sample_type;
157 __u64 read_format;
158
159 __u64 disabled : 1, /* off by default */
160 inherit : 1, /* children inherit it */
161 pinned : 1, /* must always be on PMU */
162 exclusive : 1, /* only group on PMU */
163 exclude_user : 1, /* don't count user */
164 exclude_kernel : 1, /* ditto kernel */
165 exclude_hv : 1, /* ditto hypervisor */
166 exclude_idle : 1, /* don't count when idle */
167 mmap : 1, /* include mmap data */
168 comm : 1, /* include comm data */
169 freq : 1, /* use freq, not period */
170
171 __reserved_2 : 53;
172
173 __u32 wakeup_events; /* wakeup every n events */
174 __u32 __reserved_3;
175
176 __u64 __reserved_4;
177};
178
179/*
180 * Ioctls that can be done on a perf counter fd:
181 */
182#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0)
183#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1)
184#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
185#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
186#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
187
188enum perf_counter_ioc_flags {
189 PERF_IOC_FLAG_GROUP = 1U << 0,
190};
191
192/*
193 * Structure of the page that can be mapped via mmap
194 */
195struct perf_counter_mmap_page {
196 __u32 version; /* version number of this structure */
197 __u32 compat_version; /* lowest version this is compat with */
198
199 /*
200 * Bits needed to read the hw counters in user-space.
201 *
202 * u32 seq;
203 * s64 count;
204 *
205 * do {
206 * seq = pc->lock;
207 *
208 * barrier();
209 * if (pc->index) {
210 * count = pmc_read(pc->index - 1);
211 * count += pc->offset;
212 * } else
213 * goto regular_read;
214 *
215 * barrier();
216 * } while (pc->lock != seq);
217 *
218 * NOTE: for obvious reasons this only works on self-monitoring
219 * processes.
220 */
221 __u32 lock; /* seqlock for synchronization */
222 __u32 index; /* hardware counter identifier */
223 __s64 offset; /* add to hardware counter value */
224
225 /*
226 * Control data for the mmap() data buffer.
227 *
228 * On SMP-capable platforms, user-space should issue an rmb() after
229 * reading this value -- see perf_counter_wakeup().
230 */
231 __u64 data_head; /* head in the data section */
232};
233
234#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
235#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0)
236#define PERF_EVENT_MISC_KERNEL (1 << 0)
237#define PERF_EVENT_MISC_USER (2 << 0)
238#define PERF_EVENT_MISC_HYPERVISOR (3 << 0)
239#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
240
241struct perf_event_header {
242 __u32 type;
243 __u16 misc;
244 __u16 size;
245};
246
247enum perf_event_type {
248
249 /*
250 * The MMAP events record the PROT_EXEC mappings so that we can
251 * correlate userspace IPs to code. They have the following structure:
252 *
253 * struct {
254 * struct perf_event_header header;
255 *
256 * u32 pid, tid;
257 * u64 addr;
258 * u64 len;
259 * u64 pgoff;
260 * char filename[];
261 * };
262 */
263 PERF_EVENT_MMAP = 1,
264
265 /*
266 * struct {
267 * struct perf_event_header header;
268 *
269 * u32 pid, tid;
270 * char comm[];
271 * };
272 */
273 PERF_EVENT_COMM = 3,
274
275 /*
276 * struct {
277 * struct perf_event_header header;
278 * u64 time;
279 * u64 id;
280 * u64 sample_period;
281 * };
282 */
283 PERF_EVENT_PERIOD = 4,
284
285 /*
286 * struct {
287 * struct perf_event_header header;
288 * u64 time;
289 * u64 id;
290 * };
291 */
292 PERF_EVENT_THROTTLE = 5,
293 PERF_EVENT_UNTHROTTLE = 6,
294
295 /*
296 * struct {
297 * struct perf_event_header header;
298 * u32 pid, ppid;
299 * };
300 */
301 PERF_EVENT_FORK = 7,
302
303 /*
304 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
305 * will be PERF_RECORD_*
306 *
307 * struct {
308 * struct perf_event_header header;
309 *
310 * { u64 ip; } && PERF_RECORD_IP
311 * { u32 pid, tid; } && PERF_RECORD_TID
312 * { u64 time; } && PERF_RECORD_TIME
313 * { u64 addr; } && PERF_RECORD_ADDR
314 * { u64 config; } && PERF_RECORD_CONFIG
315 * { u32 cpu, res; } && PERF_RECORD_CPU
316 *
317 * { u64 nr;
318 * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP
319 *
320 * { u16 nr,
321 * hv,
322 * kernel,
323 * user;
324 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
325 * };
326 */
327};
328
329#ifdef __KERNEL__
330/*
331 * Kernel-internal data types and definitions:
332 */
333
334#ifdef CONFIG_PERF_COUNTERS
335# include <asm/perf_counter.h>
336#endif
337
338#include <linux/list.h>
339#include <linux/mutex.h>
340#include <linux/rculist.h>
341#include <linux/rcupdate.h>
342#include <linux/spinlock.h>
343#include <linux/hrtimer.h>
344#include <linux/fs.h>
345#include <linux/pid_namespace.h>
346#include <asm/atomic.h>
347
348struct task_struct;
349
350/**
351 * struct hw_perf_counter - performance counter hardware details:
352 */
353struct hw_perf_counter {
354#ifdef CONFIG_PERF_COUNTERS
355 union {
356 struct { /* hardware */
357 u64 config;
358 unsigned long config_base;
359 unsigned long counter_base;
360 int idx;
361 };
362 union { /* software */
363 atomic64_t count;
364 struct hrtimer hrtimer;
365 };
366 };
367 atomic64_t prev_count;
368 u64 sample_period;
369 u64 last_period;
370 atomic64_t period_left;
371 u64 interrupts;
372
373 u64 freq_count;
374 u64 freq_interrupts;
375 u64 freq_stamp;
376#endif
377};
378
379struct perf_counter;
380
381/**
382 * struct pmu - generic performance monitoring unit
383 */
384struct pmu {
385 int (*enable) (struct perf_counter *counter);
386 void (*disable) (struct perf_counter *counter);
387 void (*read) (struct perf_counter *counter);
388 void (*unthrottle) (struct perf_counter *counter);
389};
390
391/**
392 * enum perf_counter_active_state - the states of a counter
393 */
394enum perf_counter_active_state {
395 PERF_COUNTER_STATE_ERROR = -2,
396 PERF_COUNTER_STATE_OFF = -1,
397 PERF_COUNTER_STATE_INACTIVE = 0,
398 PERF_COUNTER_STATE_ACTIVE = 1,
399};
400
401struct file;
402
403struct perf_mmap_data {
404 struct rcu_head rcu_head;
405 int nr_pages; /* nr of data pages */
406 int nr_locked; /* nr pages mlocked */
407
408 atomic_t poll; /* POLL_ for wakeups */
409 atomic_t events; /* event limit */
410
411 atomic_long_t head; /* write position */
412 atomic_long_t done_head; /* completed head */
413
414 atomic_t lock; /* concurrent writes */
415
416 atomic_t wakeup; /* needs a wakeup */
417
418 struct perf_counter_mmap_page *user_page;
419 void *data_pages[0];
420};
421
422struct perf_pending_entry {
423 struct perf_pending_entry *next;
424 void (*func)(struct perf_pending_entry *);
425};
426
427/**
428 * struct perf_counter - performance counter kernel representation:
429 */
430struct perf_counter {
431#ifdef CONFIG_PERF_COUNTERS
432 struct list_head list_entry;
433 struct list_head event_entry;
434 struct list_head sibling_list;
435 int nr_siblings;
436 struct perf_counter *group_leader;
437 const struct pmu *pmu;
438
439 enum perf_counter_active_state state;
440 atomic64_t count;
441
442 /*
443 * These are the total time in nanoseconds that the counter
444 * has been enabled (i.e. eligible to run, and the task has
445 * been scheduled in, if this is a per-task counter)
446 * and running (scheduled onto the CPU), respectively.
447 *
448 * They are computed from tstamp_enabled, tstamp_running and
449 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
450 */
451 u64 total_time_enabled;
452 u64 total_time_running;
453
454 /*
455 * These are timestamps used for computing total_time_enabled
456 * and total_time_running when the counter is in INACTIVE or
457 * ACTIVE state, measured in nanoseconds from an arbitrary point
458 * in time.
459 * tstamp_enabled: the notional time when the counter was enabled
460 * tstamp_running: the notional time when the counter was scheduled on
461 * tstamp_stopped: in INACTIVE state, the notional time when the
462 * counter was scheduled off.
463 */
464 u64 tstamp_enabled;
465 u64 tstamp_running;
466 u64 tstamp_stopped;
467
468 struct perf_counter_attr attr;
469 struct hw_perf_counter hw;
470
471 struct perf_counter_context *ctx;
472 struct file *filp;
473
474 /*
475 * These accumulate total time (in nanoseconds) that children
476 * counters have been enabled and running, respectively.
477 */
478 atomic64_t child_total_time_enabled;
479 atomic64_t child_total_time_running;
480
481 /*
482 * Protect attach/detach and child_list:
483 */
484 struct mutex child_mutex;
485 struct list_head child_list;
486 struct perf_counter *parent;
487
488 int oncpu;
489 int cpu;
490
491 struct list_head owner_entry;
492 struct task_struct *owner;
493
494 /* mmap bits */
495 struct mutex mmap_mutex;
496 atomic_t mmap_count;
497 struct perf_mmap_data *data;
498
499 /* poll related */
500 wait_queue_head_t waitq;
501 struct fasync_struct *fasync;
502
503 /* delayed work for NMIs and such */
504 int pending_wakeup;
505 int pending_kill;
506 int pending_disable;
507 struct perf_pending_entry pending;
508
509 atomic_t event_limit;
510
511 void (*destroy)(struct perf_counter *);
512 struct rcu_head rcu_head;
513
514 struct pid_namespace *ns;
515 u64 id;
516#endif
517};
518
519/**
520 * struct perf_counter_context - counter context structure
521 *
522 * Used as a container for task counters and CPU counters as well:
523 */
524struct perf_counter_context {
525 /*
526 * Protect the states of the counters in the list,
527 * nr_active, and the list:
528 */
529 spinlock_t lock;
530 /*
531 * Protect the list of counters. Locking either mutex or lock
532 * is sufficient to ensure the list doesn't change; to change
533 * the list you need to lock both the mutex and the spinlock.
534 */
535 struct mutex mutex;
536
537 struct list_head counter_list;
538 struct list_head event_list;
539 int nr_counters;
540 int nr_active;
541 int is_active;
542 atomic_t refcount;
543 struct task_struct *task;
544
545 /*
546 * Context clock, runs when context enabled.
547 */
548 u64 time;
549 u64 timestamp;
550
551 /*
552 * These fields let us detect when two contexts have both
553 * been cloned (inherited) from a common ancestor.
554 */
555 struct perf_counter_context *parent_ctx;
556 u64 parent_gen;
557 u64 generation;
558 int pin_count;
559 struct rcu_head rcu_head;
560};
561
562/**
563 * struct perf_cpu_context - per cpu counter context structure
564 */
565struct perf_cpu_context {
566 struct perf_counter_context ctx;
567 struct perf_counter_context *task_ctx;
568 int active_oncpu;
569 int max_pertask;
570 int exclusive;
571
572 /*
573 * Recursion avoidance:
574 *
575 * task, softirq, irq, nmi context
576 */
577 int recursion[4];
578};
579
580#ifdef CONFIG_PERF_COUNTERS
581
582/*
583 * Set by architecture code:
584 */
585extern int perf_max_counters;
586
587extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
588
589extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
590extern void perf_counter_task_sched_out(struct task_struct *task,
591 struct task_struct *next, int cpu);
592extern void perf_counter_task_tick(struct task_struct *task, int cpu);
593extern int perf_counter_init_task(struct task_struct *child);
594extern void perf_counter_exit_task(struct task_struct *child);
595extern void perf_counter_free_task(struct task_struct *task);
596extern void perf_counter_do_pending(void);
597extern void perf_counter_print_debug(void);
598extern void __perf_disable(void);
599extern bool __perf_enable(void);
600extern void perf_disable(void);
601extern void perf_enable(void);
602extern int perf_counter_task_disable(void);
603extern int perf_counter_task_enable(void);
604extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
605 struct perf_cpu_context *cpuctx,
606 struct perf_counter_context *ctx, int cpu);
607extern void perf_counter_update_userpage(struct perf_counter *counter);
608
609struct perf_sample_data {
610 struct pt_regs *regs;
611 u64 addr;
612 u64 period;
613};
614
615extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
616 struct perf_sample_data *data);
617
618/*
619 * Return 1 for a software counter, 0 for a hardware counter
620 */
621static inline int is_software_counter(struct perf_counter *counter)
622{
623 return (counter->attr.type != PERF_TYPE_RAW) &&
624 (counter->attr.type != PERF_TYPE_HARDWARE);
625}
626
627extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
628
629extern void __perf_counter_mmap(struct vm_area_struct *vma);
630
631static inline void perf_counter_mmap(struct vm_area_struct *vma)
632{
633 if (vma->vm_flags & VM_EXEC)
634 __perf_counter_mmap(vma);
635}
636
637extern void perf_counter_comm(struct task_struct *tsk);
638extern void perf_counter_fork(struct task_struct *tsk);
639
640extern void perf_counter_task_migration(struct task_struct *task, int cpu);
641
642#define MAX_STACK_DEPTH 255
643
644struct perf_callchain_entry {
645 u16 nr;
646 u16 hv;
647 u16 kernel;
648 u16 user;
649 u64 ip[MAX_STACK_DEPTH];
650};
651
652extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
653
654extern int sysctl_perf_counter_paranoid;
655extern int sysctl_perf_counter_mlock;
656extern int sysctl_perf_counter_sample_rate;
657
658extern void perf_counter_init(void);
659
660#ifndef perf_misc_flags
661#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \
662 PERF_EVENT_MISC_KERNEL)
663#define perf_instruction_pointer(regs) instruction_pointer(regs)
664#endif
665
666#else
667static inline void
668perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
669static inline void
670perf_counter_task_sched_out(struct task_struct *task,
671 struct task_struct *next, int cpu) { }
672static inline void
673perf_counter_task_tick(struct task_struct *task, int cpu) { }
674static inline int perf_counter_init_task(struct task_struct *child) { return 0; }
675static inline void perf_counter_exit_task(struct task_struct *child) { }
676static inline void perf_counter_free_task(struct task_struct *task) { }
677static inline void perf_counter_do_pending(void) { }
678static inline void perf_counter_print_debug(void) { }
679static inline void perf_disable(void) { }
680static inline void perf_enable(void) { }
681static inline int perf_counter_task_disable(void) { return -EINVAL; }
682static inline int perf_counter_task_enable(void) { return -EINVAL; }
683
684static inline void
685perf_swcounter_event(u32 event, u64 nr, int nmi,
686 struct pt_regs *regs, u64 addr) { }
687
688static inline void perf_counter_mmap(struct vm_area_struct *vma) { }
689static inline void perf_counter_comm(struct task_struct *tsk) { }
690static inline void perf_counter_fork(struct task_struct *tsk) { }
691static inline void perf_counter_init(void) { }
692static inline void perf_counter_task_migration(struct task_struct *task,
693 int cpu) { }
694#endif
695
696#endif /* __KERNEL__ */
697#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 42bf2766111e..4896fdfec913 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,6 +99,7 @@ struct robust_list_head;
99struct bio; 99struct bio;
100struct fs_struct; 100struct fs_struct;
101struct bts_context; 101struct bts_context;
102struct perf_counter_context;
102 103
103/* 104/*
104 * List of flags we want to share for kernel threads, 105 * List of flags we want to share for kernel threads,
@@ -139,6 +140,7 @@ extern unsigned long nr_running(void);
139extern unsigned long nr_uninterruptible(void); 140extern unsigned long nr_uninterruptible(void);
140extern unsigned long nr_iowait(void); 141extern unsigned long nr_iowait(void);
141extern void calc_global_load(void); 142extern void calc_global_load(void);
143extern u64 cpu_nr_migrations(int cpu);
142 144
143extern unsigned long get_parent_ip(unsigned long addr); 145extern unsigned long get_parent_ip(unsigned long addr);
144 146
@@ -674,6 +676,10 @@ struct user_struct {
674 struct work_struct work; 676 struct work_struct work;
675#endif 677#endif
676#endif 678#endif
679
680#ifdef CONFIG_PERF_COUNTERS
681 atomic_long_t locked_vm;
682#endif
677}; 683};
678 684
679extern int uids_sysfs_init(void); 685extern int uids_sysfs_init(void);
@@ -1073,9 +1079,10 @@ struct sched_entity {
1073 u64 last_wakeup; 1079 u64 last_wakeup;
1074 u64 avg_overlap; 1080 u64 avg_overlap;
1075 1081
1082 u64 nr_migrations;
1083
1076 u64 start_runtime; 1084 u64 start_runtime;
1077 u64 avg_wakeup; 1085 u64 avg_wakeup;
1078 u64 nr_migrations;
1079 1086
1080#ifdef CONFIG_SCHEDSTATS 1087#ifdef CONFIG_SCHEDSTATS
1081 u64 wait_start; 1088 u64 wait_start;
@@ -1396,6 +1403,11 @@ struct task_struct {
1396 struct list_head pi_state_list; 1403 struct list_head pi_state_list;
1397 struct futex_pi_state *pi_state_cache; 1404 struct futex_pi_state *pi_state_cache;
1398#endif 1405#endif
1406#ifdef CONFIG_PERF_COUNTERS
1407 struct perf_counter_context *perf_counter_ctxp;
1408 struct mutex perf_counter_mutex;
1409 struct list_head perf_counter_list;
1410#endif
1399#ifdef CONFIG_NUMA 1411#ifdef CONFIG_NUMA
1400 struct mempolicy *mempolicy; 1412 struct mempolicy *mempolicy;
1401 short il_next; 1413 short il_next;
@@ -2410,6 +2422,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2410#define TASK_SIZE_OF(tsk) TASK_SIZE 2422#define TASK_SIZE_OF(tsk) TASK_SIZE
2411#endif 2423#endif
2412 2424
2425/*
2426 * Call the function if the target task is executing on a CPU right now:
2427 */
2428extern void task_oncpu_function_call(struct task_struct *p,
2429 void (*func) (void *info), void *info);
2430
2431
2413#ifdef CONFIG_MM_OWNER 2432#ifdef CONFIG_MM_OWNER
2414extern void mm_update_next_owner(struct mm_struct *mm); 2433extern void mm_update_next_owner(struct mm_struct *mm);
2415extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2434extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 24c5602bee99..48803064cedf 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -62,6 +62,8 @@
62# define SLAB_DEBUG_OBJECTS 0x00000000UL 62# define SLAB_DEBUG_OBJECTS 0x00000000UL
63#endif 63#endif
64 64
65#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */
66
65/* The following flags affect the page allocator grouping pages by mobility */ 67/* The following flags affect the page allocator grouping pages by mobility */
66#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 68#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
67#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 69#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 30520844b8da..c6c84ad8bd71 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_attr;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -755,4 +756,8 @@ asmlinkage long sys_pipe(int __user *);
755 756
756int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 757int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
757 758
759
760asmlinkage long sys_perf_counter_open(
761 const struct perf_counter_attr __user *attr_uptr,
762 pid_t pid, int cpu, int group_fd, unsigned long flags);
758#endif 763#endif
diff --git a/init/Kconfig b/init/Kconfig
index 5de1c17c51ed..c649657e2259 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -934,6 +934,40 @@ config AIO
934 by some high performance threaded applications. Disabling 934 by some high performance threaded applications. Disabling
935 this option saves about 7k. 935 this option saves about 7k.
936 936
937config HAVE_PERF_COUNTERS
938 bool
939
940menu "Performance Counters"
941
942config PERF_COUNTERS
943 bool "Kernel Performance Counters"
944 depends on HAVE_PERF_COUNTERS
945 select ANON_INODES
946 help
947 Enable kernel support for performance counter hardware.
948
949 Performance counters are special hardware registers available
950 on most modern CPUs. These registers count the number of certain
951 types of hw events: such as instructions executed, cache misses
952 suffered, or branches mis-predicted - without slowing down the
953 kernel or applications. These registers can also trigger interrupts
954 when a threshold number of events have passed - and can thus be
955 used to profile the code that runs on that CPU.
956
957 The Linux Performance Counter subsystem provides an abstraction of
958 these hardware capabilities, available via a system call. It
959 provides per task and per CPU counters, and it provides event
960 capabilities on top of those.
961
962 Say Y if unsure.
963
964config EVENT_PROFILE
965 bool "Tracepoint profile sources"
966 depends on PERF_COUNTERS && EVENT_TRACER
967 default y
968
969endmenu
970
937config VM_EVENT_COUNTERS 971config VM_EVENT_COUNTERS
938 default y 972 default y
939 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 973 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/init/main.c b/init/main.c
index bb7dc57eee36..5616661eac01 100644
--- a/init/main.c
+++ b/init/main.c
@@ -56,6 +56,7 @@
56#include <linux/debug_locks.h> 56#include <linux/debug_locks.h>
57#include <linux/debugobjects.h> 57#include <linux/debugobjects.h>
58#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/kmemleak.h>
59#include <linux/pid_namespace.h> 60#include <linux/pid_namespace.h>
60#include <linux/device.h> 61#include <linux/device.h>
61#include <linux/kthread.h> 62#include <linux/kthread.h>
@@ -533,6 +534,16 @@ void __init __weak thread_info_cache_init(void)
533{ 534{
534} 535}
535 536
537/*
538 * Set up kernel memory allocators
539 */
540static void __init mm_init(void)
541{
542 mem_init();
543 kmem_cache_init();
544 vmalloc_init();
545}
546
536asmlinkage void __init start_kernel(void) 547asmlinkage void __init start_kernel(void)
537{ 548{
538 char * command_line; 549 char * command_line;
@@ -574,6 +585,23 @@ asmlinkage void __init start_kernel(void)
574 setup_nr_cpu_ids(); 585 setup_nr_cpu_ids();
575 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 586 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
576 587
588 build_all_zonelists();
589 page_alloc_init();
590
591 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
592 parse_early_param();
593 parse_args("Booting kernel", static_command_line, __start___param,
594 __stop___param - __start___param,
595 &unknown_bootoption);
596 /*
597 * These use large bootmem allocations and must precede
598 * kmem_cache_init()
599 */
600 pidhash_init();
601 vfs_caches_init_early();
602 sort_main_extable();
603 trap_init();
604 mm_init();
577 /* 605 /*
578 * Set up the scheduler prior starting any interrupts (such as the 606 * Set up the scheduler prior starting any interrupts (such as the
579 * timer interrupt). Full topology setup happens at smp_init() 607 * timer interrupt). Full topology setup happens at smp_init()
@@ -585,25 +613,16 @@ asmlinkage void __init start_kernel(void)
585 * fragile until we cpu_idle() for the first time. 613 * fragile until we cpu_idle() for the first time.
586 */ 614 */
587 preempt_disable(); 615 preempt_disable();
588 build_all_zonelists();
589 page_alloc_init();
590 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
591 parse_early_param();
592 parse_args("Booting kernel", static_command_line, __start___param,
593 __stop___param - __start___param,
594 &unknown_bootoption);
595 if (!irqs_disabled()) { 616 if (!irqs_disabled()) {
596 printk(KERN_WARNING "start_kernel(): bug: interrupts were " 617 printk(KERN_WARNING "start_kernel(): bug: interrupts were "
597 "enabled *very* early, fixing it\n"); 618 "enabled *very* early, fixing it\n");
598 local_irq_disable(); 619 local_irq_disable();
599 } 620 }
600 sort_main_extable();
601 trap_init();
602 rcu_init(); 621 rcu_init();
603 /* init some links before init_ISA_irqs() */ 622 /* init some links before init_ISA_irqs() */
604 early_irq_init(); 623 early_irq_init();
605 init_IRQ(); 624 init_IRQ();
606 pidhash_init(); 625 prio_tree_init();
607 init_timers(); 626 init_timers();
608 hrtimers_init(); 627 hrtimers_init();
609 softirq_init(); 628 softirq_init();
@@ -645,15 +664,12 @@ asmlinkage void __init start_kernel(void)
645 initrd_start = 0; 664 initrd_start = 0;
646 } 665 }
647#endif 666#endif
648 vmalloc_init();
649 vfs_caches_init_early();
650 cpuset_init_early(); 667 cpuset_init_early();
651 page_cgroup_init(); 668 page_cgroup_init();
652 mem_init();
653 enable_debug_pagealloc(); 669 enable_debug_pagealloc();
654 cpu_hotplug_init(); 670 cpu_hotplug_init();
655 kmem_cache_init();
656 kmemtrace_init(); 671 kmemtrace_init();
672 kmemleak_init();
657 debug_objects_mem_init(); 673 debug_objects_mem_init();
658 idr_init_cache(); 674 idr_init_cache();
659 setup_per_cpu_pageset(); 675 setup_per_cpu_pageset();
@@ -663,7 +679,6 @@ asmlinkage void __init start_kernel(void)
663 calibrate_delay(); 679 calibrate_delay();
664 pidmap_init(); 680 pidmap_init();
665 pgtable_cache_init(); 681 pgtable_cache_init();
666 prio_tree_init();
667 anon_vma_init(); 682 anon_vma_init();
668#ifdef CONFIG_X86 683#ifdef CONFIG_X86
669 if (efi_enabled) 684 if (efi_enabled)
diff --git a/kernel/Makefile b/kernel/Makefile
index a35eee3436de..90b53f6dc226 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 96obj-$(CONFIG_X86_DS) += trace/
97obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
98obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
99 100
100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..d5a7e17474ee 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {
1857 1857
1858int __init cpuset_init_early(void) 1858int __init cpuset_init_early(void)
1859{ 1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); 1860 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1861 1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++; 1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0; 1863 return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 51d1fe3fb7ad..b6c90b5ef509 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 49#include <linux/fs_struct.h>
50#include <linux/init_task.h> 50#include <linux/init_task.h>
51#include <linux/perf_counter.h>
51#include <trace/events/sched.h> 52#include <trace/events/sched.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
@@ -154,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 155{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 157
158#ifdef CONFIG_PERF_COUNTERS
159 WARN_ON_ONCE(tsk->perf_counter_ctxp);
160#endif
157 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
158 put_task_struct(tsk); 162 put_task_struct(tsk);
159} 163}
@@ -170,6 +174,7 @@ repeat:
170 atomic_dec(&__task_cred(p)->user->processes); 174 atomic_dec(&__task_cred(p)->user->processes);
171 175
172 proc_flush_task(p); 176 proc_flush_task(p);
177
173 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
174 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
175 __exit_signal(p); 180 __exit_signal(p);
@@ -971,16 +976,19 @@ NORET_TYPE void do_exit(long code)
971 module_put(tsk->binfmt->module); 976 module_put(tsk->binfmt->module);
972 977
973 proc_exit_connector(tsk); 978 proc_exit_connector(tsk);
979
980 /*
981 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications.
983 */
984 perf_counter_exit_task(tsk);
985
974 exit_notify(tsk, group_dead); 986 exit_notify(tsk, group_dead);
975#ifdef CONFIG_NUMA 987#ifdef CONFIG_NUMA
976 mpol_put(tsk->mempolicy); 988 mpol_put(tsk->mempolicy);
977 tsk->mempolicy = NULL; 989 tsk->mempolicy = NULL;
978#endif 990#endif
979#ifdef CONFIG_FUTEX 991#ifdef CONFIG_FUTEX
980 /*
981 * This must happen late, after the PID is not
982 * hashed anymore:
983 */
984 if (unlikely(!list_empty(&tsk->pi_state_list))) 992 if (unlikely(!list_empty(&tsk->pi_state_list)))
985 exit_pi_state_list(tsk); 993 exit_pi_state_list(tsk);
986 if (unlikely(current->pi_state_cache)) 994 if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index bb762b4dd217..4430eb1376f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -62,6 +62,7 @@
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h>
65 66
66#include <asm/pgtable.h> 67#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -1096,6 +1097,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1096 /* Perform scheduler related setup. Assign this task to a CPU. */ 1097 /* Perform scheduler related setup. Assign this task to a CPU. */
1097 sched_fork(p, clone_flags); 1098 sched_fork(p, clone_flags);
1098 1099
1100 retval = perf_counter_init_task(p);
1101 if (retval)
1102 goto bad_fork_cleanup_policy;
1103
1099 if ((retval = audit_alloc(p))) 1104 if ((retval = audit_alloc(p)))
1100 goto bad_fork_cleanup_policy; 1105 goto bad_fork_cleanup_policy;
1101 /* copy all the process information */ 1106 /* copy all the process information */
@@ -1290,6 +1295,7 @@ bad_fork_cleanup_semundo:
1290bad_fork_cleanup_audit: 1295bad_fork_cleanup_audit:
1291 audit_free(p); 1296 audit_free(p);
1292bad_fork_cleanup_policy: 1297bad_fork_cleanup_policy:
1298 perf_counter_free_task(p);
1293#ifdef CONFIG_NUMA 1299#ifdef CONFIG_NUMA
1294 mpol_put(p->mempolicy); 1300 mpol_put(p->mempolicy);
1295bad_fork_cleanup_cgroup: 1301bad_fork_cleanup_cgroup:
@@ -1403,6 +1409,12 @@ long do_fork(unsigned long clone_flags,
1403 if (clone_flags & CLONE_VFORK) { 1409 if (clone_flags & CLONE_VFORK) {
1404 p->vfork_done = &vfork; 1410 p->vfork_done = &vfork;
1405 init_completion(&vfork); 1411 init_completion(&vfork);
1412 } else if (!(clone_flags & CLONE_VM)) {
1413 /*
1414 * vfork will do an exec which will call
1415 * set_task_comm()
1416 */
1417 perf_counter_fork(p);
1406 } 1418 }
1407 1419
1408 audit_finish_fork(p); 1420 audit_finish_fork(p);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a60018402f42..104578541230 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -150,6 +150,7 @@ int __init early_irq_init(void)
150{ 150{
151 struct irq_desc *desc; 151 struct irq_desc *desc;
152 int legacy_count; 152 int legacy_count;
153 int node;
153 int i; 154 int i;
154 155
155 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -160,20 +161,20 @@ int __init early_irq_init(void)
160 161
161 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
162 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
163 165
164 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
165 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
166 168
167 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
168 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
169 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
170 sizeof(int));
171 172
172 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
173 desc[i].irq = i; 174 desc[i].irq = i;
174 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
175 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
176 alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
177 init_desc_masks(&desc[i]); 178 init_desc_masks(&desc[i]);
178 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
179 } 180 }
diff --git a/kernel/module.c b/kernel/module.c
index 278e9b6762bb..35f7de00bf0d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,6 +53,7 @@
53#include <linux/ftrace.h> 53#include <linux/ftrace.h>
54#include <linux/async.h> 54#include <linux/async.h>
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
56 57
57#if 0 58#if 0
58#define DEBUGP printk 59#define DEBUGP printk
@@ -433,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
433 unsigned long extra; 434 unsigned long extra;
434 unsigned int i; 435 unsigned int i;
435 void *ptr; 436 void *ptr;
437 int cpu;
436 438
437 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
438 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -462,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
462 if (!split_block(i, size)) 464 if (!split_block(i, size))
463 return NULL; 465 return NULL;
464 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
465 /* Mark allocated */ 472 /* Mark allocated */
466 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
467 return ptr; 474 return ptr;
@@ -476,6 +483,7 @@ static void percpu_modfree(void *freeme)
476{ 483{
477 unsigned int i; 484 unsigned int i;
478 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
479 487
480 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
481 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -487,6 +495,10 @@ static void percpu_modfree(void *freeme)
487 BUG(); 495 BUG();
488 496
489 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
490 /* Merge with previous? */ 502 /* Merge with previous? */
491 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
492 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -1879,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1879 return ret; 1891 return ret;
1880} 1892}
1881 1893
1894#ifdef CONFIG_DEBUG_KMEMLEAK
1895static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1896 Elf_Shdr *sechdrs, char *secstrings)
1897{
1898 unsigned int i;
1899
1900 /* only scan the sections containing data */
1901 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1902 (unsigned long)mod->module_core,
1903 sizeof(struct module), GFP_KERNEL);
1904
1905 for (i = 1; i < hdr->e_shnum; i++) {
1906 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1907 continue;
1908 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1909 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1910 continue;
1911
1912 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1913 (unsigned long)mod->module_core,
1914 sechdrs[i].sh_size, GFP_KERNEL);
1915 }
1916}
1917#else
1918static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1919 Elf_Shdr *sechdrs, char *secstrings)
1920{
1921}
1922#endif
1923
1882/* Allocate and load the module: note that size of section 0 is always 1924/* Allocate and load the module: note that size of section 0 is always
1883 zero, and we rely on this for optional sections. */ 1925 zero, and we rely on this for optional sections. */
1884static noinline struct module *load_module(void __user *umod, 1926static noinline struct module *load_module(void __user *umod,
@@ -2049,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod,
2049 2091
2050 /* Do the allocs. */ 2092 /* Do the allocs. */
2051 ptr = module_alloc_update_bounds(mod->core_size); 2093 ptr = module_alloc_update_bounds(mod->core_size);
2094 /*
2095 * The pointer to this block is stored in the module structure
2096 * which is inside the block. Just mark it as not being a
2097 * leak.
2098 */
2099 kmemleak_not_leak(ptr);
2052 if (!ptr) { 2100 if (!ptr) {
2053 err = -ENOMEM; 2101 err = -ENOMEM;
2054 goto free_percpu; 2102 goto free_percpu;
@@ -2057,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod,
2057 mod->module_core = ptr; 2105 mod->module_core = ptr;
2058 2106
2059 ptr = module_alloc_update_bounds(mod->init_size); 2107 ptr = module_alloc_update_bounds(mod->init_size);
2108 /*
2109 * The pointer to this block is stored in the module structure
2110 * which is inside the block. This block doesn't need to be
2111 * scanned as it contains data and code that will be freed
2112 * after the module is initialized.
2113 */
2114 kmemleak_ignore(ptr);
2060 if (!ptr && mod->init_size) { 2115 if (!ptr && mod->init_size) {
2061 err = -ENOMEM; 2116 err = -ENOMEM;
2062 goto free_core; 2117 goto free_core;
@@ -2087,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod,
2087 } 2142 }
2088 /* Module has been moved. */ 2143 /* Module has been moved. */
2089 mod = (void *)sechdrs[modindex].sh_addr; 2144 mod = (void *)sechdrs[modindex].sh_addr;
2145 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2090 2146
2091#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2147#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2092 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2148 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e5cc0cd28d54..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..ef5d8a5b2453
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4260 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45
46/*
47 * perf counter paranoia level:
48 * 0 - not paranoid
49 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv
51 */
52int sysctl_perf_counter_paranoid __read_mostly;
53
54static inline bool perf_paranoid_cpu(void)
55{
56 return sysctl_perf_counter_paranoid > 0;
57}
58
59static inline bool perf_paranoid_kernel(void)
60{
61 return sysctl_perf_counter_paranoid > 1;
62}
63
64int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66/*
67 * max perf counter sample rate
68 */
69int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71static atomic64_t perf_counter_id;
72
73/*
74 * Lock for (sysadmin-configurable) counter reservations:
75 */
76static DEFINE_SPINLOCK(perf_resource_lock);
77
78/*
79 * Architecture provided APIs - weak aliases:
80 */
81extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82{
83 return NULL;
84}
85
86void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); }
88
89void __weak hw_perf_counter_setup(int cpu) { barrier(); }
90
91int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader,
93 struct perf_cpu_context *cpuctx,
94 struct perf_counter_context *ctx, int cpu)
95{
96 return 0;
97}
98
99void __weak perf_counter_print_debug(void) { }
100
101static DEFINE_PER_CPU(int, disable_count);
102
103void __perf_disable(void)
104{
105 __get_cpu_var(disable_count)++;
106}
107
108bool __perf_enable(void)
109{
110 return !--__get_cpu_var(disable_count);
111}
112
/*
 * Bump the per-CPU disable count, then call hw_perf_disable().
 * The hardware hook is invoked on every call, nested or not.
 */
void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}
118
/*
 * Drop one disable level; hw_perf_enable() is called only once the
 * per-CPU disable count has reached zero.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;

	hw_perf_enable();
}
124
125static void get_ctx(struct perf_counter_context *ctx)
126{
127 atomic_inc(&ctx->refcount);
128}
129
130static void free_ctx(struct rcu_head *head)
131{
132 struct perf_counter_context *ctx;
133
134 ctx = container_of(head, struct perf_counter_context, rcu_head);
135 kfree(ctx);
136}
137
138static void put_ctx(struct perf_counter_context *ctx)
139{
140 if (atomic_dec_and_test(&ctx->refcount)) {
141 if (ctx->parent_ctx)
142 put_ctx(ctx->parent_ctx);
143 if (ctx->task)
144 put_task_struct(ctx->task);
145 call_rcu(&ctx->rcu_head, free_ctx);
146 }
147}
148
149/*
150 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with with the fact that until it is locked,
152 * the context could get moved to another task.
153 */
154static struct perf_counter_context *
155perf_lock_task_context(struct task_struct *task, unsigned long *flags)
156{
157 struct perf_counter_context *ctx;
158
159 rcu_read_lock();
160 retry:
161 ctx = rcu_dereference(task->perf_counter_ctxp);
162 if (ctx) {
163 /*
164 * If this context is a clone of another, it might
165 * get swapped for another underneath us by
166 * perf_counter_task_sched_out, though the
167 * rcu_read_lock() protects us from any context
168 * getting freed. Lock the context and check if it
169 * got swapped before we could get the lock, and retry
170 * if so. If we locked the right context, then it
171 * can't get swapped on us any more.
172 */
173 spin_lock_irqsave(&ctx->lock, *flags);
174 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry;
177 }
178 }
179 rcu_read_unlock();
180 return ctx;
181}
182
183/*
184 * Get the context for a task and increment its pin_count so it
185 * can't get swapped to another task. This also increments its
186 * reference count so that the context can't get freed.
187 */
188static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
189{
190 struct perf_counter_context *ctx;
191 unsigned long flags;
192
193 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) {
195 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags);
198 }
199 return ctx;
200}
201
202static void perf_unpin_context(struct perf_counter_context *ctx)
203{
204 unsigned long flags;
205
206 spin_lock_irqsave(&ctx->lock, flags);
207 --ctx->pin_count;
208 spin_unlock_irqrestore(&ctx->lock, flags);
209 put_ctx(ctx);
210}
211
212/*
213 * Add a counter from the lists for its context.
214 * Must be called with ctx->mutex and ctx->lock held.
215 */
216static void
217list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
218{
219 struct perf_counter *group_leader = counter->group_leader;
220
221 /*
222 * Depending on whether it is a standalone or sibling counter,
223 * add it straight to the context's counter list, or to the group
224 * leader's sibling list:
225 */
226 if (group_leader == counter)
227 list_add_tail(&counter->list_entry, &ctx->counter_list);
228 else {
229 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
230 group_leader->nr_siblings++;
231 }
232
233 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++;
235}
236
237/*
238 * Remove a counter from the lists for its context.
239 * Must be called with ctx->mutex and ctx->lock held.
240 */
241static void
242list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
243{
244 struct perf_counter *sibling, *tmp;
245
246 if (list_empty(&counter->list_entry))
247 return;
248 ctx->nr_counters--;
249
250 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry);
252
253 if (counter->group_leader != counter)
254 counter->group_leader->nr_siblings--;
255
256 /*
257 * If this was a group counter with sibling counters then
258 * upgrade the siblings to singleton counters by adding them
259 * to the context list directly:
260 */
261 list_for_each_entry_safe(sibling, tmp,
262 &counter->sibling_list, list_entry) {
263
264 list_move_tail(&sibling->list_entry, &ctx->counter_list);
265 sibling->group_leader = sibling;
266 }
267}
268
269static void
270counter_sched_out(struct perf_counter *counter,
271 struct perf_cpu_context *cpuctx,
272 struct perf_counter_context *ctx)
273{
274 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
275 return;
276
277 counter->state = PERF_COUNTER_STATE_INACTIVE;
278 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter);
280 counter->oncpu = -1;
281
282 if (!is_software_counter(counter))
283 cpuctx->active_oncpu--;
284 ctx->nr_active--;
285 if (counter->attr.exclusive || !cpuctx->active_oncpu)
286 cpuctx->exclusive = 0;
287}
288
289static void
290group_sched_out(struct perf_counter *group_counter,
291 struct perf_cpu_context *cpuctx,
292 struct perf_counter_context *ctx)
293{
294 struct perf_counter *counter;
295
296 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
297 return;
298
299 counter_sched_out(group_counter, cpuctx, ctx);
300
301 /*
302 * Schedule out siblings (if any):
303 */
304 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
305 counter_sched_out(counter, cpuctx, ctx);
306
307 if (group_counter->attr.exclusive)
308 cpuctx->exclusive = 0;
309}
310
311/*
312 * Cross CPU call to remove a performance counter
313 *
314 * We disable the counter on the hardware level first. After that we
315 * remove it from the context list.
316 */
317static void __perf_counter_remove_from_context(void *info)
318{
319 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
320 struct perf_counter *counter = info;
321 struct perf_counter_context *ctx = counter->ctx;
322
323 /*
324 * If this is a task context, we need to check whether it is
325 * the current task context of this cpu. If not it has been
326 * scheduled out before the smp call arrived.
327 */
328 if (ctx->task && cpuctx->task_ctx != ctx)
329 return;
330
331 spin_lock(&ctx->lock);
332 /*
333 * Protect the list operation against NMI by disabling the
334 * counters on a global level.
335 */
336 perf_disable();
337
338 counter_sched_out(counter, cpuctx, ctx);
339
340 list_del_counter(counter, ctx);
341
342 if (!ctx->task) {
343 /*
344 * Allow more per task counters with respect to the
345 * reservation:
346 */
347 cpuctx->max_pertask =
348 min(perf_max_counters - ctx->nr_counters,
349 perf_max_counters - perf_reserved_percpu);
350 }
351
352 perf_enable();
353 spin_unlock(&ctx->lock);
354}
355
356
357/*
358 * Remove the counter from a task's (or a CPU's) list of counters.
359 *
360 * Must be called with ctx->mutex held.
361 *
362 * CPU counters are removed with a smp call. For task counters we only
363 * call when the task is on a CPU.
364 *
365 * If counter->ctx is a cloned context, callers must make sure that
366 * every task struct that counter->ctx->task could possibly point to
367 * remains valid. This is OK when called from perf_release since
368 * that only calls us on the top-level context, which can't be a clone.
369 * When called from perf_counter_exit_task, it's OK because the
370 * context has been detached from its task.
371 */
372static void perf_counter_remove_from_context(struct perf_counter *counter)
373{
374 struct perf_counter_context *ctx = counter->ctx;
375 struct task_struct *task = ctx->task;
376
377 if (!task) {
378 /*
379 * Per cpu counters are removed via an smp call and
380 * the removal is always sucessful.
381 */
382 smp_call_function_single(counter->cpu,
383 __perf_counter_remove_from_context,
384 counter, 1);
385 return;
386 }
387
388retry:
389 task_oncpu_function_call(task, __perf_counter_remove_from_context,
390 counter);
391
392 spin_lock_irq(&ctx->lock);
393 /*
394 * If the context is active we need to retry the smp call.
395 */
396 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
397 spin_unlock_irq(&ctx->lock);
398 goto retry;
399 }
400
401 /*
402 * The lock prevents that this context is scheduled in so we
403 * can remove the counter safely, if the call above did not
404 * succeed.
405 */
406 if (!list_empty(&counter->list_entry)) {
407 list_del_counter(counter, ctx);
408 }
409 spin_unlock_irq(&ctx->lock);
410}
411
412static inline u64 perf_clock(void)
413{
414 return cpu_clock(smp_processor_id());
415}
416
417/*
418 * Update the record of the current time in a context.
419 */
420static void update_context_time(struct perf_counter_context *ctx)
421{
422 u64 now = perf_clock();
423
424 ctx->time += now - ctx->timestamp;
425 ctx->timestamp = now;
426}
427
428/*
429 * Update the total_time_enabled and total_time_running fields for a counter.
430 */
431static void update_counter_times(struct perf_counter *counter)
432{
433 struct perf_counter_context *ctx = counter->ctx;
434 u64 run_end;
435
436 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
437 return;
438
439 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
440
441 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
442 run_end = counter->tstamp_stopped;
443 else
444 run_end = ctx->time;
445
446 counter->total_time_running = run_end - counter->tstamp_running;
447}
448
449/*
450 * Update total_time_enabled and total_time_running for all counters in a group.
451 */
452static void update_group_times(struct perf_counter *leader)
453{
454 struct perf_counter *counter;
455
456 update_counter_times(leader);
457 list_for_each_entry(counter, &leader->sibling_list, list_entry)
458 update_counter_times(counter);
459}
460
461/*
462 * Cross CPU call to disable a performance counter
463 */
464static void __perf_counter_disable(void *info)
465{
466 struct perf_counter *counter = info;
467 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
468 struct perf_counter_context *ctx = counter->ctx;
469
470 /*
471 * If this is a per-task counter, need to check whether this
472 * counter's task is the current task on this cpu.
473 */
474 if (ctx->task && cpuctx->task_ctx != ctx)
475 return;
476
477 spin_lock(&ctx->lock);
478
479 /*
480 * If the counter is on, turn it off.
481 * If it is in error state, leave it in error state.
482 */
483 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
484 update_context_time(ctx);
485 update_counter_times(counter);
486 if (counter == counter->group_leader)
487 group_sched_out(counter, cpuctx, ctx);
488 else
489 counter_sched_out(counter, cpuctx, ctx);
490 counter->state = PERF_COUNTER_STATE_OFF;
491 }
492
493 spin_unlock(&ctx->lock);
494}
495
496/*
497 * Disable a counter.
498 *
499 * If counter->ctx is a cloned context, callers must make sure that
500 * every task struct that counter->ctx->task could possibly point to
501 * remains valid. This condition is satisifed when called through
502 * perf_counter_for_each_child or perf_counter_for_each because they
503 * hold the top-level counter's child_mutex, so any descendant that
504 * goes to exit will block in sync_child_counter.
505 * When called from perf_pending_counter it's OK because counter->ctx
506 * is the current context on this CPU and preemption is disabled,
507 * hence we can't get into perf_counter_task_sched_out for this context.
508 */
509static void perf_counter_disable(struct perf_counter *counter)
510{
511 struct perf_counter_context *ctx = counter->ctx;
512 struct task_struct *task = ctx->task;
513
514 if (!task) {
515 /*
516 * Disable the counter on the cpu that it's on
517 */
518 smp_call_function_single(counter->cpu, __perf_counter_disable,
519 counter, 1);
520 return;
521 }
522
523 retry:
524 task_oncpu_function_call(task, __perf_counter_disable, counter);
525
526 spin_lock_irq(&ctx->lock);
527 /*
528 * If the counter is still active, we need to retry the cross-call.
529 */
530 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
531 spin_unlock_irq(&ctx->lock);
532 goto retry;
533 }
534
535 /*
536 * Since we have the lock this context can't be scheduled
537 * in, so we can change the state safely.
538 */
539 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
540 update_counter_times(counter);
541 counter->state = PERF_COUNTER_STATE_OFF;
542 }
543
544 spin_unlock_irq(&ctx->lock);
545}
546
547static int
548counter_sched_in(struct perf_counter *counter,
549 struct perf_cpu_context *cpuctx,
550 struct perf_counter_context *ctx,
551 int cpu)
552{
553 if (counter->state <= PERF_COUNTER_STATE_OFF)
554 return 0;
555
556 counter->state = PERF_COUNTER_STATE_ACTIVE;
557 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
558 /*
559 * The new state must be visible before we turn it on in the hardware:
560 */
561 smp_wmb();
562
563 if (counter->pmu->enable(counter)) {
564 counter->state = PERF_COUNTER_STATE_INACTIVE;
565 counter->oncpu = -1;
566 return -EAGAIN;
567 }
568
569 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
570
571 if (!is_software_counter(counter))
572 cpuctx->active_oncpu++;
573 ctx->nr_active++;
574
575 if (counter->attr.exclusive)
576 cpuctx->exclusive = 1;
577
578 return 0;
579}
580
581static int
582group_sched_in(struct perf_counter *group_counter,
583 struct perf_cpu_context *cpuctx,
584 struct perf_counter_context *ctx,
585 int cpu)
586{
587 struct perf_counter *counter, *partial_group;
588 int ret;
589
590 if (group_counter->state == PERF_COUNTER_STATE_OFF)
591 return 0;
592
593 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
594 if (ret)
595 return ret < 0 ? ret : 0;
596
597 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
598 return -EAGAIN;
599
600 /*
601 * Schedule in siblings as one group (if any):
602 */
603 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
604 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
605 partial_group = counter;
606 goto group_error;
607 }
608 }
609
610 return 0;
611
612group_error:
613 /*
614 * Groups can be scheduled in as one unit only, so undo any
615 * partial group before returning:
616 */
617 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
618 if (counter == partial_group)
619 break;
620 counter_sched_out(counter, cpuctx, ctx);
621 }
622 counter_sched_out(group_counter, cpuctx, ctx);
623
624 return -EAGAIN;
625}
626
627/*
628 * Return 1 for a group consisting entirely of software counters,
629 * 0 if the group contains any hardware counters.
630 */
631static int is_software_only_group(struct perf_counter *leader)
632{
633 struct perf_counter *counter;
634
635 if (!is_software_counter(leader))
636 return 0;
637
638 list_for_each_entry(counter, &leader->sibling_list, list_entry)
639 if (!is_software_counter(counter))
640 return 0;
641
642 return 1;
643}
644
645/*
646 * Work out whether we can put this counter group on the CPU now.
647 */
648static int group_can_go_on(struct perf_counter *counter,
649 struct perf_cpu_context *cpuctx,
650 int can_add_hw)
651{
652 /*
653 * Groups consisting entirely of software counters can always go on.
654 */
655 if (is_software_only_group(counter))
656 return 1;
657 /*
658 * If an exclusive group is already on, no other hardware
659 * counters can go on.
660 */
661 if (cpuctx->exclusive)
662 return 0;
663 /*
664 * If this group is exclusive and there are already
665 * counters on the CPU, it can't go on.
666 */
667 if (counter->attr.exclusive && cpuctx->active_oncpu)
668 return 0;
669 /*
670 * Otherwise, try to add it if all previous groups were able
671 * to go on.
672 */
673 return can_add_hw;
674}
675
676static void add_counter_to_ctx(struct perf_counter *counter,
677 struct perf_counter_context *ctx)
678{
679 list_add_counter(counter, ctx);
680 counter->tstamp_enabled = ctx->time;
681 counter->tstamp_running = ctx->time;
682 counter->tstamp_stopped = ctx->time;
683}
684
685/*
686 * Cross CPU call to install and enable a performance counter
687 *
688 * Must be called with ctx->mutex held
689 */
690static void __perf_install_in_context(void *info)
691{
692 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
693 struct perf_counter *counter = info;
694 struct perf_counter_context *ctx = counter->ctx;
695 struct perf_counter *leader = counter->group_leader;
696 int cpu = smp_processor_id();
697 int err;
698
699 /*
700 * If this is a task context, we need to check whether it is
701 * the current task context of this cpu. If not it has been
702 * scheduled out before the smp call arrived.
703 * Or possibly this is the right context but it isn't
704 * on this cpu because it had no counters.
705 */
706 if (ctx->task && cpuctx->task_ctx != ctx) {
707 if (cpuctx->task_ctx || ctx->task != current)
708 return;
709 cpuctx->task_ctx = ctx;
710 }
711
712 spin_lock(&ctx->lock);
713 ctx->is_active = 1;
714 update_context_time(ctx);
715
716 /*
717 * Protect the list operation against NMI by disabling the
718 * counters on a global level. NOP for non NMI based counters.
719 */
720 perf_disable();
721
722 add_counter_to_ctx(counter, ctx);
723
724 /*
725 * Don't put the counter on if it is disabled or if
726 * it is in a group and the group isn't on.
727 */
728 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
729 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
730 goto unlock;
731
732 /*
733 * An exclusive counter can't go on if there are already active
734 * hardware counters, and no hardware counter can go on if there
735 * is already an exclusive counter on.
736 */
737 if (!group_can_go_on(counter, cpuctx, 1))
738 err = -EEXIST;
739 else
740 err = counter_sched_in(counter, cpuctx, ctx, cpu);
741
742 if (err) {
743 /*
744 * This counter couldn't go on. If it is in a group
745 * then we have to pull the whole group off.
746 * If the counter group is pinned then put it in error state.
747 */
748 if (leader != counter)
749 group_sched_out(leader, cpuctx, ctx);
750 if (leader->attr.pinned) {
751 update_group_times(leader);
752 leader->state = PERF_COUNTER_STATE_ERROR;
753 }
754 }
755
756 if (!err && !ctx->task && cpuctx->max_pertask)
757 cpuctx->max_pertask--;
758
759 unlock:
760 perf_enable();
761
762 spin_unlock(&ctx->lock);
763}
764
765/*
766 * Attach a performance counter to a context
767 *
768 * First we add the counter to the list with the hardware enable bit
769 * in counter->hw_config cleared.
770 *
771 * If the counter is attached to a task which is on a CPU we use a smp
772 * call to enable it in the task context. The task might have been
773 * scheduled away, but we check this in the smp call again.
774 *
775 * Must be called with ctx->mutex held.
776 */
777static void
778perf_install_in_context(struct perf_counter_context *ctx,
779 struct perf_counter *counter,
780 int cpu)
781{
782 struct task_struct *task = ctx->task;
783
784 if (!task) {
785 /*
786 * Per cpu counters are installed via an smp call and
787 * the install is always sucessful.
788 */
789 smp_call_function_single(cpu, __perf_install_in_context,
790 counter, 1);
791 return;
792 }
793
794retry:
795 task_oncpu_function_call(task, __perf_install_in_context,
796 counter);
797
798 spin_lock_irq(&ctx->lock);
799 /*
800 * we need to retry the smp call.
801 */
802 if (ctx->is_active && list_empty(&counter->list_entry)) {
803 spin_unlock_irq(&ctx->lock);
804 goto retry;
805 }
806
807 /*
808 * The lock prevents that this context is scheduled in so we
809 * can add the counter safely, if it the call above did not
810 * succeed.
811 */
812 if (list_empty(&counter->list_entry))
813 add_counter_to_ctx(counter, ctx);
814 spin_unlock_irq(&ctx->lock);
815}
816
817/*
818 * Cross CPU call to enable a performance counter
819 */
820static void __perf_counter_enable(void *info)
821{
822 struct perf_counter *counter = info;
823 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
824 struct perf_counter_context *ctx = counter->ctx;
825 struct perf_counter *leader = counter->group_leader;
826 int err;
827
828 /*
829 * If this is a per-task counter, need to check whether this
830 * counter's task is the current task on this cpu.
831 */
832 if (ctx->task && cpuctx->task_ctx != ctx) {
833 if (cpuctx->task_ctx || ctx->task != current)
834 return;
835 cpuctx->task_ctx = ctx;
836 }
837
838 spin_lock(&ctx->lock);
839 ctx->is_active = 1;
840 update_context_time(ctx);
841
842 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
843 goto unlock;
844 counter->state = PERF_COUNTER_STATE_INACTIVE;
845 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
846
847 /*
848 * If the counter is in a group and isn't the group leader,
849 * then don't put it on unless the group is on.
850 */
851 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
852 goto unlock;
853
854 if (!group_can_go_on(counter, cpuctx, 1)) {
855 err = -EEXIST;
856 } else {
857 perf_disable();
858 if (counter == leader)
859 err = group_sched_in(counter, cpuctx, ctx,
860 smp_processor_id());
861 else
862 err = counter_sched_in(counter, cpuctx, ctx,
863 smp_processor_id());
864 perf_enable();
865 }
866
867 if (err) {
868 /*
869 * If this counter can't go on and it's part of a
870 * group, then the whole group has to come off.
871 */
872 if (leader != counter)
873 group_sched_out(leader, cpuctx, ctx);
874 if (leader->attr.pinned) {
875 update_group_times(leader);
876 leader->state = PERF_COUNTER_STATE_ERROR;
877 }
878 }
879
880 unlock:
881 spin_unlock(&ctx->lock);
882}
883
884/*
885 * Enable a counter.
886 *
887 * If counter->ctx is a cloned context, callers must make sure that
888 * every task struct that counter->ctx->task could possibly point to
889 * remains valid. This condition is satisfied when called through
890 * perf_counter_for_each_child or perf_counter_for_each as described
891 * for perf_counter_disable.
892 */
893static void perf_counter_enable(struct perf_counter *counter)
894{
895 struct perf_counter_context *ctx = counter->ctx;
896 struct task_struct *task = ctx->task;
897
898 if (!task) {
899 /*
900 * Enable the counter on the cpu that it's on
901 */
902 smp_call_function_single(counter->cpu, __perf_counter_enable,
903 counter, 1);
904 return;
905 }
906
907 spin_lock_irq(&ctx->lock);
908 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
909 goto out;
910
911 /*
912 * If the counter is in error state, clear that first.
913 * That way, if we see the counter in error state below, we
914 * know that it has gone back into error state, as distinct
915 * from the task having been scheduled away before the
916 * cross-call arrived.
917 */
918 if (counter->state == PERF_COUNTER_STATE_ERROR)
919 counter->state = PERF_COUNTER_STATE_OFF;
920
921 retry:
922 spin_unlock_irq(&ctx->lock);
923 task_oncpu_function_call(task, __perf_counter_enable, counter);
924
925 spin_lock_irq(&ctx->lock);
926
927 /*
928 * If the context is active and the counter is still off,
929 * we need to retry the cross-call.
930 */
931 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
932 goto retry;
933
934 /*
935 * Since we have the lock this context can't be scheduled
936 * in, so we can change the state safely.
937 */
938 if (counter->state == PERF_COUNTER_STATE_OFF) {
939 counter->state = PERF_COUNTER_STATE_INACTIVE;
940 counter->tstamp_enabled =
941 ctx->time - counter->total_time_enabled;
942 }
943 out:
944 spin_unlock_irq(&ctx->lock);
945}
946
947static int perf_counter_refresh(struct perf_counter *counter, int refresh)
948{
949 /*
950 * not supported on inherited counters
951 */
952 if (counter->attr.inherit)
953 return -EINVAL;
954
955 atomic_add(refresh, &counter->event_limit);
956 perf_counter_enable(counter);
957
958 return 0;
959}
960
961void __perf_counter_sched_out(struct perf_counter_context *ctx,
962 struct perf_cpu_context *cpuctx)
963{
964 struct perf_counter *counter;
965
966 spin_lock(&ctx->lock);
967 ctx->is_active = 0;
968 if (likely(!ctx->nr_counters))
969 goto out;
970 update_context_time(ctx);
971
972 perf_disable();
973 if (ctx->nr_active) {
974 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
975 if (counter != counter->group_leader)
976 counter_sched_out(counter, cpuctx, ctx);
977 else
978 group_sched_out(counter, cpuctx, ctx);
979 }
980 }
981 perf_enable();
982 out:
983 spin_unlock(&ctx->lock);
984}
985
986/*
987 * Test whether two contexts are equivalent, i.e. whether they
988 * have both been cloned from the same version of the same context
989 * and they both have the same number of enabled counters.
990 * If the number of enabled counters is the same, then the set
991 * of enabled counters should be the same, because these are both
992 * inherited contexts, therefore we can't access individual counters
993 * in them directly with an fd; we can only enable/disable all
994 * counters via prctl, or enable/disable all counters in a family
995 * via ioctl, which will have the same effect on both contexts.
996 */
997static int context_equiv(struct perf_counter_context *ctx1,
998 struct perf_counter_context *ctx2)
999{
1000 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1001 && ctx1->parent_gen == ctx2->parent_gen
1002 && !ctx1->pin_count && !ctx2->pin_count;
1003}
1004
1005/*
1006 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled.
1008 *
1009 * We stop each counter and update the counter value in counter->count.
1010 *
1011 * This does not protect us against NMI, but disable()
1012 * sets the disabled bit in the control field of counter _before_
1013 * accessing the counter control register. If a NMI hits, then it will
1014 * not restart the counter.
1015 */
1016void perf_counter_task_sched_out(struct task_struct *task,
1017 struct task_struct *next, int cpu)
1018{
1019 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1020 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1021 struct perf_counter_context *next_ctx;
1022 struct perf_counter_context *parent;
1023 struct pt_regs *regs;
1024 int do_switch = 1;
1025
1026 regs = task_pt_regs(task);
1027 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1028
1029 if (likely(!ctx || !cpuctx->task_ctx))
1030 return;
1031
1032 update_context_time(ctx);
1033
1034 rcu_read_lock();
1035 parent = rcu_dereference(ctx->parent_ctx);
1036 next_ctx = next->perf_counter_ctxp;
1037 if (parent && next_ctx &&
1038 rcu_dereference(next_ctx->parent_ctx) == parent) {
1039 /*
1040 * Looks like the two contexts are clones, so we might be
1041 * able to optimize the context switch. We lock both
1042 * contexts and check that they are clones under the
1043 * lock (including re-checking that neither has been
1044 * uncloned in the meantime). It doesn't matter which
1045 * order we take the locks because no other cpu could
1046 * be trying to lock both of these tasks.
1047 */
1048 spin_lock(&ctx->lock);
1049 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1050 if (context_equiv(ctx, next_ctx)) {
1051 /*
1052 * XXX do we need a memory barrier of sorts
1053 * wrt to rcu_dereference() of perf_counter_ctxp
1054 */
1055 task->perf_counter_ctxp = next_ctx;
1056 next->perf_counter_ctxp = ctx;
1057 ctx->task = next;
1058 next_ctx->task = task;
1059 do_switch = 0;
1060 }
1061 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock);
1063 }
1064 rcu_read_unlock();
1065
1066 if (do_switch) {
1067 __perf_counter_sched_out(ctx, cpuctx);
1068 cpuctx->task_ctx = NULL;
1069 }
1070}
1071
1072/*
1073 * Called with IRQs disabled
1074 */
1075static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1076{
1077 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1078
1079 if (!cpuctx->task_ctx)
1080 return;
1081
1082 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1083 return;
1084
1085 __perf_counter_sched_out(ctx, cpuctx);
1086 cpuctx->task_ctx = NULL;
1087}
1088
1089/*
1090 * Called with IRQs disabled
1091 */
1092static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1093{
1094 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1095}
1096
1097static void
1098__perf_counter_sched_in(struct perf_counter_context *ctx,
1099 struct perf_cpu_context *cpuctx, int cpu)
1100{
1101 struct perf_counter *counter;
1102 int can_add_hw = 1;
1103
1104 spin_lock(&ctx->lock);
1105 ctx->is_active = 1;
1106 if (likely(!ctx->nr_counters))
1107 goto out;
1108
1109 ctx->timestamp = perf_clock();
1110
1111 perf_disable();
1112
1113 /*
1114 * First go through the list and put on any pinned groups
1115 * in order to give them the best chance of going on.
1116 */
1117 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1118 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1119 !counter->attr.pinned)
1120 continue;
1121 if (counter->cpu != -1 && counter->cpu != cpu)
1122 continue;
1123
1124 if (counter != counter->group_leader)
1125 counter_sched_in(counter, cpuctx, ctx, cpu);
1126 else {
1127 if (group_can_go_on(counter, cpuctx, 1))
1128 group_sched_in(counter, cpuctx, ctx, cpu);
1129 }
1130
1131 /*
1132 * If this pinned group hasn't been scheduled,
1133 * put it in error state.
1134 */
1135 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1136 update_group_times(counter);
1137 counter->state = PERF_COUNTER_STATE_ERROR;
1138 }
1139 }
1140
1141 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1142 /*
1143 * Ignore counters in OFF or ERROR state, and
1144 * ignore pinned counters since we did them already.
1145 */
1146 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1147 counter->attr.pinned)
1148 continue;
1149
1150 /*
1151 * Listen to the 'cpu' scheduling filter constraint
1152 * of counters:
1153 */
1154 if (counter->cpu != -1 && counter->cpu != cpu)
1155 continue;
1156
1157 if (counter != counter->group_leader) {
1158 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1159 can_add_hw = 0;
1160 } else {
1161 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1162 if (group_sched_in(counter, cpuctx, ctx, cpu))
1163 can_add_hw = 0;
1164 }
1165 }
1166 }
1167 perf_enable();
1168 out:
1169 spin_unlock(&ctx->lock);
1170}
1171
1172/*
1173 * Called from scheduler to add the counters of the current task
1174 * with interrupts disabled.
1175 *
1176 * We restore the counter value and then enable it.
1177 *
1178 * This does not protect us against NMI, but enable()
1179 * sets the enabled bit in the control field of counter _before_
1180 * accessing the counter control register. If a NMI hits, then it will
1181 * keep the counter running.
1182 */
1183void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1184{
1185 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1186 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1187
1188 if (likely(!ctx))
1189 return;
1190 if (cpuctx->task_ctx == ctx)
1191 return;
1192 __perf_counter_sched_in(ctx, cpuctx, cpu);
1193 cpuctx->task_ctx = ctx;
1194}
1195
1196static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1197{
1198 struct perf_counter_context *ctx = &cpuctx->ctx;
1199
1200 __perf_counter_sched_in(ctx, cpuctx, cpu);
1201}
1202
/*
 * Sentinel for hw_perf_counter::interrupts: perf_ctx_adjust_freq()
 * unthrottles a counter whose interrupt count equals this value.
 */
1203#define MAX_INTERRUPTS (~0ULL)
1204
1205static void perf_log_throttle(struct perf_counter *counter, int enable);
1206static void perf_log_period(struct perf_counter *counter, u64 period);
1207
1208static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{
1210 struct hw_perf_counter *hwc = &counter->hw;
1211 u64 period, sample_period;
1212 s64 delta;
1213
1214 events *= hwc->sample_period;
1215 period = div64_u64(events, counter->attr.sample_freq);
1216
1217 delta = (s64)(period - hwc->sample_period);
1218 delta = (delta + 7) / 8; /* low pass filter */
1219
1220 sample_period = hwc->sample_period + delta;
1221
1222 if (!sample_period)
1223 sample_period = 1;
1224
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period;
1228}
1229
1230static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1231{
1232 struct perf_counter *counter;
1233 struct hw_perf_counter *hwc;
1234 u64 interrupts, freq;
1235
1236 spin_lock(&ctx->lock);
1237 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1238 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1239 continue;
1240
1241 hwc = &counter->hw;
1242
1243 interrupts = hwc->interrupts;
1244 hwc->interrupts = 0;
1245
1246 /*
1247 * unthrottle counters on the tick
1248 */
1249 if (interrupts == MAX_INTERRUPTS) {
1250 perf_log_throttle(counter, 1);
1251 counter->pmu->unthrottle(counter);
1252 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1253 }
1254
1255 if (!counter->attr.freq || !counter->attr.sample_freq)
1256 continue;
1257
1258 /*
1259 * if the specified freq < HZ then we need to skip ticks
1260 */
1261 if (counter->attr.sample_freq < HZ) {
1262 freq = counter->attr.sample_freq;
1263
1264 hwc->freq_count += freq;
1265 hwc->freq_interrupts += interrupts;
1266
1267 if (hwc->freq_count < HZ)
1268 continue;
1269
1270 interrupts = hwc->freq_interrupts;
1271 hwc->freq_interrupts = 0;
1272 hwc->freq_count -= HZ;
1273 } else
1274 freq = HZ;
1275
1276 perf_adjust_period(counter, freq * interrupts);
1277
1278 /*
1279 * In order to avoid being stalled by an (accidental) huge
1280 * sample period, force reset the sample period if we didn't
1281 * get any events in this freq period.
1282 */
1283 if (!interrupts) {
1284 perf_disable();
1285 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter);
1288 perf_enable();
1289 }
1290 }
1291 spin_unlock(&ctx->lock);
1292}
1293
1294/*
1295 * Round-robin a context's counters:
1296 */
1297static void rotate_ctx(struct perf_counter_context *ctx)
1298{
1299 struct perf_counter *counter;
1300
1301 if (!ctx->nr_counters)
1302 return;
1303
1304 spin_lock(&ctx->lock);
1305 /*
1306 * Rotate the first entry last (works just fine for group counters too):
1307 */
1308 perf_disable();
1309 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1310 list_move_tail(&counter->list_entry, &ctx->counter_list);
1311 break;
1312 }
1313 perf_enable();
1314
1315 spin_unlock(&ctx->lock);
1316}
1317
1318void perf_counter_task_tick(struct task_struct *curr, int cpu)
1319{
1320 struct perf_cpu_context *cpuctx;
1321 struct perf_counter_context *ctx;
1322
1323 if (!atomic_read(&nr_counters))
1324 return;
1325
1326 cpuctx = &per_cpu(perf_cpu_context, cpu);
1327 ctx = curr->perf_counter_ctxp;
1328
1329 perf_ctx_adjust_freq(&cpuctx->ctx);
1330 if (ctx)
1331 perf_ctx_adjust_freq(ctx);
1332
1333 perf_counter_cpu_sched_out(cpuctx);
1334 if (ctx)
1335 __perf_counter_task_sched_out(ctx);
1336
1337 rotate_ctx(&cpuctx->ctx);
1338 if (ctx)
1339 rotate_ctx(ctx);
1340
1341 perf_counter_cpu_sched_in(cpuctx, cpu);
1342 if (ctx)
1343 perf_counter_task_sched_in(curr, cpu);
1344}
1345
1346/*
1347 * Cross CPU call to read the hardware counter
1348 */
1349static void __read(void *info)
1350{
1351 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags;
1354
1355 local_irq_save(flags);
1356 if (ctx->is_active)
1357 update_context_time(ctx);
1358 counter->pmu->read(counter);
1359 update_counter_times(counter);
1360 local_irq_restore(flags);
1361}
1362
1363static u64 perf_counter_read(struct perf_counter *counter)
1364{
1365 /*
1366 * If counter is enabled and currently active on a CPU, update the
1367 * value in the counter structure:
1368 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter);
1374 }
1375
1376 return atomic64_read(&counter->count);
1377}
1378
1379/*
1380 * Initialize the perf_counter context in a task_struct:
1381 */
1382static void
1383__perf_counter_init_context(struct perf_counter_context *ctx,
1384 struct task_struct *task)
1385{
1386 memset(ctx, 0, sizeof(*ctx));
1387 spin_lock_init(&ctx->lock);
1388 mutex_init(&ctx->mutex);
1389 INIT_LIST_HEAD(&ctx->counter_list);
1390 INIT_LIST_HEAD(&ctx->event_list);
1391 atomic_set(&ctx->refcount, 1);
1392 ctx->task = task;
1393}
1394
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task;
1401 unsigned long flags;
1402 int err;
1403
1404 /*
1405 * If cpu is not a wildcard then this is a percpu counter:
1406 */
1407 if (cpu != -1) {
1408 /* Must be root to operate on a CPU counter: */
1409 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1410 return ERR_PTR(-EACCES);
1411
1412 if (cpu < 0 || cpu > num_possible_cpus())
1413 return ERR_PTR(-EINVAL);
1414
1415 /*
1416 * We could be clever and allow to attach a counter to an
1417 * offline CPU and activate it when the CPU comes up, but
1418 * that's for later.
1419 */
1420 if (!cpu_isset(cpu, cpu_online_map))
1421 return ERR_PTR(-ENODEV);
1422
1423 cpuctx = &per_cpu(perf_cpu_context, cpu);
1424 ctx = &cpuctx->ctx;
1425 get_ctx(ctx);
1426
1427 return ctx;
1428 }
1429
1430 rcu_read_lock();
1431 if (!pid)
1432 task = current;
1433 else
1434 task = find_task_by_vpid(pid);
1435 if (task)
1436 get_task_struct(task);
1437 rcu_read_unlock();
1438
1439 if (!task)
1440 return ERR_PTR(-ESRCH);
1441
1442 /*
1443 * Can't attach counters to a dying task.
1444 */
1445 err = -ESRCH;
1446 if (task->flags & PF_EXITING)
1447 goto errout;
1448
1449 /* Reuse ptrace permission checks for now. */
1450 err = -EACCES;
1451 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1452 goto errout;
1453
1454 retry:
1455 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) {
1457 parent_ctx = ctx->parent_ctx;
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags);
1468 }
1469
1470 if (!ctx) {
1471 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1472 err = -ENOMEM;
1473 if (!ctx)
1474 goto errout;
1475 __perf_counter_init_context(ctx, task);
1476 get_ctx(ctx);
1477 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1478 /*
1479 * We raced with some other task; use
1480 * the context they set.
1481 */
1482 kfree(ctx);
1483 goto retry;
1484 }
1485 get_task_struct(task);
1486 }
1487
1488 put_task_struct(task);
1489 return ctx;
1490
1491 errout:
1492 put_task_struct(task);
1493 return ERR_PTR(err);
1494}
1495
1496static void free_counter_rcu(struct rcu_head *head)
1497{
1498 struct perf_counter *counter;
1499
1500 counter = container_of(head, struct perf_counter, rcu_head);
1501 if (counter->ns)
1502 put_pid_ns(counter->ns);
1503 kfree(counter);
1504}
1505
1506static void perf_pending_sync(struct perf_counter *counter);
1507
1508static void free_counter(struct perf_counter *counter)
1509{
1510 perf_pending_sync(counter);
1511
1512 atomic_dec(&nr_counters);
1513 if (counter->attr.mmap)
1514 atomic_dec(&nr_mmap_counters);
1515 if (counter->attr.comm)
1516 atomic_dec(&nr_comm_counters);
1517
1518 if (counter->destroy)
1519 counter->destroy(counter);
1520
1521 put_ctx(counter->ctx);
1522 call_rcu(&counter->rcu_head, free_counter_rcu);
1523}
1524
1525/*
1526 * Called when the last reference to the file is gone.
1527 */
1528static int perf_release(struct inode *inode, struct file *file)
1529{
1530 struct perf_counter *counter = file->private_data;
1531 struct perf_counter_context *ctx = counter->ctx;
1532
1533 file->private_data = NULL;
1534
1535 WARN_ON_ONCE(ctx->parent_ctx);
1536 mutex_lock(&ctx->mutex);
1537 perf_counter_remove_from_context(counter);
1538 mutex_unlock(&ctx->mutex);
1539
1540 mutex_lock(&counter->owner->perf_counter_mutex);
1541 list_del_init(&counter->owner_entry);
1542 mutex_unlock(&counter->owner->perf_counter_mutex);
1543 put_task_struct(counter->owner);
1544
1545 free_counter(counter);
1546
1547 return 0;
1548}
1549
1550/*
1551 * Read the performance counter - simple non blocking version for now
1552 */
1553static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{
1556 u64 values[3];
1557 int n;
1558
1559 /*
1560 * Return end-of-file for a read on a counter that is in
1561 * error state (i.e. because it was pinned but it couldn't be
1562 * scheduled on to the CPU at some point).
1563 */
1564 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0;
1566
1567 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter);
1570 n = 1;
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1572 values[n++] = counter->total_time_enabled +
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex);
1580
1581 if (count < n * sizeof(u64))
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589}
1590
1591static ssize_t
1592perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1593{
1594 struct perf_counter *counter = file->private_data;
1595
1596 return perf_read_hw(counter, buf, count);
1597}
1598
1599static unsigned int perf_poll(struct file *file, poll_table *wait)
1600{
1601 struct perf_counter *counter = file->private_data;
1602 struct perf_mmap_data *data;
1603 unsigned int events = POLL_HUP;
1604
1605 rcu_read_lock();
1606 data = rcu_dereference(counter->data);
1607 if (data)
1608 events = atomic_xchg(&data->poll, 0);
1609 rcu_read_unlock();
1610
1611 poll_wait(file, &counter->waitq, wait);
1612
1613 return events;
1614}
1615
1616static void perf_counter_reset(struct perf_counter *counter)
1617{
1618 (void)perf_counter_read(counter);
1619 atomic64_set(&counter->count, 0);
1620 perf_counter_update_userpage(counter);
1621}
1622
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/*
1640 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block
1642 * in sync_child_counter if it goes to exit, thus satisfying the
1643 * task existence requirements of perf_counter_enable/disable.
1644 */
1645static void perf_counter_for_each_child(struct perf_counter *counter,
1646 void (*func)(struct perf_counter *))
1647{
1648 struct perf_counter *child;
1649
1650 WARN_ON_ONCE(counter->ctx->parent_ctx);
1651 mutex_lock(&counter->child_mutex);
1652 func(counter);
1653 list_for_each_entry(child, &counter->child_list, child_list)
1654 func(child);
1655 mutex_unlock(&counter->child_mutex);
1656}
1657
1658static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *))
1660{
1661 struct perf_counter *child;
1662
1663 WARN_ON_ONCE(counter->ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex);
1665 perf_counter_for_each_sibling(counter, func);
1666 list_for_each_entry(child, &counter->child_list, child_list)
1667 perf_counter_for_each_sibling(child, func);
1668 mutex_unlock(&counter->child_mutex);
1669}
1670
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1672{
1673 struct perf_counter_context *ctx = counter->ctx;
1674 unsigned long size;
1675 int ret = 0;
1676 u64 value;
1677
1678 if (!counter->attr.sample_period)
1679 return -EINVAL;
1680
1681 size = copy_from_user(&value, arg, sizeof(value));
1682 if (size != sizeof(value))
1683 return -EFAULT;
1684
1685 if (!value)
1686 return -EINVAL;
1687
1688 spin_lock_irq(&ctx->lock);
1689 if (counter->attr.freq) {
1690 if (value > sysctl_perf_counter_sample_rate) {
1691 ret = -EINVAL;
1692 goto unlock;
1693 }
1694
1695 counter->attr.sample_freq = value;
1696 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value;
1701 }
1702unlock:
1703 spin_unlock_irq(&ctx->lock);
1704
1705 return ret;
1706}
1707
1708static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1709{
1710 struct perf_counter *counter = file->private_data;
1711 void (*func)(struct perf_counter *);
1712 u32 flags = arg;
1713
1714 switch (cmd) {
1715 case PERF_COUNTER_IOC_ENABLE:
1716 func = perf_counter_enable;
1717 break;
1718 case PERF_COUNTER_IOC_DISABLE:
1719 func = perf_counter_disable;
1720 break;
1721 case PERF_COUNTER_IOC_RESET:
1722 func = perf_counter_reset;
1723 break;
1724
1725 case PERF_COUNTER_IOC_REFRESH:
1726 return perf_counter_refresh(counter, arg);
1727
1728 case PERF_COUNTER_IOC_PERIOD:
1729 return perf_counter_period(counter, (u64 __user *)arg);
1730
1731 default:
1732 return -ENOTTY;
1733 }
1734
1735 if (flags & PERF_IOC_FLAG_GROUP)
1736 perf_counter_for_each(counter, func);
1737 else
1738 perf_counter_for_each_child(counter, func);
1739
1740 return 0;
1741}
1742
1743int perf_counter_task_enable(void)
1744{
1745 struct perf_counter *counter;
1746
1747 mutex_lock(&current->perf_counter_mutex);
1748 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1749 perf_counter_for_each_child(counter, perf_counter_enable);
1750 mutex_unlock(&current->perf_counter_mutex);
1751
1752 return 0;
1753}
1754
1755int perf_counter_task_disable(void)
1756{
1757 struct perf_counter *counter;
1758
1759 mutex_lock(&current->perf_counter_mutex);
1760 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1761 perf_counter_for_each_child(counter, perf_counter_disable);
1762 mutex_unlock(&current->perf_counter_mutex);
1763
1764 return 0;
1765}
1766
1767/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch
1770 * code calls this from NMI context.
1771 */
1772void perf_counter_update_userpage(struct perf_counter *counter)
1773{
1774 struct perf_counter_mmap_page *userpg;
1775 struct perf_mmap_data *data;
1776
1777 rcu_read_lock();
1778 data = rcu_dereference(counter->data);
1779 if (!data)
1780 goto unlock;
1781
1782 userpg = data->user_page;
1783
1784 /*
1785 * Disable preemption so as to not let the corresponding user-space
1786 * spin too long if we get preempted.
1787 */
1788 preempt_disable();
1789 ++userpg->lock;
1790 barrier();
1791 userpg->index = counter->hw.idx;
1792 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795
1796 barrier();
1797 ++userpg->lock;
1798 preempt_enable();
1799unlock:
1800 rcu_read_unlock();
1801}
1802
1803static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1804{
1805 struct perf_counter *counter = vma->vm_file->private_data;
1806 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS;
1808
1809 rcu_read_lock();
1810 data = rcu_dereference(counter->data);
1811 if (!data)
1812 goto unlock;
1813
1814 if (vmf->pgoff == 0) {
1815 vmf->page = virt_to_page(data->user_page);
1816 } else {
1817 int nr = vmf->pgoff - 1;
1818
1819 if ((unsigned)nr > data->nr_pages)
1820 goto unlock;
1821
1822 vmf->page = virt_to_page(data->data_pages[nr]);
1823 }
1824 get_page(vmf->page);
1825 ret = 0;
1826unlock:
1827 rcu_read_unlock();
1828
1829 return ret;
1830}
1831
1832static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1833{
1834 struct perf_mmap_data *data;
1835 unsigned long size;
1836 int i;
1837
1838 WARN_ON(atomic_read(&counter->mmap_count));
1839
1840 size = sizeof(struct perf_mmap_data);
1841 size += nr_pages * sizeof(void *);
1842
1843 data = kzalloc(size, GFP_KERNEL);
1844 if (!data)
1845 goto fail;
1846
1847 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1848 if (!data->user_page)
1849 goto fail_user_page;
1850
1851 for (i = 0; i < nr_pages; i++) {
1852 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1853 if (!data->data_pages[i])
1854 goto fail_data_pages;
1855 }
1856
1857 data->nr_pages = nr_pages;
1858 atomic_set(&data->lock, -1);
1859
1860 rcu_assign_pointer(counter->data, data);
1861
1862 return 0;
1863
1864fail_data_pages:
1865 for (i--; i >= 0; i--)
1866 free_page((unsigned long)data->data_pages[i]);
1867
1868 free_page((unsigned long)data->user_page);
1869
1870fail_user_page:
1871 kfree(data);
1872
1873fail:
1874 return -ENOMEM;
1875}
1876
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{
1879 struct perf_mmap_data *data;
1880 int i;
1881
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883
1884 free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]);
1887 kfree(data);
1888}
1889
1890static void perf_mmap_data_free(struct perf_counter *counter)
1891{
1892 struct perf_mmap_data *data = counter->data;
1893
1894 WARN_ON(atomic_read(&counter->mmap_count));
1895
1896 rcu_assign_pointer(counter->data, NULL);
1897 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1898}
1899
1900static void perf_mmap_open(struct vm_area_struct *vma)
1901{
1902 struct perf_counter *counter = vma->vm_file->private_data;
1903
1904 atomic_inc(&counter->mmap_count);
1905}
1906
1907static void perf_mmap_close(struct vm_area_struct *vma)
1908{
1909 struct perf_counter *counter = vma->vm_file->private_data;
1910
1911 WARN_ON_ONCE(counter->ctx->parent_ctx);
1912 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
1913 struct user_struct *user = current_user();
1914
1915 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1916 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1917 perf_mmap_data_free(counter);
1918 mutex_unlock(&counter->mmap_mutex);
1919 }
1920}
1921
/*
 * VM operations installed on counter mappings by perf_mmap():
 * open/close maintain counter->mmap_count, fault hands out the user
 * page and the data pages.
 */
1922static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open,
1924 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault,
1926};
1927
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1929{
1930 struct perf_counter *counter = file->private_data;
1931 unsigned long user_locked, user_lock_limit;
1932 struct user_struct *user = current_user();
1933 unsigned long locked, lock_limit;
1934 unsigned long vma_size;
1935 unsigned long nr_pages;
1936 long user_extra, extra;
1937 int ret = 0;
1938
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1940 return -EINVAL;
1941
1942 vma_size = vma->vm_end - vma->vm_start;
1943 nr_pages = (vma_size / PAGE_SIZE) - 1;
1944
1945 /*
1946 * If we have data pages ensure they're a power-of-two number, so we
1947 * can do bitmasks instead of modulo.
1948 */
1949 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1950 return -EINVAL;
1951
1952 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1953 return -EINVAL;
1954
1955 if (vma->vm_pgoff != 0)
1956 return -EINVAL;
1957
1958 WARN_ON_ONCE(counter->ctx->parent_ctx);
1959 mutex_lock(&counter->mmap_mutex);
1960 if (atomic_inc_not_zero(&counter->mmap_count)) {
1961 if (nr_pages != counter->data->nr_pages)
1962 ret = -EINVAL;
1963 goto unlock;
1964 }
1965
1966 user_extra = nr_pages + 1;
1967 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1968
1969 /*
1970 * Increase the limit linearly with more CPUs:
1971 */
1972 user_lock_limit *= num_online_cpus();
1973
1974 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1975
1976 extra = 0;
1977 if (user_locked > user_lock_limit)
1978 extra = user_locked - user_lock_limit;
1979
1980 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1981 lock_limit >>= PAGE_SHIFT;
1982 locked = vma->vm_mm->locked_vm + extra;
1983
1984 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1985 ret = -EPERM;
1986 goto unlock;
1987 }
1988
1989 WARN_ON(counter->data);
1990 ret = perf_mmap_data_alloc(counter, nr_pages);
1991 if (ret)
1992 goto unlock;
1993
1994 atomic_set(&counter->mmap_count, 1);
1995 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra;
1998unlock:
1999 mutex_unlock(&counter->mmap_mutex);
2000
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops;
2004
2005 return ret;
2006}
2007
2008static int perf_fasync(int fd, struct file *filp, int on)
2009{
2010 struct inode *inode = filp->f_path.dentry->d_inode;
2011 struct perf_counter *counter = filp->private_data;
2012 int retval;
2013
2014 mutex_lock(&inode->i_mutex);
2015 retval = fasync_helper(fd, filp, on, &counter->fasync);
2016 mutex_unlock(&inode->i_mutex);
2017
2018 if (retval < 0)
2019 return retval;
2020
2021 return 0;
2022}
2023
/*
 * File operations of a counter file descriptor.  The same handler
 * serves both the native and the compat ioctl entry points.
 */
2024static const struct file_operations perf_fops = {
2025 .release = perf_release,
2026 .read = perf_read,
2027 .poll = perf_poll,
2028 .unlocked_ioctl = perf_ioctl,
2029 .compat_ioctl = perf_ioctl,
2030 .mmap = perf_mmap,
2031 .fasync = perf_fasync,
2032};
2033
2034/*
2035 * Perf counter wakeup
2036 *
2037 * If there's data, ensure we set the poll() state and publish everything
2038 * to user-space before waking everybody up.
2039 */
2040
2041void perf_counter_wakeup(struct perf_counter *counter)
2042{
2043 wake_up_all(&counter->waitq);
2044
2045 if (counter->pending_kill) {
2046 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2047 counter->pending_kill = 0;
2048 }
2049}
2050
2051/*
2052 * Pending wakeups
2053 *
2054 * Handle the case where we need to issue a wakeup from NMI (or rq->lock) context.
2055 *
2056 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2057 * single linked list and use cmpxchg() to add entries lockless.
2058 */
2059
2060static void perf_pending_counter(struct perf_pending_entry *entry)
2061{
2062 struct perf_counter *counter = container_of(entry,
2063 struct perf_counter, pending);
2064
2065 if (counter->pending_disable) {
2066 counter->pending_disable = 0;
2067 perf_counter_disable(counter);
2068 }
2069
2070 if (counter->pending_wakeup) {
2071 counter->pending_wakeup = 0;
2072 perf_counter_wakeup(counter);
2073 }
2074}
2075
/*
 * PENDING_TAIL terminates the per-CPU pending list.  An entry whose
 * ->next is NULL is not queued at all (see perf_pending_queue()), so a
 * distinct non-NULL value is used as the end-of-list marker.
 */
2076#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2077
/* Per-CPU head of the singly linked pending list; starts out empty. */
2078static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2079 PENDING_TAIL,
2080};
2081
2082static void perf_pending_queue(struct perf_pending_entry *entry,
2083 void (*func)(struct perf_pending_entry *))
2084{
2085 struct perf_pending_entry **head;
2086
2087 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2088 return;
2089
2090 entry->func = func;
2091
2092 head = &get_cpu_var(perf_pending_head);
2093
2094 do {
2095 entry->next = *head;
2096 } while (cmpxchg(head, entry->next, entry) != entry->next);
2097
2098 set_perf_counter_pending();
2099
2100 put_cpu_var(perf_pending_head);
2101}
2102
2103static int __perf_pending_run(void)
2104{
2105 struct perf_pending_entry *list;
2106 int nr = 0;
2107
2108 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2109 while (list != PENDING_TAIL) {
2110 void (*func)(struct perf_pending_entry *);
2111 struct perf_pending_entry *entry = list;
2112
2113 list = list->next;
2114
2115 func = entry->func;
2116 entry->next = NULL;
2117 /*
2118 * Ensure we observe the unqueue before we issue the wakeup,
2119 * so that we won't be waiting forever.
2120 * -- see perf_not_pending().
2121 */
2122 smp_wmb();
2123
2124 func(entry);
2125 nr++;
2126 }
2127
2128 return nr;
2129}
2130
2131static inline int perf_not_pending(struct perf_counter *counter)
2132{
2133 /*
2134 * If we flush on whatever cpu we run, there is a chance we don't
2135 * need to wait.
2136 */
2137 get_cpu();
2138 __perf_pending_run();
2139 put_cpu();
2140
2141 /*
2142 * Ensure we see the proper queue state before going to sleep
2143 * so that we do not miss the wakeup. -- see perf_pending_handle()
2144 */
2145 smp_rmb();
2146 return counter->pending.next == NULL;
2147}
2148
2149static void perf_pending_sync(struct perf_counter *counter)
2150{
2151 wait_event(counter->waitq, perf_not_pending(counter));
2152}
2153
/*
 * Run the pending work queued on the current CPU.  The number of
 * entries processed is of no interest here.
 */
void perf_counter_do_pending(void)
{
	(void)__perf_pending_run();
}
2158
2159/*
2160 * Callchain support -- arch specific
2161 */
2162
2163__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2164{
2165 return NULL;
2166}
2167
2168/*
2169 * Output
2170 */
2171
/*
 * State of one record being written to a counter's mmap buffer, from
 * perf_output_begin() to perf_output_end().
 */
2172struct perf_output_handle {
/* counter whose buffer is written (the parent, for inherited counters) */
2173 struct perf_counter *counter;
/* mmap data of that counter */
2174 struct perf_mmap_data *data;
/* buffer head after this record's reservation */
2175 unsigned long head;
/* current write position; advanced by perf_output_copy() */
2176 unsigned long offset;
/* non-zero: wakeups are deferred through the pending list */
2177 int nmi;
/* non-zero: the record counts towards attr.wakeup_events */
2178 int overflow;
/* set by perf_output_lock() when this handle acquired data->lock */
2179 int locked;
/* IRQ flags saved by perf_output_lock() */
2180 unsigned long flags;
2181};
2182
2183static void perf_output_wakeup(struct perf_output_handle *handle)
2184{
2185 atomic_set(&handle->data->poll, POLL_IN);
2186
2187 if (handle->nmi) {
2188 handle->counter->pending_wakeup = 1;
2189 perf_pending_queue(&handle->counter->pending,
2190 perf_pending_counter);
2191 } else
2192 perf_counter_wakeup(handle->counter);
2193}
2194
2195/*
2196 * Curious locking construct.
2197 *
2198 * We need to ensure a later event doesn't publish a head when a former
2199 * event isn't done writing. However since we need to deal with NMIs we
2200 * cannot fully serialize things.
2201 *
2202 * What we do is serialize between CPUs so we only have to deal with NMI
2203 * nesting on a single CPU.
2204 *
2205 * We only publish the head (and generate a wakeup) when the outer-most
2206 * event completes.
2207 */
2208static void perf_output_lock(struct perf_output_handle *handle)
2209{
2210 struct perf_mmap_data *data = handle->data;
2211 int cpu;
2212
2213 handle->locked = 0;
2214
2215 local_irq_save(handle->flags);
2216 cpu = smp_processor_id();
2217
2218 if (in_nmi() && atomic_read(&data->lock) == cpu)
2219 return;
2220
2221 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2222 cpu_relax();
2223
2224 handle->locked = 1;
2225}
2226
2227static void perf_output_unlock(struct perf_output_handle *handle)
2228{
2229 struct perf_mmap_data *data = handle->data;
2230 unsigned long head;
2231 int cpu;
2232
2233 data->done_head = data->head;
2234
2235 if (!handle->locked)
2236 goto out;
2237
2238again:
2239 /*
2240 * The xchg implies a full barrier that ensures all writes are done
2241 * before we publish the new head, matched by a rmb() in userspace when
2242 * reading this position.
2243 */
2244 while ((head = atomic_long_xchg(&data->done_head, 0)))
2245 data->user_page->data_head = head;
2246
2247 /*
2248 * NMI can happen here, which means we can miss a done_head update.
2249 */
2250
2251 cpu = atomic_xchg(&data->lock, -1);
2252 WARN_ON_ONCE(cpu != smp_processor_id());
2253
2254 /*
2255 * Therefore we have to validate we did not indeed do so.
2256 */
2257 if (unlikely(atomic_long_read(&data->done_head))) {
2258 /*
2259 * Since we had it locked, we can lock it again.
2260 */
2261 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2262 cpu_relax();
2263
2264 goto again;
2265 }
2266
2267 if (atomic_xchg(&data->wakeup, 0))
2268 perf_output_wakeup(handle);
2269out:
2270 local_irq_restore(handle->flags);
2271}
2272
2273static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow)
2276{
2277 struct perf_mmap_data *data;
2278 unsigned int offset, head;
2279
2280 /*
2281 * For inherited counters we send all the output towards the parent.
2282 */
2283 if (counter->parent)
2284 counter = counter->parent;
2285
2286 rcu_read_lock();
2287 data = rcu_dereference(counter->data);
2288 if (!data)
2289 goto out;
2290
2291 handle->data = data;
2292 handle->counter = counter;
2293 handle->nmi = nmi;
2294 handle->overflow = overflow;
2295
2296 if (!data->nr_pages)
2297 goto fail;
2298
2299 perf_output_lock(handle);
2300
2301 do {
2302 offset = head = atomic_long_read(&data->head);
2303 head += size;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305
2306 handle->offset = offset;
2307 handle->head = head;
2308
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1);
2311
2312 return 0;
2313
2314fail:
2315 perf_output_wakeup(handle);
2316out:
2317 rcu_read_unlock();
2318
2319 return -ENOSPC;
2320}
2321
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
/* Copy the object @x, by address and sizeof, into the reservation of @handle. */
#define perf_output_put(handle, x) \
	perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle)
2362{
2363 struct perf_counter *counter = handle->counter;
2364 struct perf_mmap_data *data = handle->data;
2365
2366 int wakeup_events = counter->attr.wakeup_events;
2367
2368 if (handle->overflow && wakeup_events) {
2369 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events);
2372 atomic_set(&data->wakeup, 1);
2373 }
2374 }
2375
2376 perf_output_unlock(handle);
2377 rcu_read_unlock();
2378}
2379
2380static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2381{
2382 /*
2383 * only top level counters have the pid namespace they were created in
2384 */
2385 if (counter->parent)
2386 counter = counter->parent;
2387
2388 return task_tgid_nr_ns(p, counter->ns);
2389}
2390
2391static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2392{
2393 /*
2394 * only top level counters have the pid namespace they were created in
2395 */
2396 if (counter->parent)
2397 counter = counter->parent;
2398
2399 return task_pid_nr_ns(p, counter->ns);
2400}
2401
2402static void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data)
2404{
2405 int ret;
2406 u64 sample_type = counter->attr.sample_type;
2407 struct perf_output_handle handle;
2408 struct perf_event_header header;
2409 u64 ip;
2410 struct {
2411 u32 pid, tid;
2412 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0;
2419 u64 time;
2420 struct {
2421 u32 cpu, reserved;
2422 } cpu_entry;
2423
2424 header.type = 0;
2425 header.size = sizeof(header);
2426
2427 header.misc = PERF_EVENT_MISC_OVERFLOW;
2428 header.misc |= perf_misc_flags(data->regs);
2429
2430 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip);
2434 }
2435
2436 if (sample_type & PERF_SAMPLE_TID) {
2437 /* namespace issues */
2438 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current);
2440
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry);
2443 }
2444
2445 if (sample_type & PERF_SAMPLE_TIME) {
2446 /*
2447 * Maybe do better on x86 and provide cpu_clock_nmi()
2448 */
2449 time = sched_clock();
2450
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64);
2453 }
2454
2455 if (sample_type & PERF_SAMPLE_ADDR) {
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64);
2458 }
2459
2460 if (sample_type & PERF_SAMPLE_ID) {
2461 header.type |= PERF_SAMPLE_ID;
2462 header.size += sizeof(u64);
2463 }
2464
2465 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry);
2468
2469 cpu_entry.cpu = raw_smp_processor_id();
2470 }
2471
2472 if (sample_type & PERF_SAMPLE_PERIOD) {
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64);
2475 }
2476
2477 if (sample_type & PERF_SAMPLE_GROUP) {
2478 header.type |= PERF_SAMPLE_GROUP;
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs);
2485
2486 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size;
2491 }
2492 }
2493
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2495 if (ret)
2496 return;
2497
2498 perf_output_put(&handle, header);
2499
2500 if (sample_type & PERF_SAMPLE_IP)
2501 perf_output_put(&handle, ip);
2502
2503 if (sample_type & PERF_SAMPLE_TID)
2504 perf_output_put(&handle, tid_entry);
2505
2506 if (sample_type & PERF_SAMPLE_TIME)
2507 perf_output_put(&handle, time);
2508
2509 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr);
2511
2512 if (sample_type & PERF_SAMPLE_ID)
2513 perf_output_put(&handle, counter->id);
2514
2515 if (sample_type & PERF_SAMPLE_CPU)
2516 perf_output_put(&handle, cpu_entry);
2517
2518 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period);
2520
2521 /*
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527
2528 perf_output_put(&handle, nr);
2529
2530 leader = counter->group_leader;
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2532 if (sub != counter)
2533 sub->pmu->read(sub);
2534
2535 group_entry.id = sub->id;
2536 group_entry.counter = atomic64_read(&sub->count);
2537
2538 perf_output_put(&handle, group_entry);
2539 }
2540 }
2541
2542 if (callchain)
2543 perf_output_copy(&handle, callchain, callchain_size);
2544
2545 perf_output_end(&handle);
2546}
2547
2548/*
2549 * fork tracking
2550 */
2551
2552struct perf_fork_event {
2553 struct task_struct *task;
2554
2555 struct {
2556 struct perf_event_header header;
2557
2558 u32 pid;
2559 u32 ppid;
2560 } event;
2561};
2562
2563static void perf_counter_fork_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event)
2565{
2566 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size;
2568 struct task_struct *task = fork_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570
2571 if (ret)
2572 return;
2573
2574 fork_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2576
2577 perf_output_put(&handle, fork_event->event);
2578 perf_output_end(&handle);
2579}
2580
2581static int perf_counter_fork_match(struct perf_counter *counter)
2582{
2583 if (counter->attr.comm || counter->attr.mmap)
2584 return 1;
2585
2586 return 0;
2587}
2588
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event)
2591{
2592 struct perf_counter *counter;
2593
2594 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2595 return;
2596
2597 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter))
2600 perf_counter_fork_output(counter, fork_event);
2601 }
2602 rcu_read_unlock();
2603}
2604
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2606{
2607 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx;
2609
2610 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2612 put_cpu_var(perf_cpu_context);
2613
2614 rcu_read_lock();
2615 /*
2616 * doesn't really matter which of the child contexts the
2617 * events ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event);
2622 rcu_read_unlock();
2623}
2624
2625void perf_counter_fork(struct task_struct *task)
2626{
2627 struct perf_fork_event fork_event;
2628
2629 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters))
2631 return;
2632
2633 fork_event = (struct perf_fork_event){
2634 .task = task,
2635 .event = {
2636 .header = {
2637 .type = PERF_EVENT_FORK,
2638 .size = sizeof(fork_event.event),
2639 },
2640 },
2641 };
2642
2643 perf_counter_fork_event(&fork_event);
2644}
2645
2646/*
2647 * comm tracking
2648 */
2649
2650struct perf_comm_event {
2651 struct task_struct *task;
2652 char *comm;
2653 int comm_size;
2654
2655 struct {
2656 struct perf_event_header header;
2657
2658 u32 pid;
2659 u32 tid;
2660 } event;
2661};
2662
2663static void perf_counter_comm_output(struct perf_counter *counter,
2664 struct perf_comm_event *comm_event)
2665{
2666 struct perf_output_handle handle;
2667 int size = comm_event->event.header.size;
2668 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2669
2670 if (ret)
2671 return;
2672
2673 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2674 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2675
2676 perf_output_put(&handle, comm_event->event);
2677 perf_output_copy(&handle, comm_event->comm,
2678 comm_event->comm_size);
2679 perf_output_end(&handle);
2680}
2681
2682static int perf_counter_comm_match(struct perf_counter *counter)
2683{
2684 if (counter->attr.comm)
2685 return 1;
2686
2687 return 0;
2688}
2689
2690static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2691 struct perf_comm_event *comm_event)
2692{
2693 struct perf_counter *counter;
2694
2695 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2696 return;
2697
2698 rcu_read_lock();
2699 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2700 if (perf_counter_comm_match(counter))
2701 perf_counter_comm_output(counter, comm_event);
2702 }
2703 rcu_read_unlock();
2704}
2705
2706static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2707{
2708 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx;
2710 unsigned int size;
2711 char *comm = comm_event->task->comm;
2712
2713 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714
2715 comm_event->comm = comm;
2716 comm_event->comm_size = size;
2717
2718 comm_event->event.header.size = sizeof(comm_event->event) + size;
2719
2720 cpuctx = &get_cpu_var(perf_cpu_context);
2721 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2722 put_cpu_var(perf_cpu_context);
2723
2724 rcu_read_lock();
2725 /*
2726 * doesn't really matter which of the child contexts the
2727 * events ends up in.
2728 */
2729 ctx = rcu_dereference(current->perf_counter_ctxp);
2730 if (ctx)
2731 perf_counter_comm_ctx(ctx, comm_event);
2732 rcu_read_unlock();
2733}
2734
2735void perf_counter_comm(struct task_struct *task)
2736{
2737 struct perf_comm_event comm_event;
2738
2739 if (!atomic_read(&nr_comm_counters))
2740 return;
2741
2742 comm_event = (struct perf_comm_event){
2743 .task = task,
2744 .event = {
2745 .header = { .type = PERF_EVENT_COMM, },
2746 },
2747 };
2748
2749 perf_counter_comm_event(&comm_event);
2750}
2751
2752/*
2753 * mmap tracking
2754 */
2755
2756struct perf_mmap_event {
2757 struct vm_area_struct *vma;
2758
2759 const char *file_name;
2760 int file_size;
2761
2762 struct {
2763 struct perf_event_header header;
2764
2765 u32 pid;
2766 u32 tid;
2767 u64 start;
2768 u64 len;
2769 u64 pgoff;
2770 } event;
2771};
2772
2773static void perf_counter_mmap_output(struct perf_counter *counter,
2774 struct perf_mmap_event *mmap_event)
2775{
2776 struct perf_output_handle handle;
2777 int size = mmap_event->event.header.size;
2778 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2779
2780 if (ret)
2781 return;
2782
2783 mmap_event->event.pid = perf_counter_pid(counter, current);
2784 mmap_event->event.tid = perf_counter_tid(counter, current);
2785
2786 perf_output_put(&handle, mmap_event->event);
2787 perf_output_copy(&handle, mmap_event->file_name,
2788 mmap_event->file_size);
2789 perf_output_end(&handle);
2790}
2791
2792static int perf_counter_mmap_match(struct perf_counter *counter,
2793 struct perf_mmap_event *mmap_event)
2794{
2795 if (counter->attr.mmap)
2796 return 1;
2797
2798 return 0;
2799}
2800
2801static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2802 struct perf_mmap_event *mmap_event)
2803{
2804 struct perf_counter *counter;
2805
2806 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2807 return;
2808
2809 rcu_read_lock();
2810 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2811 if (perf_counter_mmap_match(counter, mmap_event))
2812 perf_counter_mmap_output(counter, mmap_event);
2813 }
2814 rcu_read_unlock();
2815}
2816
2817static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2818{
2819 struct perf_cpu_context *cpuctx;
2820 struct perf_counter_context *ctx;
2821 struct vm_area_struct *vma = mmap_event->vma;
2822 struct file *file = vma->vm_file;
2823 unsigned int size;
2824 char tmp[16];
2825 char *buf = NULL;
2826 const char *name;
2827
2828 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2830 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name;
2833 }
2834 name = d_path(&file->f_path, buf, PATH_MAX);
2835 if (IS_ERR(name)) {
2836 name = strncpy(tmp, "//toolong", sizeof(tmp));
2837 goto got_name;
2838 }
2839 } else {
2840 name = arch_vma_name(mmap_event->vma);
2841 if (name)
2842 goto got_name;
2843
2844 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp));
2846 goto got_name;
2847 }
2848
2849 name = strncpy(tmp, "//anon", sizeof(tmp));
2850 goto got_name;
2851 }
2852
2853got_name:
2854 size = ALIGN(strlen(name)+1, sizeof(u64));
2855
2856 mmap_event->file_name = name;
2857 mmap_event->file_size = size;
2858
2859 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2860
2861 cpuctx = &get_cpu_var(perf_cpu_context);
2862 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2863 put_cpu_var(perf_cpu_context);
2864
2865 rcu_read_lock();
2866 /*
2867 * doesn't really matter which of the child contexts the
2868 * events ends up in.
2869 */
2870 ctx = rcu_dereference(current->perf_counter_ctxp);
2871 if (ctx)
2872 perf_counter_mmap_ctx(ctx, mmap_event);
2873 rcu_read_unlock();
2874
2875 kfree(buf);
2876}
2877
2878void __perf_counter_mmap(struct vm_area_struct *vma)
2879{
2880 struct perf_mmap_event mmap_event;
2881
2882 if (!atomic_read(&nr_mmap_counters))
2883 return;
2884
2885 mmap_event = (struct perf_mmap_event){
2886 .vma = vma,
2887 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, },
2889 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff,
2892 },
2893 };
2894
2895 perf_counter_mmap_event(&mmap_event);
2896}
2897
2898/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging
2943 */
2944
2945static void perf_log_throttle(struct perf_counter *counter, int enable)
2946{
2947 struct perf_output_handle handle;
2948 int ret;
2949
2950 struct {
2951 struct perf_event_header header;
2952 u64 time;
2953 u64 id;
2954 } throttle_event = {
2955 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1,
2957 .misc = 0,
2958 .size = sizeof(throttle_event),
2959 },
2960 .time = sched_clock(),
2961 .id = counter->id,
2962 };
2963
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret)
2966 return;
2967
2968 perf_output_put(&handle, throttle_event);
2969 perf_output_end(&handle);
2970}
2971
2972/*
2973 * Generic counter overflow handling.
2974 */
2975
2976int perf_counter_overflow(struct perf_counter *counter, int nmi,
2977 struct perf_sample_data *data)
2978{
2979 int events = atomic_read(&counter->event_limit);
2980 int throttle = counter->pmu->unthrottle != NULL;
2981 struct hw_perf_counter *hwc = &counter->hw;
2982 int ret = 0;
2983
2984 if (!throttle) {
2985 hwc->interrupts++;
2986 } else {
2987 if (hwc->interrupts != MAX_INTERRUPTS) {
2988 hwc->interrupts++;
2989 if (HZ * hwc->interrupts >
2990 (u64)sysctl_perf_counter_sample_rate) {
2991 hwc->interrupts = MAX_INTERRUPTS;
2992 perf_log_throttle(counter, 0);
2993 ret = 1;
2994 }
2995 } else {
2996 /*
2997 * Keep re-disabling counters even though on the previous
2998 * pass we disabled it - just in case we raced with a
2999 * sched-in and the counter got enabled again:
3000 */
3001 ret = 1;
3002 }
3003 }
3004
3005 if (counter->attr.freq) {
3006 u64 now = sched_clock();
3007 s64 delta = now - hwc->freq_stamp;
3008
3009 hwc->freq_stamp = now;
3010
3011 if (delta > 0 && delta < TICK_NSEC)
3012 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3013 }
3014
3015 /*
3016 * XXX event_limit might not quite work as expected on inherited
3017 * counters
3018 */
3019
3020 counter->pending_kill = POLL_IN;
3021 if (events && atomic_dec_and_test(&counter->event_limit)) {
3022 ret = 1;
3023 counter->pending_kill = POLL_HUP;
3024 if (nmi) {
3025 counter->pending_disable = 1;
3026 perf_pending_queue(&counter->pending,
3027 perf_pending_counter);
3028 } else
3029 perf_counter_disable(counter);
3030 }
3031
3032 perf_counter_output(counter, nmi, data);
3033 return ret;
3034}
3035
3036/*
3037 * Generic software counter infrastructure
3038 */
3039
3040static void perf_swcounter_update(struct perf_counter *counter)
3041{
3042 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now;
3044 s64 delta;
3045
3046again:
3047 prev = atomic64_read(&hwc->prev_count);
3048 now = atomic64_read(&hwc->count);
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3050 goto again;
3051
3052 delta = now - prev;
3053
3054 atomic64_add(delta, &counter->count);
3055 atomic64_sub(delta, &hwc->period_left);
3056}
3057
3058static void perf_swcounter_set_period(struct perf_counter *counter)
3059{
3060 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left);
3062 s64 period = hwc->sample_period;
3063
3064 if (unlikely(left <= -period)) {
3065 left = period;
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069
3070 if (unlikely(left <= 0)) {
3071 left += period;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075
3076 atomic64_set(&hwc->prev_count, -left);
3077 atomic64_set(&hwc->count, -left);
3078}
3079
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3081{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt
3094 * context, provide the next best thing, the user IP.
3095 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109}
3110
3111static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr)
3113{
3114 struct perf_sample_data data = {
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119
3120 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data))
3123 /* soft-disable the counter */
3124 ;
3125
3126}
3127
3128static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{
3130 struct perf_counter_context *ctx;
3131 unsigned long flags;
3132 int count;
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1;
3136
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0;
3139
3140 /*
3141 * If the counter is inactive, it could be just because
3142 * its task is scheduled out, or because it's in a group
3143 * which could not go on the PMU. We want to count in
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */
3154 ctx = counter->ctx;
3155 spin_lock_irqsave(&ctx->lock, flags);
3156 count = 1;
3157 /* Re-check state now we have the lock */
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3159 counter->ctx->is_active ||
3160 counter->tstamp_stopped < ctx->time)
3161 count = 0;
3162 spin_unlock_irqrestore(&ctx->lock, flags);
3163 return count;
3164}
3165
3166static int perf_swcounter_match(struct perf_counter *counter,
3167 enum perf_type_id type,
3168 u32 event, struct pt_regs *regs)
3169{
3170 if (!perf_swcounter_is_counting(counter))
3171 return 0;
3172
3173 if (counter->attr.type != type)
3174 return 0;
3175 if (counter->attr.config != event)
3176 return 0;
3177
3178 if (regs) {
3179 if (counter->attr.exclude_user && user_mode(regs))
3180 return 0;
3181
3182 if (counter->attr.exclude_kernel && !user_mode(regs))
3183 return 0;
3184 }
3185
3186 return 1;
3187}
3188
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event,
3200 u64 nr, int nmi, struct pt_regs *regs,
3201 u64 addr)
3202{
3203 struct perf_counter *counter;
3204
3205 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3206 return;
3207
3208 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr);
3212 }
3213 rcu_read_unlock();
3214}
3215
3216static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3217{
3218 if (in_nmi())
3219 return &cpuctx->recursion[3];
3220
3221 if (in_irq())
3222 return &cpuctx->recursion[2];
3223
3224 if (in_softirq())
3225 return &cpuctx->recursion[1];
3226
3227 return &cpuctx->recursion[0];
3228}
3229
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs,
3232 u64 addr)
3233{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx);
3236 struct perf_counter_context *ctx;
3237
3238 if (*recursion)
3239 goto out;
3240
3241 (*recursion)++;
3242 barrier();
3243
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr);
3246 rcu_read_lock();
3247 /*
3248 * doesn't really matter which of the child contexts the
3249 * events ends up in.
3250 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
3254 rcu_read_unlock();
3255
3256 barrier();
3257 (*recursion)--;
3258
3259out:
3260 put_cpu_var(perf_cpu_context);
3261}
3262
3263void
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3265{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
3267}
3268
/* pmu->read for generic software counters: fold in the pending count. */
static void perf_swcounter_read(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}
3273
/*
 * pmu->enable for generic software counters: arm the sample period.
 * Always returns 0.
 */
static int perf_swcounter_enable(struct perf_counter *counter)
{
	perf_swcounter_set_period(counter);

	return 0;
}
3279
/* pmu->disable for generic software counters: fold in the pending count. */
static void perf_swcounter_disable(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}
3284
3285static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read,
3289};
3290
3291/*
3292 * Software counter: cpu wall time clock
3293 */
3294
3295static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3296{
3297 int cpu = raw_smp_processor_id();
3298 s64 prev;
3299 u64 now;
3300
3301 now = cpu_clock(cpu);
3302 prev = atomic64_read(&counter->hw.prev_count);
3303 atomic64_set(&counter->hw.prev_count, now);
3304 atomic64_add(now - prev, &counter->count);
3305}
3306
3307static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3308{
3309 struct hw_perf_counter *hwc = &counter->hw;
3310 int cpu = raw_smp_processor_id();
3311
3312 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3313 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3314 hwc->hrtimer.function = perf_swcounter_hrtimer;
3315 if (hwc->sample_period) {
3316 u64 period = max_t(u64, 10000, hwc->sample_period);
3317 __hrtimer_start_range_ns(&hwc->hrtimer,
3318 ns_to_ktime(period), 0,
3319 HRTIMER_MODE_REL, 0);
3320 }
3321
3322 return 0;
3323}
3324
3325static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3326{
3327 if (counter->hw.sample_period)
3328 hrtimer_cancel(&counter->hw.hrtimer);
3329 cpu_clock_perf_counter_update(counter);
3330}
3331
/* pmu->read for the cpu-clock counter: account the time up to now. */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}
3336
3337static const struct pmu perf_ops_cpu_clock = {
3338 .enable = cpu_clock_perf_counter_enable,
3339 .disable = cpu_clock_perf_counter_disable,
3340 .read = cpu_clock_perf_counter_read,
3341};
3342
3343/*
3344 * Software counter: task time clock
3345 */
3346
3347static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3348{
3349 u64 prev;
3350 s64 delta;
3351
3352 prev = atomic64_xchg(&counter->hw.prev_count, now);
3353 delta = now - prev;
3354 atomic64_add(delta, &counter->count);
3355}
3356
3357static int task_clock_perf_counter_enable(struct perf_counter *counter)
3358{
3359 struct hw_perf_counter *hwc = &counter->hw;
3360 u64 now;
3361
3362 now = counter->ctx->time;
3363
3364 atomic64_set(&hwc->prev_count, now);
3365 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3366 hwc->hrtimer.function = perf_swcounter_hrtimer;
3367 if (hwc->sample_period) {
3368 u64 period = max_t(u64, 10000, hwc->sample_period);
3369 __hrtimer_start_range_ns(&hwc->hrtimer,
3370 ns_to_ktime(period), 0,
3371 HRTIMER_MODE_REL, 0);
3372 }
3373
3374 return 0;
3375}
3376
3377static void task_clock_perf_counter_disable(struct perf_counter *counter)
3378{
3379 if (counter->hw.sample_period)
3380 hrtimer_cancel(&counter->hw.hrtimer);
3381 task_clock_perf_counter_update(counter, counter->ctx->time);
3382
3383}
3384
3385static void task_clock_perf_counter_read(struct perf_counter *counter)
3386{
3387 u64 time;
3388
3389 if (!in_nmi()) {
3390 update_context_time(counter->ctx);
3391 time = counter->ctx->time;
3392 } else {
3393 u64 now = perf_clock();
3394 u64 delta = now - counter->ctx->timestamp;
3395 time = counter->ctx->time + delta;
3396 }
3397
3398 task_clock_perf_counter_update(counter, time);
3399}
3400
3401static const struct pmu perf_ops_task_clock = {
3402 .enable = task_clock_perf_counter_enable,
3403 .disable = task_clock_perf_counter_disable,
3404 .read = task_clock_perf_counter_read,
3405};
3406
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id)
3430{
3431 struct pt_regs *regs = get_irq_regs();
3432
3433 if (!regs)
3434 regs = task_pt_regs(current);
3435
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
3437}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439
3440extern int ftrace_profile_enable(int);
3441extern void ftrace_profile_disable(int);
3442
3443static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{
3445 ftrace_profile_disable(perf_event_id(&counter->attr));
3446}
3447
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{
3450 int event_id = perf_event_id(&counter->attr);
3451 int ret;
3452
3453 ret = ftrace_profile_enable(event_id);
3454 if (ret)
3455 return NULL;
3456
3457 counter->destroy = tp_perf_counter_destroy;
3458
3459 return &perf_ops_generic;
3460}
3461#else
3462static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3463{
3464 return NULL;
3465}
3466#endif
3467
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{
3470 const struct pmu *pmu = NULL;
3471
3472 /*
3473 * Software counters (currently) can't in general distinguish
3474 * between user, kernel and hypervisor events.
3475 * However, context switches and cpu migrations are considered
3476 * to be kernel events, and page faults are never hypervisor
3477 * events.
3478 */
3479 switch (counter->attr.config) {
3480 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock;
3482
3483 break;
3484 case PERF_COUNT_SW_TASK_CLOCK:
3485 /*
3486 * If the user instantiates this as a per-cpu counter,
3487 * use the cpu_clock counter instead.
3488 */
3489 if (counter->ctx->task)
3490 pmu = &perf_ops_task_clock;
3491 else
3492 pmu = &perf_ops_cpu_clock;
3493
3494 break;
3495 case PERF_COUNT_SW_PAGE_FAULTS:
3496 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS:
3500 pmu = &perf_ops_generic;
3501 break;
3502 }
3503
3504 return pmu;
3505}
3506
3507/*
3508 * Allocate and initialize a counter structure
3509 */
3510static struct perf_counter *
3511perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu,
3513 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader,
3515 gfp_t gfpflags)
3516{
3517 const struct pmu *pmu;
3518 struct perf_counter *counter;
3519 struct hw_perf_counter *hwc;
3520 long err;
3521
3522 counter = kzalloc(sizeof(*counter), gfpflags);
3523 if (!counter)
3524 return ERR_PTR(-ENOMEM);
3525
3526 /*
3527 * Single counters are their own group leaders, with an
3528 * empty sibling list:
3529 */
3530 if (!group_leader)
3531 group_leader = counter;
3532
3533 mutex_init(&counter->child_mutex);
3534 INIT_LIST_HEAD(&counter->child_list);
3535
3536 INIT_LIST_HEAD(&counter->list_entry);
3537 INIT_LIST_HEAD(&counter->event_entry);
3538 INIT_LIST_HEAD(&counter->sibling_list);
3539 init_waitqueue_head(&counter->waitq);
3540
3541 mutex_init(&counter->mmap_mutex);
3542
3543 counter->cpu = cpu;
3544 counter->attr = *attr;
3545 counter->group_leader = group_leader;
3546 counter->pmu = NULL;
3547 counter->ctx = ctx;
3548 counter->oncpu = -1;
3549
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id);
3552
3553 counter->state = PERF_COUNTER_STATE_INACTIVE;
3554
3555 if (attr->disabled)
3556 counter->state = PERF_COUNTER_STATE_OFF;
3557
3558 pmu = NULL;
3559
3560 hwc = &counter->hw;
3561 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1;
3564
3565 atomic64_set(&hwc->period_left, hwc->sample_period);
3566
3567 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3569 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3571 goto done;
3572
3573 if (attr->type == PERF_TYPE_RAW) {
3574 pmu = hw_perf_counter_init(counter);
3575 goto done;
3576 }
3577
3578 switch (attr->type) {
3579 case PERF_TYPE_HARDWARE:
3580 case PERF_TYPE_HW_CACHE:
3581 pmu = hw_perf_counter_init(counter);
3582 break;
3583
3584 case PERF_TYPE_SOFTWARE:
3585 pmu = sw_perf_counter_init(counter);
3586 break;
3587
3588 case PERF_TYPE_TRACEPOINT:
3589 pmu = tp_perf_counter_init(counter);
3590 break;
3591 }
3592done:
3593 err = 0;
3594 if (!pmu)
3595 err = -EINVAL;
3596 else if (IS_ERR(pmu))
3597 err = PTR_ERR(pmu);
3598
3599 if (err) {
3600 if (counter->ns)
3601 put_pid_ns(counter->ns);
3602 kfree(counter);
3603 return ERR_PTR(err);
3604 }
3605
3606 counter->pmu = pmu;
3607
3608 atomic_inc(&nr_counters);
3609 if (counter->attr.mmap)
3610 atomic_inc(&nr_mmap_counters);
3611 if (counter->attr.comm)
3612 atomic_inc(&nr_comm_counters);
3613
3614 return counter;
3615}
3616
3617/**
3618 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3619 *
3620 * @attr_uptr: event type attributes for monitoring/sampling
3621 * @pid: target pid
3622 * @cpu: target cpu
3623 * @group_fd: group leader counter fd
3624 */
3625SYSCALL_DEFINE5(perf_counter_open,
3626 const struct perf_counter_attr __user *, attr_uptr,
3627 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3628{
3629 struct perf_counter *counter, *group_leader;
3630 struct perf_counter_attr attr;
3631 struct perf_counter_context *ctx;
3632 struct file *counter_file = NULL;
3633 struct file *group_file = NULL;
3634 int fput_needed = 0;
3635 int fput_needed2 = 0;
3636 int ret;
3637
3638 /* for future expandability... */
3639 if (flags)
3640 return -EINVAL;
3641
3642 if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
3643 return -EFAULT;
3644
3645 if (!attr.exclude_kernel) {
3646 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
3647 return -EACCES;
3648 }
3649
3650 if (attr.freq) {
3651 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
3652 return -EINVAL;
3653 }
3654
3655 /*
3656 * Get the target context (task or percpu):
3657 */
3658 ctx = find_get_context(pid, cpu);
3659 if (IS_ERR(ctx))
3660 return PTR_ERR(ctx);
3661
3662 /*
3663 * Look up the group leader (we will attach this counter to it):
3664 */
3665 group_leader = NULL;
3666 if (group_fd != -1) {
3667 ret = -EINVAL;
3668 group_file = fget_light(group_fd, &fput_needed);
3669 if (!group_file)
3670 goto err_put_context;
3671 if (group_file->f_op != &perf_fops)
3672 goto err_put_context;
3673
3674 group_leader = group_file->private_data;
3675 /*
3676 * Do not allow a recursive hierarchy (this new sibling
3677 * becoming part of another group-sibling):
3678 */
3679 if (group_leader->group_leader != group_leader)
3680 goto err_put_context;
3681 /*
3682 * Do not allow to attach to a group in a different
3683 * task or CPU context:
3684 */
3685 if (group_leader->ctx != ctx)
3686 goto err_put_context;
3687 /*
3688 * Only a group leader can be exclusive or pinned
3689 */
3690 if (attr.exclusive || attr.pinned)
3691 goto err_put_context;
3692 }
3693
3694 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3695 GFP_KERNEL);
3696 ret = PTR_ERR(counter);
3697 if (IS_ERR(counter))
3698 goto err_put_context;
3699
3700 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3701 if (ret < 0)
3702 goto err_free_put_context;
3703
3704 counter_file = fget_light(ret, &fput_needed2);
3705 if (!counter_file)
3706 goto err_free_put_context;
3707
3708 counter->filp = counter_file;
3709 WARN_ON_ONCE(ctx->parent_ctx);
3710 mutex_lock(&ctx->mutex);
3711 perf_install_in_context(ctx, counter, cpu);
3712 ++ctx->generation;
3713 mutex_unlock(&ctx->mutex);
3714
3715 counter->owner = current;
3716 get_task_struct(current);
3717 mutex_lock(&current->perf_counter_mutex);
3718 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3719 mutex_unlock(&current->perf_counter_mutex);
3720
3721 fput_light(counter_file, fput_needed2);
3722
3723out_fput:
3724 fput_light(group_file, fput_needed);
3725
3726 return ret;
3727
3728err_free_put_context:
3729 kfree(counter);
3730
3731err_put_context:
3732 put_ctx(ctx);
3733
3734 goto out_fput;
3735}
3736
3737/*
3738 * inherit a counter from parent task to child task:
3739 */
3740static struct perf_counter *
3741inherit_counter(struct perf_counter *parent_counter,
3742 struct task_struct *parent,
3743 struct perf_counter_context *parent_ctx,
3744 struct task_struct *child,
3745 struct perf_counter *group_leader,
3746 struct perf_counter_context *child_ctx)
3747{
3748 struct perf_counter *child_counter;
3749
3750 /*
3751 * Instead of creating recursive hierarchies of counters,
3752 * we link inherited counters back to the original parent,
3753 * which has a filp for sure, which we use as the reference
3754 * count:
3755 */
3756 if (parent_counter->parent)
3757 parent_counter = parent_counter->parent;
3758
3759 child_counter = perf_counter_alloc(&parent_counter->attr,
3760 parent_counter->cpu, child_ctx,
3761 group_leader, GFP_KERNEL);
3762 if (IS_ERR(child_counter))
3763 return child_counter;
3764 get_ctx(child_ctx);
3765
3766 /*
3767 * Make the child state follow the state of the parent counter,
3768 * not its attr.disabled bit. We hold the parent's mutex,
3769 * so we won't race with perf_counter_{en, dis}able_family.
3770 */
3771 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3772 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3773 else
3774 child_counter->state = PERF_COUNTER_STATE_OFF;
3775
3776 if (parent_counter->attr.freq)
3777 child_counter->hw.sample_period = parent_counter->hw.sample_period;
3778
3779 /*
3780 * Link it up in the child's context:
3781 */
3782 add_counter_to_ctx(child_counter, child_ctx);
3783
3784 child_counter->parent = parent_counter;
3785 /*
3786 * inherit into child's child as well:
3787 */
3788 child_counter->attr.inherit = 1;
3789
3790 /*
3791 * Get a reference to the parent filp - we will fput it
3792 * when the child counter exits. This is safe to do because
3793 * we are in the parent and we know that the filp still
3794 * exists and has a nonzero count:
3795 */
3796 atomic_long_inc(&parent_counter->filp->f_count);
3797
3798 /*
3799 * Link this into the parent counter's child list
3800 */
3801 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3802 mutex_lock(&parent_counter->child_mutex);
3803 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3804 mutex_unlock(&parent_counter->child_mutex);
3805
3806 return child_counter;
3807}
3808
3809static int inherit_group(struct perf_counter *parent_counter,
3810 struct task_struct *parent,
3811 struct perf_counter_context *parent_ctx,
3812 struct task_struct *child,
3813 struct perf_counter_context *child_ctx)
3814{
3815 struct perf_counter *leader;
3816 struct perf_counter *sub;
3817 struct perf_counter *child_ctr;
3818
3819 leader = inherit_counter(parent_counter, parent, parent_ctx,
3820 child, NULL, child_ctx);
3821 if (IS_ERR(leader))
3822 return PTR_ERR(leader);
3823 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3824 child_ctr = inherit_counter(sub, parent, parent_ctx,
3825 child, leader, child_ctx);
3826 if (IS_ERR(child_ctr))
3827 return PTR_ERR(child_ctr);
3828 }
3829 return 0;
3830}
3831
3832static void sync_child_counter(struct perf_counter *child_counter,
3833 struct perf_counter *parent_counter)
3834{
3835 u64 child_val;
3836
3837 child_val = atomic64_read(&child_counter->count);
3838
3839 /*
3840 * Add back the child's count to the parent's count:
3841 */
3842 atomic64_add(child_val, &parent_counter->count);
3843 atomic64_add(child_counter->total_time_enabled,
3844 &parent_counter->child_total_time_enabled);
3845 atomic64_add(child_counter->total_time_running,
3846 &parent_counter->child_total_time_running);
3847
3848 /*
3849 * Remove this counter from the parent's list
3850 */
3851 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3852 mutex_lock(&parent_counter->child_mutex);
3853 list_del_init(&child_counter->child_list);
3854 mutex_unlock(&parent_counter->child_mutex);
3855
3856 /*
3857 * Release the parent counter, if this was the last
3858 * reference to it.
3859 */
3860 fput(parent_counter->filp);
3861}
3862
3863static void
3864__perf_counter_exit_task(struct perf_counter *child_counter,
3865 struct perf_counter_context *child_ctx)
3866{
3867 struct perf_counter *parent_counter;
3868
3869 update_counter_times(child_counter);
3870 perf_counter_remove_from_context(child_counter);
3871
3872 parent_counter = child_counter->parent;
3873 /*
3874 * It can happen that parent exits first, and has counters
3875 * that are still around due to the child reference. These
3876 * counters need to be zapped - but otherwise linger.
3877 */
3878 if (parent_counter) {
3879 sync_child_counter(child_counter, parent_counter);
3880 free_counter(child_counter);
3881 }
3882}
3883
3884/*
3885 * When a child task exits, feed back counter values to parent counters.
3886 */
3887void perf_counter_exit_task(struct task_struct *child)
3888{
3889 struct perf_counter *child_counter, *tmp;
3890 struct perf_counter_context *child_ctx;
3891 unsigned long flags;
3892
3893 if (likely(!child->perf_counter_ctxp))
3894 return;
3895
3896 local_irq_save(flags);
3897 /*
3898 * We can't reschedule here because interrupts are disabled,
3899 * and either child is current or it is a task that can't be
3900 * scheduled, so we are now safe from rescheduling changing
3901 * our context.
3902 */
3903 child_ctx = child->perf_counter_ctxp;
3904 __perf_counter_task_sched_out(child_ctx);
3905
3906 /*
3907 * Take the context lock here so that if find_get_context is
3908 * reading child->perf_counter_ctxp, we wait until it has
3909 * incremented the context's refcount before we do put_ctx below.
3910 */
3911 spin_lock(&child_ctx->lock);
3912 child->perf_counter_ctxp = NULL;
3913 if (child_ctx->parent_ctx) {
3914 /*
3915 * This context is a clone; unclone it so it can't get
3916 * swapped to another process while we're removing all
3917 * the counters from it.
3918 */
3919 put_ctx(child_ctx->parent_ctx);
3920 child_ctx->parent_ctx = NULL;
3921 }
3922 spin_unlock(&child_ctx->lock);
3923 local_irq_restore(flags);
3924
3925 /*
3926 * We can recurse on the same lock type through:
3927 *
3928 * __perf_counter_exit_task()
3929 * sync_child_counter()
3930 * fput(parent_counter->filp)
3931 * perf_release()
3932 * mutex_lock(&ctx->mutex)
3933 *
3934 * But since its the parent context it won't be the same instance.
3935 */
3936 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
3937
3938again:
3939 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3940 list_entry)
3941 __perf_counter_exit_task(child_counter, child_ctx);
3942
3943 /*
3944 * If the last counter was a group counter, it will have appended all
3945 * its siblings to the list, but we obtained 'tmp' before that which
3946 * will still point to the list head terminating the iteration.
3947 */
3948 if (!list_empty(&child_ctx->counter_list))
3949 goto again;
3950
3951 mutex_unlock(&child_ctx->mutex);
3952
3953 put_ctx(child_ctx);
3954}
3955
3956/*
3957 * free an unexposed, unused context as created by inheritance by
3958 * init_task below, used by fork() in case of fail.
3959 */
3960void perf_counter_free_task(struct task_struct *task)
3961{
3962 struct perf_counter_context *ctx = task->perf_counter_ctxp;
3963 struct perf_counter *counter, *tmp;
3964
3965 if (!ctx)
3966 return;
3967
3968 mutex_lock(&ctx->mutex);
3969again:
3970 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
3971 struct perf_counter *parent = counter->parent;
3972
3973 if (WARN_ON_ONCE(!parent))
3974 continue;
3975
3976 mutex_lock(&parent->child_mutex);
3977 list_del_init(&counter->child_list);
3978 mutex_unlock(&parent->child_mutex);
3979
3980 fput(parent->filp);
3981
3982 list_del_counter(counter, ctx);
3983 free_counter(counter);
3984 }
3985
3986 if (!list_empty(&ctx->counter_list))
3987 goto again;
3988
3989 mutex_unlock(&ctx->mutex);
3990
3991 put_ctx(ctx);
3992}
3993
3994/*
3995 * Initialize the perf_counter context in task_struct
3996 */
3997int perf_counter_init_task(struct task_struct *child)
3998{
3999 struct perf_counter_context *child_ctx, *parent_ctx;
4000 struct perf_counter_context *cloned_ctx;
4001 struct perf_counter *counter;
4002 struct task_struct *parent = current;
4003 int inherited_all = 1;
4004 int ret = 0;
4005
4006 child->perf_counter_ctxp = NULL;
4007
4008 mutex_init(&child->perf_counter_mutex);
4009 INIT_LIST_HEAD(&child->perf_counter_list);
4010
4011 if (likely(!parent->perf_counter_ctxp))
4012 return 0;
4013
4014 /*
4015 * This is executed from the parent task context, so inherit
4016 * counters that have been marked for cloning.
4017 * First allocate and initialize a context for the child.
4018 */
4019
4020 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4021 if (!child_ctx)
4022 return -ENOMEM;
4023
4024 __perf_counter_init_context(child_ctx, child);
4025 child->perf_counter_ctxp = child_ctx;
4026 get_task_struct(child);
4027
4028 /*
4029 * If the parent's context is a clone, pin it so it won't get
4030 * swapped under us.
4031 */
4032 parent_ctx = perf_pin_task_context(parent);
4033
4034 /*
4035 * No need to check if parent_ctx != NULL here; since we saw
4036 * it non-NULL earlier, the only reason for it to become NULL
4037 * is if we exit, and since we're currently in the middle of
4038 * a fork we can't be exiting at the same time.
4039 */
4040
4041 /*
4042 * Lock the parent list. No need to lock the child - not PID
4043 * hashed yet and not running, so nobody can access it.
4044 */
4045 mutex_lock(&parent_ctx->mutex);
4046
4047 /*
4048 * We dont have to disable NMIs - we are only looking at
4049 * the list, not manipulating it:
4050 */
4051 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4052 if (counter != counter->group_leader)
4053 continue;
4054
4055 if (!counter->attr.inherit) {
4056 inherited_all = 0;
4057 continue;
4058 }
4059
4060 ret = inherit_group(counter, parent, parent_ctx,
4061 child, child_ctx);
4062 if (ret) {
4063 inherited_all = 0;
4064 break;
4065 }
4066 }
4067
4068 if (inherited_all) {
4069 /*
4070 * Mark the child context as a clone of the parent
4071 * context, or of whatever the parent is a clone of.
4072 * Note that if the parent is a clone, it could get
4073 * uncloned at any point, but that doesn't matter
4074 * because the list of counters and the generation
4075 * count can't have changed since we took the mutex.
4076 */
4077 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4078 if (cloned_ctx) {
4079 child_ctx->parent_ctx = cloned_ctx;
4080 child_ctx->parent_gen = parent_ctx->parent_gen;
4081 } else {
4082 child_ctx->parent_ctx = parent_ctx;
4083 child_ctx->parent_gen = parent_ctx->generation;
4084 }
4085 get_ctx(child_ctx->parent_ctx);
4086 }
4087
4088 mutex_unlock(&parent_ctx->mutex);
4089
4090 perf_unpin_context(parent_ctx);
4091
4092 return ret;
4093}
4094
4095static void __cpuinit perf_counter_init_cpu(int cpu)
4096{
4097 struct perf_cpu_context *cpuctx;
4098
4099 cpuctx = &per_cpu(perf_cpu_context, cpu);
4100 __perf_counter_init_context(&cpuctx->ctx, NULL);
4101
4102 spin_lock(&perf_resource_lock);
4103 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4104 spin_unlock(&perf_resource_lock);
4105
4106 hw_perf_counter_setup(cpu);
4107}
4108
4109#ifdef CONFIG_HOTPLUG_CPU
4110static void __perf_counter_exit_cpu(void *info)
4111{
4112 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4113 struct perf_counter_context *ctx = &cpuctx->ctx;
4114 struct perf_counter *counter, *tmp;
4115
4116 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4117 __perf_counter_remove_from_context(counter);
4118}
4119static void perf_counter_exit_cpu(int cpu)
4120{
4121 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4122 struct perf_counter_context *ctx = &cpuctx->ctx;
4123
4124 mutex_lock(&ctx->mutex);
4125 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4126 mutex_unlock(&ctx->mutex);
4127}
4128#else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
4130#endif
4131
4132static int __cpuinit
4133perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4134{
4135 unsigned int cpu = (long)hcpu;
4136
4137 switch (action) {
4138
4139 case CPU_UP_PREPARE:
4140 case CPU_UP_PREPARE_FROZEN:
4141 perf_counter_init_cpu(cpu);
4142 break;
4143
4144 case CPU_DOWN_PREPARE:
4145 case CPU_DOWN_PREPARE_FROZEN:
4146 perf_counter_exit_cpu(cpu);
4147 break;
4148
4149 default:
4150 break;
4151 }
4152
4153 return NOTIFY_OK;
4154}
4155
4156/*
4157 * This has to have a higher priority than migration_notifier in sched.c.
4158 */
4159static struct notifier_block __cpuinitdata perf_cpu_nb = {
4160 .notifier_call = perf_cpu_notify,
4161 .priority = 20,
4162};
4163
4164void __init perf_counter_init(void)
4165{
4166 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4167 (void *)(long)smp_processor_id());
4168 register_cpu_notifier(&perf_cpu_nb);
4169}
4170
4171static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4172{
4173 return sprintf(buf, "%d\n", perf_reserved_percpu);
4174}
4175
4176static ssize_t
4177perf_set_reserve_percpu(struct sysdev_class *class,
4178 const char *buf,
4179 size_t count)
4180{
4181 struct perf_cpu_context *cpuctx;
4182 unsigned long val;
4183 int err, cpu, mpt;
4184
4185 err = strict_strtoul(buf, 10, &val);
4186 if (err)
4187 return err;
4188 if (val > perf_max_counters)
4189 return -EINVAL;
4190
4191 spin_lock(&perf_resource_lock);
4192 perf_reserved_percpu = val;
4193 for_each_online_cpu(cpu) {
4194 cpuctx = &per_cpu(perf_cpu_context, cpu);
4195 spin_lock_irq(&cpuctx->ctx.lock);
4196 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4197 perf_max_counters - perf_reserved_percpu);
4198 cpuctx->max_pertask = mpt;
4199 spin_unlock_irq(&cpuctx->ctx.lock);
4200 }
4201 spin_unlock(&perf_resource_lock);
4202
4203 return count;
4204}
4205
4206static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4207{
4208 return sprintf(buf, "%d\n", perf_overcommit);
4209}
4210
4211static ssize_t
4212perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4213{
4214 unsigned long val;
4215 int err;
4216
4217 err = strict_strtoul(buf, 10, &val);
4218 if (err)
4219 return err;
4220 if (val > 1)
4221 return -EINVAL;
4222
4223 spin_lock(&perf_resource_lock);
4224 perf_overcommit = val;
4225 spin_unlock(&perf_resource_lock);
4226
4227 return count;
4228}
4229
4230static SYSDEV_CLASS_ATTR(
4231 reserve_percpu,
4232 0644,
4233 perf_show_reserve_percpu,
4234 perf_set_reserve_percpu
4235 );
4236
4237static SYSDEV_CLASS_ATTR(
4238 overcommit,
4239 0644,
4240 perf_show_overcommit,
4241 perf_set_overcommit
4242 );
4243
4244static struct attribute *perfclass_attrs[] = {
4245 &attr_reserve_percpu.attr,
4246 &attr_overcommit.attr,
4247 NULL
4248};
4249
4250static struct attribute_group perfclass_attr_group = {
4251 .attrs = perfclass_attrs,
4252 .name = "perf_counters",
4253};
4254
4255static int __init perf_counter_sysfs_init(void)
4256{
4257 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4258 &perfclass_attr_group);
4259}
4260device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..28cf26ad2d24 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
diff --git a/kernel/sched.c b/kernel/sched.c
index 14c447ae5d53..f04aa9664504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,7 +69,6 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
@@ -580,6 +580,7 @@ struct rq {
580 struct load_weight load; 580 struct load_weight load;
581 unsigned long nr_load_updates; 581 unsigned long nr_load_updates;
582 u64 nr_switches; 582 u64 nr_switches;
583 u64 nr_migrations_in;
583 584
584 struct cfs_rq cfs; 585 struct cfs_rq cfs;
585 struct rt_rq rt; 586 struct rt_rq rt;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 695
695static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
696{ 697{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 699}
@@ -1969,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1969 p->se.sleep_start -= clock_offset; 1970 p->se.sleep_start -= clock_offset;
1970 if (p->se.block_start) 1971 if (p->se.block_start)
1971 p->se.block_start -= clock_offset; 1972 p->se.block_start -= clock_offset;
1973#endif
1972 if (old_cpu != new_cpu) { 1974 if (old_cpu != new_cpu) {
1973 schedstat_inc(p, se.nr_migrations); 1975 p->se.nr_migrations++;
1976 new_rq->nr_migrations_in++;
1977#ifdef CONFIG_SCHEDSTATS
1974 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1975 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1976 }
1977#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu);
1982 }
1978 p->se.vruntime -= old_cfsrq->min_vruntime - 1983 p->se.vruntime -= old_cfsrq->min_vruntime -
1979 new_cfsrq->min_vruntime; 1984 new_cfsrq->min_vruntime;
1980 1985
@@ -2369,6 +2374,27 @@ static int sched_balance_self(int cpu, int flag)
2369 2374
2370#endif /* CONFIG_SMP */ 2375#endif /* CONFIG_SMP */
2371 2376
/**
 * task_oncpu_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Invokes @func only if @p is running at the time of the check; nothing
 * happens otherwise.  The task may be running on the current CPU, in
 * which case the function is simply called directly.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int target_cpu;

	/* Preemption stays off across the lookup and the cross-call. */
	preempt_disable();
	target_cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(target_cpu, func, info, 1);
	preempt_enable();
}
2397
2372/*** 2398/***
2373 * try_to_wake_up - wake up a thread 2399 * try_to_wake_up - wake up a thread
2374 * @p: the to-be-woken-up thread 2400 * @p: the to-be-woken-up thread
@@ -2536,6 +2562,7 @@ static void __sched_fork(struct task_struct *p)
2536 p->se.exec_start = 0; 2562 p->se.exec_start = 0;
2537 p->se.sum_exec_runtime = 0; 2563 p->se.sum_exec_runtime = 0;
2538 p->se.prev_sum_exec_runtime = 0; 2564 p->se.prev_sum_exec_runtime = 0;
2565 p->se.nr_migrations = 0;
2539 p->se.last_wakeup = 0; 2566 p->se.last_wakeup = 0;
2540 p->se.avg_overlap = 0; 2567 p->se.avg_overlap = 0;
2541 p->se.start_runtime = 0; 2568 p->se.start_runtime = 0;
@@ -2766,6 +2793,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2766 */ 2793 */
2767 prev_state = prev->state; 2794 prev_state = prev->state;
2768 finish_arch_switch(prev); 2795 finish_arch_switch(prev);
2796 perf_counter_task_sched_in(current, cpu_of(rq));
2769 finish_lock_switch(rq, prev); 2797 finish_lock_switch(rq, prev);
2770#ifdef CONFIG_SMP 2798#ifdef CONFIG_SMP
2771 if (post_schedule) 2799 if (post_schedule)
@@ -2981,6 +3009,15 @@ static void calc_load_account_active(struct rq *this_rq)
2981} 3009}
2982 3010
2983/* 3011/*
3012 * Externally visible per-cpu scheduler statistics:
3013 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3014 */
3015u64 cpu_nr_migrations(int cpu)
3016{
3017 return cpu_rq(cpu)->nr_migrations_in;
3018}
3019
3020/*
2984 * Update rq->cpu_load[] statistics. This function is usually called every 3021 * Update rq->cpu_load[] statistics. This function is usually called every
2985 * scheduler tick (TICK_NSEC). 3022 * scheduler tick (TICK_NSEC).
2986 */ 3023 */
@@ -5078,6 +5115,8 @@ void scheduler_tick(void)
5078 curr->sched_class->task_tick(rq, curr, 0); 5115 curr->sched_class->task_tick(rq, curr, 0);
5079 spin_unlock(&rq->lock); 5116 spin_unlock(&rq->lock);
5080 5117
5118 perf_counter_task_tick(curr, cpu);
5119
5081#ifdef CONFIG_SMP 5120#ifdef CONFIG_SMP
5082 rq->idle_at_tick = idle_cpu(cpu); 5121 rq->idle_at_tick = idle_cpu(cpu);
5083 trigger_load_balance(rq, cpu); 5122 trigger_load_balance(rq, cpu);
@@ -5293,6 +5332,7 @@ need_resched_nonpreemptible:
5293 5332
5294 if (likely(prev != next)) { 5333 if (likely(prev != next)) {
5295 sched_info_switch(prev, next); 5334 sched_info_switch(prev, next);
5335 perf_counter_task_sched_out(prev, next, cpu);
5296 5336
5297 rq->nr_switches++; 5337 rq->nr_switches++;
5298 rq->curr = next; 5338 rq->curr = next;
@@ -7536,8 +7576,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7536 return NOTIFY_OK; 7576 return NOTIFY_OK;
7537} 7577}
7538 7578
7539/* Register at highest priority so that task migration (migrate_all_tasks) 7579/*
7540 * happens before everything else. 7580 * Register at high priority so that task migration (migrate_all_tasks)
7581 * happens before everything else. This has to be lower priority than
7582 * the notifier in the perf_counter subsystem, though.
7541 */ 7583 */
7542static struct notifier_block __cpuinitdata migration_notifier = { 7584static struct notifier_block __cpuinitdata migration_notifier = {
7543 .notifier_call = migration_call, 7585 .notifier_call = migration_call,
@@ -7782,24 +7824,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7782 7824
7783static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7825static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
7784{ 7826{
7827 gfp_t gfp = GFP_KERNEL;
7828
7785 memset(rd, 0, sizeof(*rd)); 7829 memset(rd, 0, sizeof(*rd));
7786 7830
7787 if (bootmem) { 7831 if (bootmem)
7788 alloc_bootmem_cpumask_var(&def_root_domain.span); 7832 gfp = GFP_NOWAIT;
7789 alloc_bootmem_cpumask_var(&def_root_domain.online);
7790 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7791 cpupri_init(&rd->cpupri, true);
7792 return 0;
7793 }
7794 7833
7795 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7834 if (!alloc_cpumask_var(&rd->span, gfp))
7796 goto out; 7835 goto out;
7797 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7836 if (!alloc_cpumask_var(&rd->online, gfp))
7798 goto free_span; 7837 goto free_span;
7799 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7838 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7800 goto free_online; 7839 goto free_online;
7801 7840
7802 if (cpupri_init(&rd->cpupri, false) != 0) 7841 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7803 goto free_rto_mask; 7842 goto free_rto_mask;
7804 return 0; 7843 return 0;
7805 7844
@@ -9123,7 +9162,7 @@ void __init sched_init(void)
9123 * we use alloc_bootmem(). 9162 * we use alloc_bootmem().
9124 */ 9163 */
9125 if (alloc_size) { 9164 if (alloc_size) {
9126 ptr = (unsigned long)alloc_bootmem(alloc_size); 9165 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9127 9166
9128#ifdef CONFIG_FAIR_GROUP_SCHED 9167#ifdef CONFIG_FAIR_GROUP_SCHED
9129 init_task_group.se = (struct sched_entity **)ptr; 9168 init_task_group.se = (struct sched_entity **)ptr;
@@ -9218,7 +9257,7 @@ void __init sched_init(void)
9218 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9257 * 1024) and two child groups A0 and A1 (of weight 1024 each),
9219 * then A0's share of the cpu resource is: 9258 * then A0's share of the cpu resource is:
9220 * 9259 *
9221 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9260 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9222 * 9261 *
9223 * We achieve this by letting init_task_group's tasks sit 9262 * We achieve this by letting init_task_group's tasks sit
9224 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9263 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9314,15 +9353,17 @@ void __init sched_init(void)
9314 current->sched_class = &fair_sched_class; 9353 current->sched_class = &fair_sched_class;
9315 9354
9316 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9355 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9317 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9356 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9318#ifdef CONFIG_SMP 9357#ifdef CONFIG_SMP
9319#ifdef CONFIG_NO_HZ 9358#ifdef CONFIG_NO_HZ
9320 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9359 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9321 alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); 9360 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9322#endif 9361#endif
9323 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9362 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9324#endif /* SMP */ 9363#endif /* SMP */
9325 9364
9365 perf_counter_init();
9366
9326 scheduler_running = 1; 9367 scheduler_running = 1;
9327} 9368}
9328 9369
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3ed..7deffc9f0e5f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL;
157 int i; 158 int i;
158 159
160 if (bootmem)
161 gfp = GFP_NOWAIT;
162
159 memset(cp, 0, sizeof(*cp)); 163 memset(cp, 0, sizeof(*cp));
160 164
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 165 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 167
164 spin_lock_init(&vec->lock); 168 spin_lock_init(&vec->lock);
165 vec->count = 0; 169 vec->count = 0;
166 if (bootmem) 170 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 171 goto cleanup;
170 } 172 }
171 173
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..521ed2004d63 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -372,8 +372,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 373 vsmax /= 100;
374 374
375 prepare_to_wait(&slow_work_thread_wq, &wait, 375 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 377 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 379 !slow_work_available(vsmax) &&
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..438d99a38c87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1794 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1795 error = SET_TSC_CTL(arg2);
1795 break; 1796 break;
1797 case PR_TASK_PERF_COUNTERS_DISABLE:
1798 error = perf_counter_task_disable();
1799 break;
1800 case PR_TASK_PERF_COUNTERS_ENABLE:
1801 error = perf_counter_task_enable();
1802 break;
1796 case PR_GET_TIMERSLACK: 1803 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1804 error = current->timer_slack_ns;
1798 break; 1805 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 944ba03cae19..ce664f98e3fb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_counter.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/processor.h> 55#include <asm/processor.h>
@@ -932,6 +933,32 @@ static struct ctl_table kern_table[] = {
932 .child = slow_work_sysctls, 933 .child = slow_work_sysctls,
933 }, 934 },
934#endif 935#endif
936#ifdef CONFIG_PERF_COUNTERS
937 {
938 .ctl_name = CTL_UNNUMBERED,
939 .procname = "perf_counter_paranoid",
940 .data = &sysctl_perf_counter_paranoid,
941 .maxlen = sizeof(sysctl_perf_counter_paranoid),
942 .mode = 0644,
943 .proc_handler = &proc_dointvec,
944 },
945 {
946 .ctl_name = CTL_UNNUMBERED,
947 .procname = "perf_counter_mlock_kb",
948 .data = &sysctl_perf_counter_mlock,
949 .maxlen = sizeof(sysctl_perf_counter_mlock),
950 .mode = 0644,
951 .proc_handler = &proc_dointvec,
952 },
953 {
954 .ctl_name = CTL_UNNUMBERED,
955 .procname = "perf_counter_max_sample_rate",
956 .data = &sysctl_perf_counter_sample_rate,
957 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
958 .mode = 0644,
959 .proc_handler = &proc_dointvec,
960 },
961#endif
935/* 962/*
936 * NOTE: do not add new entries to this table unless you have read 963 * NOTE: do not add new entries to this table unless you have read
937 * Documentation/sysctl/ctl_unnumbered.txt 964 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/timer.c b/kernel/timer.c
index a26ed294f938..c01e568935ea 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -1129,6 +1130,8 @@ static void run_timer_softirq(struct softirq_action *h)
1129{ 1130{
1130 struct tvec_base *base = __get_cpu_var(tvec_bases); 1131 struct tvec_base *base = __get_cpu_var(tvec_bases);
1131 1132
1133 perf_counter_do_pending();
1134
1132 hrtimer_run_pending(); 1135 hrtimer_run_pending();
1133 1136
1134 if (time_after_eq(jiffies, base->timer_jiffies)) 1137 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6cdcf38f2da9..116a35051be6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -336,6 +336,38 @@ config SLUB_STATS
336 out which slabs are relevant to a particular load. 336 out which slabs are relevant to a particular load.
337 Try running: slabinfo -DA 337 Try running: slabinfo -DA
338 338
339config DEBUG_KMEMLEAK
340 bool "Kernel memory leak detector"
341 depends on DEBUG_KERNEL && EXPERIMENTAL && (X86 || ARM) && \
342 !MEMORY_HOTPLUG
343 select DEBUG_SLAB if SLAB
344 select SLUB_DEBUG if SLUB
345 select DEBUG_FS if SYSFS
346 select STACKTRACE if STACKTRACE_SUPPORT
347 select KALLSYMS
348 help
349 Say Y here if you want to enable the memory leak
350 detector. The memory allocation/freeing is traced in a way
351 similar to the Boehm's conservative garbage collector, the
352 difference being that the orphan objects are not freed but
353 only shown in /sys/kernel/debug/kmemleak. Enabling this
354 feature will introduce an overhead to memory
355 allocations. See Documentation/kmemleak.txt for more
356 details.
357
358 In order to access the kmemleak file, debugfs needs to be
359 mounted (usually at /sys/kernel/debug).
360
361config DEBUG_KMEMLEAK_TEST
362 tristate "Simple test for the kernel memory leak detector"
363 depends on DEBUG_KMEMLEAK
364 help
365 Say Y or M here to build a test for the kernel memory leak
366 detector. This option enables a module that explicitly leaks
367 memory.
368
369 If unsure, say N.
370
339config DEBUG_PREEMPT 371config DEBUG_PREEMPT
340 bool "Debug preemptible kernel" 372 bool "Debug preemptible kernel"
341 depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64) 373 depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
diff --git a/lib/cpumask.c b/lib/cpumask.c
index eb23aaa0c7b8..7bb4142a502f 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
92 */ 92 */
93bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) 93bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
94{ 94{
95 if (likely(slab_is_available())) 95 *mask = kmalloc_node(cpumask_size(), flags, node);
96 *mask = kmalloc_node(cpumask_size(), flags, node); 96
97 else {
98#ifdef CONFIG_DEBUG_PER_CPU_MAPS
99 printk(KERN_ERR
100 "=> alloc_cpumask_var: kmalloc not available!\n");
101#endif
102 *mask = NULL;
103 }
104#ifdef CONFIG_DEBUG_PER_CPU_MAPS 97#ifdef CONFIG_DEBUG_PER_CPU_MAPS
105 if (!*mask) { 98 if (!*mask) {
106 printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); 99 printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..e89acb090b4d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,3 +38,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
38endif 38endif
39obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
41obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
42obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..282df0a09e6f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align, 532 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit) 533 unsigned long goal, unsigned long limit)
534{ 534{
535 if (WARN_ON_ONCE(slab_is_available()))
536 return kzalloc(size, GFP_NOWAIT);
537
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM 538#ifdef CONFIG_HAVE_ARCH_BOOTMEM
536 bootmem_data_t *p_bdata; 539 bootmem_data_t *p_bdata;
537 540
@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
662void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 665void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
663 unsigned long align, unsigned long goal) 666 unsigned long align, unsigned long goal)
664{ 667{
668 if (WARN_ON_ONCE(slab_is_available()))
669 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
670
665 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 671 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
666} 672}
667 673
@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
693{ 699{
694 void *ptr; 700 void *ptr;
695 701
702 if (WARN_ON_ONCE(slab_is_available()))
703 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
704
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 705 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr) 706 if (ptr)
698 return ptr; 707 return ptr;
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
745void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 754void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
746 unsigned long align, unsigned long goal) 755 unsigned long align, unsigned long goal)
747{ 756{
757 if (WARN_ON_ONCE(slab_is_available()))
758 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
759
748 return ___alloc_bootmem_node(pgdat->bdata, size, align, 760 return ___alloc_bootmem_node(pgdat->bdata, size, align,
749 goal, ARCH_LOW_ADDRESS_LIMIT); 761 goal, ARCH_LOW_ADDRESS_LIMIT);
750} 762}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 000000000000..d5292fc6f523
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
1/*
2 * mm/kmemleak-test.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/init.h>
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/vmalloc.h>
26#include <linux/list.h>
27#include <linux/percpu.h>
28#include <linux/fdtable.h>
29
30#include <linux/kmemleak.h>
31
32struct test_node {
33 long header[25];
34 struct list_head list;
35 long footer[25];
36};
37
38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer);
40
41/*
42 * Some very simple testing. This function needs to be extended for
43 * proper testing.
44 */
45static int __init kmemleak_test_init(void)
46{
47 struct test_node *elem;
48 int i;
49
50 printk(KERN_INFO "Kmemleak testing\n");
51
52 /* make some orphan objects */
53 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
54 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
55 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
56 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
57 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
58 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
59 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
60 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
61#ifndef CONFIG_MODULES
62 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
63 kmem_cache_alloc(files_cachep, GFP_KERNEL));
64 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
65 kmem_cache_alloc(files_cachep, GFP_KERNEL));
66#endif
67 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
68 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
69 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
70 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
71 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
72
73 /*
74 * Add elements to a list. They should only appear as orphan
75 * after the module is removed.
76 */
77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem)
81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list);
86 }
87
88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i));
92 }
93
94 return 0;
95}
96module_init(kmemleak_test_init);
97
98static void __exit kmemleak_test_exit(void)
99{
100 struct test_node *elem, *tmp;
101
102 /*
103 * Remove the list elements without actually freeing the
104 * memory.
105 */
106 list_for_each_entry_safe(elem, tmp, &test_list, list)
107 list_del(&elem->list);
108}
109module_exit(kmemleak_test_exit);
110
111MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000000..58ec86c9e58a
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1498 @@
1/*
2 * mm/kmemleak.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 *
21 * For more information on the algorithm and kmemleak usage, please see
22 * Documentation/kmemleak.txt.
23 *
24 * Notes on locking
25 * ----------------
26 *
27 * The following locks and mutexes are used by kmemleak:
28 *
29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the
36 * kmemleak_alloc() callback and removed in delete_object() called from the
37 * kmemleak_free() callback
38 * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
39 * the metadata (e.g. count) are protected by this lock. Note that some
40 * members of this structure may be protected by other means (atomic or
41 * kmemleak_lock). This lock is also held when scanning the corresponding
42 * memory block to avoid the kernel freeing it via the kmemleak_free()
43 * callback. This is less heavyweight than holding a global lock like
44 * kmemleak_lock during scanning
45 * - scan_mutex (mutex): ensures that only one thread may scan the memory for
46 * unreferenced objects at a time. The gray_list contains the objects which
47 * are already referenced or marked as false positives and need to be
48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed
52 * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs
53 * file together with modifications to the memory scanning parameters
54 * including the scan_thread pointer
55 *
56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the
59 * kmemleak_object freeing via an RCU callback. All calls to the get_object()
60 * function must be protected by rcu_read_lock() to avoid accessing a freed
61 * structure.
62 */
63
64#include <linux/init.h>
65#include <linux/kernel.h>
66#include <linux/list.h>
67#include <linux/sched.h>
68#include <linux/jiffies.h>
69#include <linux/delay.h>
70#include <linux/module.h>
71#include <linux/kthread.h>
72#include <linux/prio_tree.h>
73#include <linux/gfp.h>
74#include <linux/fs.h>
75#include <linux/debugfs.h>
76#include <linux/seq_file.h>
77#include <linux/cpumask.h>
78#include <linux/spinlock.h>
79#include <linux/mutex.h>
80#include <linux/rcupdate.h>
81#include <linux/stacktrace.h>
82#include <linux/cache.h>
83#include <linux/percpu.h>
84#include <linux/hardirq.h>
85#include <linux/mmzone.h>
86#include <linux/slab.h>
87#include <linux/thread_info.h>
88#include <linux/err.h>
89#include <linux/uaccess.h>
90#include <linux/string.h>
91#include <linux/nodemask.h>
92#include <linux/mm.h>
93
94#include <asm/sections.h>
95#include <asm/processor.h>
96#include <asm/atomic.h>
97
98#include <linux/kmemleak.h>
99
100/*
101 * Kmemleak configuration and common defines.
102 */
103#define MAX_TRACE 16 /* stack trace length */
104#define REPORTS_NR 50 /* maximum number of reported leaks */
105#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
106#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109
110#define BYTES_PER_POINTER sizeof(void *)
111
112/* scanning area inside a memory block */
113struct kmemleak_scan_area {
114 struct hlist_node node;
115 unsigned long offset;
116 size_t length;
117};
118
119/*
120 * Structure holding the metadata for each allocated memory block.
121 * Modifications to such objects should be made while holding the
122 * object->lock. Insertions or deletions from object_list, gray_list or
123 * tree_node are already protected by the corresponding locks or mutex (see
124 * the notes on locking above). These objects are reference-counted
125 * (use_count) and freed using the RCU mechanism.
126 */
127struct kmemleak_object {
128 spinlock_t lock;
129 unsigned long flags; /* object status flags */
130 struct list_head object_list;
131 struct list_head gray_list;
132 struct prio_tree_node tree_node;
133 struct rcu_head rcu; /* object_list lockless traversal */
134 /* object usage count; object freed when use_count == 0 */
135 atomic_t use_count;
136 unsigned long pointer;
137 size_t size;
138 /* minimum number of pointers found before it is considered a leak */
139 int min_count;
140 /* the total number of pointers found pointing to this object */
141 int count;
142 /* memory ranges to be scanned inside an object (empty for all) */
143 struct hlist_head area_list;
144 unsigned long trace[MAX_TRACE];
145 unsigned int trace_len;
146 unsigned long jiffies; /* creation timestamp */
147 pid_t pid; /* pid of the current task */
148 char comm[TASK_COMM_LEN]; /* executable name */
149};
150
151/* flag representing the memory block allocation status */
152#define OBJECT_ALLOCATED (1 << 0)
153/* flag set after the first reporting of an unreferenced object */
154#define OBJECT_REPORTED (1 << 1)
155/* flag set to not scan the object */
156#define OBJECT_NO_SCAN (1 << 2)
157
158/* the list of all allocated objects */
159static LIST_HEAD(object_list);
160/* the list of gray-colored objects (see color_gray comment below) */
161static LIST_HEAD(gray_list);
162/* prio search tree for object boundaries */
163static struct prio_tree_root object_tree_root;
164/* rw_lock protecting the access to object_list and object_tree_root */
165static DEFINE_RWLOCK(kmemleak_lock);
166
167/* allocation caches for kmemleak internal data */
168static struct kmem_cache *object_cache;
169static struct kmem_cache *scan_area_cache;
170
171/* set if tracing memory operations is enabled */
172static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
173/* set in the late_initcall if there were no errors */
174static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
175/* enables or disables early logging of the memory operations */
176static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
177/* set if a fatal kmemleak error has occurred */
178static atomic_t kmemleak_error = ATOMIC_INIT(0);
179
180/* minimum and maximum address that may be valid pointers */
181static unsigned long min_addr = ULONG_MAX;
182static unsigned long max_addr;
183
184/* used for yielding the CPU to other tasks during scanning */
185static unsigned long next_scan_yield;
186static struct task_struct *scan_thread;
187static unsigned long jiffies_scan_yield;
188static unsigned long jiffies_min_age;
189/* delay between automatic memory scannings */
190static signed long jiffies_scan_wait;
191/* enables or disables the task stacks scanning */
192static int kmemleak_stack_scan;
193/* mutex protecting the memory scanning */
194static DEFINE_MUTEX(scan_mutex);
195/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
196static DEFINE_MUTEX(kmemleak_mutex);
197
198/* number of leaks reported (for limitation purposes) */
199static int reported_leaks;
200
201/*
202 * Early object allocation/freeing logging. Kmemleak is initialized after the
203 * kernel allocator. However, both the kernel allocator and kmemleak may
204 * allocate memory blocks which need to be tracked. Kmemleak defines an
205 * arbitrary buffer to hold the allocation/freeing information before it is
206 * fully initialized.
207 */
208
209/* kmemleak operation type for early logging */
210enum {
211 KMEMLEAK_ALLOC,
212 KMEMLEAK_FREE,
213 KMEMLEAK_NOT_LEAK,
214 KMEMLEAK_IGNORE,
215 KMEMLEAK_SCAN_AREA,
216 KMEMLEAK_NO_SCAN
217};
218
219/*
220 * Structure holding the information passed to kmemleak callbacks during the
221 * early logging.
222 */
223struct early_log {
224 int op_type; /* kmemleak operation type */
225 const void *ptr; /* allocated/freed memory block */
226 size_t size; /* memory block size */
227 int min_count; /* minimum reference count */
228 unsigned long offset; /* scan area offset */
229 size_t length; /* scan area length */
230};
231
232/* early logging buffer and current position */
233static struct early_log early_log[200];
234static int crt_early_log;
235
236static void kmemleak_disable(void);
237
238/*
239 * Print a warning and dump the stack trace.
240 */
241#define kmemleak_warn(x...) do { \
242 pr_warning(x); \
243 dump_stack(); \
244} while (0)
245
246/*
247 * Macro invoked when a serious kmemleak condition occurred and cannot be
248 * recovered from. Kmemleak will be disabled and further allocation/freeing
249 * tracing is no longer available.
250 */
251#define kmemleak_panic(x...) do { \
252 kmemleak_warn(x); \
253 kmemleak_disable(); \
254} while (0)
255
256/*
257 * Object colors, encoded with count and min_count:
258 * - white - orphan object, not enough references to it (count < min_count)
259 * - gray - not orphan, not marked as false positive (min_count == 0) or
260 * sufficient references to it (count >= min_count)
261 * - black - ignore, it doesn't contain references (e.g. text section)
262 * (min_count == -1). No function defined for this color.
263 * Newly created objects don't have any color assigned (object->count == -1)
264 * before the next memory scan when they become white.
265 */
266static int color_white(const struct kmemleak_object *object)
267{
268 return object->count != -1 && object->count < object->min_count;
269}
270
271static int color_gray(const struct kmemleak_object *object)
272{
273 return object->min_count != -1 && object->count >= object->min_count;
274}
275
276/*
277 * Objects are considered referenced if their color is gray and they have not
278 * been deleted.
279 */
280static int referenced_object(struct kmemleak_object *object)
281{
282 return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
283}
284
285/*
286 * Objects are considered unreferenced only if their color is white, they have
287 * not been deleted and have a minimum age to avoid false positives caused by
288 * pointers temporarily stored in CPU registers.
289 */
290static int unreferenced_object(struct kmemleak_object *object)
291{
292 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
293 time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
294}
295
296/*
297 * Printing of the (un)referenced objects information, either to the seq file
298 * or to the kernel log. The print_referenced/print_unreferenced functions
299 * must be called with the object->lock held.
300 */
301#define print_helper(seq, x...) do { \
302 struct seq_file *s = (seq); \
303 if (s) \
304 seq_printf(s, x); \
305 else \
306 pr_info(x); \
307} while (0)
308
309static void print_referenced(struct kmemleak_object *object)
310{
311 pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n",
312 object->pointer, object->size);
313}
314
315static void print_unreferenced(struct seq_file *seq,
316 struct kmemleak_object *object)
317{
318 int i;
319
320 print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n",
321 object->pointer, object->size);
322 print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n",
323 object->comm, object->pid, object->jiffies);
324 print_helper(seq, " backtrace:\n");
325
326 for (i = 0; i < object->trace_len; i++) {
327 void *ptr = (void *)object->trace[i];
328 print_helper(seq, " [<%p>] %pS\n", ptr, ptr);
329 }
330}
331
332/*
333 * Print the kmemleak_object information. This function is used mainly for
334 * debugging special cases during kmemleak operations. It must be called with
335 * the object->lock held.
336 */
337static void dump_object_info(struct kmemleak_object *object)
338{
339 struct stack_trace trace;
340
341 trace.nr_entries = object->trace_len;
342 trace.entries = object->trace;
343
344 pr_notice("kmemleak: Object 0x%08lx (size %zu):\n",
345 object->tree_node.start, object->size);
346 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
347 object->comm, object->pid, object->jiffies);
348 pr_notice(" min_count = %d\n", object->min_count);
349 pr_notice(" count = %d\n", object->count);
350 pr_notice(" backtrace:\n");
351 print_stack_trace(&trace, 4);
352}
353
354/*
355 * Look-up a memory block metadata (kmemleak_object) in the priority search
356 * tree based on a pointer value. If alias is 0, only values pointing to the
357 * beginning of the memory block are allowed. The kmemleak_lock must be held
358 * when calling this function.
359 */
360static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
361{
362 struct prio_tree_node *node;
363 struct prio_tree_iter iter;
364 struct kmemleak_object *object;
365
366 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
367 node = prio_tree_next(&iter);
368 if (node) {
369 object = prio_tree_entry(node, struct kmemleak_object,
370 tree_node);
371 if (!alias && object->pointer != ptr) {
372 kmemleak_warn("kmemleak: Found object by alias");
373 object = NULL;
374 }
375 } else
376 object = NULL;
377
378 return object;
379}
380
381/*
382 * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
383 * that once an object's use_count reached 0, the RCU freeing was already
384 * registered and the object should no longer be used. This function must be
385 * called under the protection of rcu_read_lock().
386 */
387static int get_object(struct kmemleak_object *object)
388{
389 return atomic_inc_not_zero(&object->use_count);
390}
391
392/*
393 * RCU callback to free a kmemleak_object.
394 */
395static void free_object_rcu(struct rcu_head *rcu)
396{
397 struct hlist_node *elem, *tmp;
398 struct kmemleak_scan_area *area;
399 struct kmemleak_object *object =
400 container_of(rcu, struct kmemleak_object, rcu);
401
402 /*
403 * Once use_count is 0 (guaranteed by put_object), there is no other
404 * code accessing this object, hence no need for locking.
405 */
406 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
407 hlist_del(elem);
408 kmem_cache_free(scan_area_cache, area);
409 }
410 kmem_cache_free(object_cache, object);
411}
412
413/*
414 * Decrement the object use_count. Once the count is 0, free the object using
415 * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
416 * delete_object() path, the delayed RCU freeing ensures that there is no
417 * recursive call to the kernel allocator. Lock-less RCU object_list traversal
418 * is also possible.
419 */
420static void put_object(struct kmemleak_object *object)
421{
422 if (!atomic_dec_and_test(&object->use_count))
423 return;
424
425 /* should only get here after delete_object was called */
426 WARN_ON(object->flags & OBJECT_ALLOCATED);
427
428 call_rcu(&object->rcu, free_object_rcu);
429}
430
431/*
432 * Look up an object in the prio search tree and increase its use_count.
433 */
434static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
435{
436 unsigned long flags;
437 struct kmemleak_object *object = NULL;
438
439 rcu_read_lock();
440 read_lock_irqsave(&kmemleak_lock, flags);
441 if (ptr >= min_addr && ptr < max_addr)
442 object = lookup_object(ptr, alias);
443 read_unlock_irqrestore(&kmemleak_lock, flags);
444
445 /* check whether the object is still available */
446 if (object && !get_object(object))
447 object = NULL;
448 rcu_read_unlock();
449
450 return object;
451}
452
453/*
454 * Create the metadata (struct kmemleak_object) corresponding to an allocated
455 * memory block and add it to the object_list and object_tree_root.
456 */
457static void create_object(unsigned long ptr, size_t size, int min_count,
458 gfp_t gfp)
459{
460 unsigned long flags;
461 struct kmemleak_object *object;
462 struct prio_tree_node *node;
463 struct stack_trace trace;
464
465 object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK);
466 if (!object) {
467 kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object "
468 "structure\n");
469 return;
470 }
471
472 INIT_LIST_HEAD(&object->object_list);
473 INIT_LIST_HEAD(&object->gray_list);
474 INIT_HLIST_HEAD(&object->area_list);
475 spin_lock_init(&object->lock);
476 atomic_set(&object->use_count, 1);
477 object->flags = OBJECT_ALLOCATED;
478 object->pointer = ptr;
479 object->size = size;
480 object->min_count = min_count;
481 object->count = -1; /* no color initially */
482 object->jiffies = jiffies;
483
484 /* task information */
485 if (in_irq()) {
486 object->pid = 0;
487 strncpy(object->comm, "hardirq", sizeof(object->comm));
488 } else if (in_softirq()) {
489 object->pid = 0;
490 strncpy(object->comm, "softirq", sizeof(object->comm));
491 } else {
492 object->pid = current->pid;
493 /*
494 * There is a small chance of a race with set_task_comm(),
495 * however using get_task_comm() here may cause locking
496 * dependency issues with current->alloc_lock. In the worst
497 * case, the command line is not correct.
498 */
499 strncpy(object->comm, current->comm, sizeof(object->comm));
500 }
501
502 /* kernel backtrace */
503 trace.max_entries = MAX_TRACE;
504 trace.nr_entries = 0;
505 trace.entries = object->trace;
506 trace.skip = 1;
507 save_stack_trace(&trace);
508 object->trace_len = trace.nr_entries;
509
510 INIT_PRIO_TREE_NODE(&object->tree_node);
511 object->tree_node.start = ptr;
512 object->tree_node.last = ptr + size - 1;
513
514 write_lock_irqsave(&kmemleak_lock, flags);
515 min_addr = min(min_addr, ptr);
516 max_addr = max(max_addr, ptr + size);
517 node = prio_tree_insert(&object_tree_root, &object->tree_node);
518 /*
519 * The code calling the kernel does not yet have the pointer to the
520 * memory block to be able to free it. However, we still hold the
521 * kmemleak_lock here in case parts of the kernel started freeing
522 * random memory blocks.
523 */
524 if (node != &object->tree_node) {
525 unsigned long flags;
526
527 kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object "
528 "search tree (already existing)\n", ptr);
529 object = lookup_object(ptr, 1);
530 spin_lock_irqsave(&object->lock, flags);
531 dump_object_info(object);
532 spin_unlock_irqrestore(&object->lock, flags);
533
534 goto out;
535 }
536 list_add_tail_rcu(&object->object_list, &object_list);
537out:
538 write_unlock_irqrestore(&kmemleak_lock, flags);
539}
540
541/*
542 * Remove the metadata (struct kmemleak_object) for a memory block from the
543 * object_list and object_tree_root and decrement its use_count.
544 */
545static void delete_object(unsigned long ptr)
546{
547 unsigned long flags;
548 struct kmemleak_object *object;
549
550 write_lock_irqsave(&kmemleak_lock, flags);
551 object = lookup_object(ptr, 0);
552 if (!object) {
553 kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n",
554 ptr);
555 write_unlock_irqrestore(&kmemleak_lock, flags);
556 return;
557 }
558 prio_tree_remove(&object_tree_root, &object->tree_node);
559 list_del_rcu(&object->object_list);
560 write_unlock_irqrestore(&kmemleak_lock, flags);
561
562 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
563 WARN_ON(atomic_read(&object->use_count) < 1);
564
565 /*
566 * Locking here also ensures that the corresponding memory block
567 * cannot be freed when it is being scanned.
568 */
569 spin_lock_irqsave(&object->lock, flags);
570 if (object->flags & OBJECT_REPORTED)
571 print_referenced(object);
572 object->flags &= ~OBJECT_ALLOCATED;
573 spin_unlock_irqrestore(&object->lock, flags);
574 put_object(object);
575}
576
577/*
578 * Make a object permanently as gray-colored so that it can no longer be
579 * reported as a leak. This is used in general to mark a false positive.
580 */
581static void make_gray_object(unsigned long ptr)
582{
583 unsigned long flags;
584 struct kmemleak_object *object;
585
586 object = find_and_get_object(ptr, 0);
587 if (!object) {
588 kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n",
589 ptr);
590 return;
591 }
592
593 spin_lock_irqsave(&object->lock, flags);
594 object->min_count = 0;
595 spin_unlock_irqrestore(&object->lock, flags);
596 put_object(object);
597}
598
599/*
600 * Mark the object as black-colored so that it is ignored from scans and
601 * reporting.
602 */
603static void make_black_object(unsigned long ptr)
604{
605 unsigned long flags;
606 struct kmemleak_object *object;
607
608 object = find_and_get_object(ptr, 0);
609 if (!object) {
610 kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n",
611 ptr);
612 return;
613 }
614
615 spin_lock_irqsave(&object->lock, flags);
616 object->min_count = -1;
617 spin_unlock_irqrestore(&object->lock, flags);
618 put_object(object);
619}
620
621/*
622 * Add a scanning area to the object. If at least one such area is added,
623 * kmemleak will only scan these ranges rather than the whole memory block.
624 */
625static void add_scan_area(unsigned long ptr, unsigned long offset,
626 size_t length, gfp_t gfp)
627{
628 unsigned long flags;
629 struct kmemleak_object *object;
630 struct kmemleak_scan_area *area;
631
632 object = find_and_get_object(ptr, 0);
633 if (!object) {
634 kmemleak_warn("kmemleak: Adding scan area to unknown "
635 "object at 0x%08lx\n", ptr);
636 return;
637 }
638
639 area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK);
640 if (!area) {
641 kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
642 goto out;
643 }
644
645 spin_lock_irqsave(&object->lock, flags);
646 if (offset + length > object->size) {
647 kmemleak_warn("kmemleak: Scan area larger than object "
648 "0x%08lx\n", ptr);
649 dump_object_info(object);
650 kmem_cache_free(scan_area_cache, area);
651 goto out_unlock;
652 }
653
654 INIT_HLIST_NODE(&area->node);
655 area->offset = offset;
656 area->length = length;
657
658 hlist_add_head(&area->node, &object->area_list);
659out_unlock:
660 spin_unlock_irqrestore(&object->lock, flags);
661out:
662 put_object(object);
663}
664
665/*
666 * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
667 * pointer. Such object will not be scanned by kmemleak but references to it
668 * are searched.
669 */
670static void object_no_scan(unsigned long ptr)
671{
672 unsigned long flags;
673 struct kmemleak_object *object;
674
675 object = find_and_get_object(ptr, 0);
676 if (!object) {
677 kmemleak_warn("kmemleak: Not scanning unknown object at "
678 "0x%08lx\n", ptr);
679 return;
680 }
681
682 spin_lock_irqsave(&object->lock, flags);
683 object->flags |= OBJECT_NO_SCAN;
684 spin_unlock_irqrestore(&object->lock, flags);
685 put_object(object);
686}
687
688/*
689 * Log an early kmemleak_* call to the early_log buffer. These calls will be
690 * processed later once kmemleak is fully initialized.
691 */
692static void log_early(int op_type, const void *ptr, size_t size,
693 int min_count, unsigned long offset, size_t length)
694{
695 unsigned long flags;
696 struct early_log *log;
697
698 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_panic("kmemleak: Early log buffer exceeded\n");
700 return;
701 }
702
703 /*
704 * There is no need for locking since the kernel is still in UP mode
705 * at this stage. Disabling the IRQs is enough.
706 */
707 local_irq_save(flags);
708 log = &early_log[crt_early_log];
709 log->op_type = op_type;
710 log->ptr = ptr;
711 log->size = size;
712 log->min_count = min_count;
713 log->offset = offset;
714 log->length = length;
715 crt_early_log++;
716 local_irq_restore(flags);
717}
718
719/*
720 * Memory allocation function callback. This function is called from the
721 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
722 * vmalloc etc.).
723 */
724void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
725{
726 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
727
728 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
729 create_object((unsigned long)ptr, size, min_count, gfp);
730 else if (atomic_read(&kmemleak_early_log))
731 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
732}
733EXPORT_SYMBOL_GPL(kmemleak_alloc);
734
735/*
736 * Memory freeing function callback. This function is called from the kernel
737 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
738 */
739void kmemleak_free(const void *ptr)
740{
741 pr_debug("%s(0x%p)\n", __func__, ptr);
742
743 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
744 delete_object((unsigned long)ptr);
745 else if (atomic_read(&kmemleak_early_log))
746 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
747}
748EXPORT_SYMBOL_GPL(kmemleak_free);
749
750/*
751 * Mark an already allocated memory block as a false positive. This will cause
752 * the block to no longer be reported as leak and always be scanned.
753 */
754void kmemleak_not_leak(const void *ptr)
755{
756 pr_debug("%s(0x%p)\n", __func__, ptr);
757
758 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
759 make_gray_object((unsigned long)ptr);
760 else if (atomic_read(&kmemleak_early_log))
761 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
762}
763EXPORT_SYMBOL(kmemleak_not_leak);
764
765/*
766 * Ignore a memory block. This is usually done when it is known that the
767 * corresponding block is not a leak and does not contain any references to
768 * other allocated memory blocks.
769 */
770void kmemleak_ignore(const void *ptr)
771{
772 pr_debug("%s(0x%p)\n", __func__, ptr);
773
774 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
775 make_black_object((unsigned long)ptr);
776 else if (atomic_read(&kmemleak_early_log))
777 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
778}
779EXPORT_SYMBOL(kmemleak_ignore);
780
781/*
782 * Limit the range to be scanned in an allocated memory block.
783 */
784void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
785 gfp_t gfp)
786{
787 pr_debug("%s(0x%p)\n", __func__, ptr);
788
789 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
790 add_scan_area((unsigned long)ptr, offset, length, gfp);
791 else if (atomic_read(&kmemleak_early_log))
792 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
793}
794EXPORT_SYMBOL(kmemleak_scan_area);
795
796/*
797 * Inform kmemleak not to scan the given memory block.
798 */
799void kmemleak_no_scan(const void *ptr)
800{
801 pr_debug("%s(0x%p)\n", __func__, ptr);
802
803 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
804 object_no_scan((unsigned long)ptr);
805 else if (atomic_read(&kmemleak_early_log))
806 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
807}
808EXPORT_SYMBOL(kmemleak_no_scan);
809
810/*
811 * Yield the CPU so that other tasks get a chance to run. The yielding is
812 * rate-limited to avoid excessive number of calls to the schedule() function
813 * during memory scanning.
814 */
815static void scan_yield(void)
816{
817 might_sleep();
818
819 if (time_is_before_eq_jiffies(next_scan_yield)) {
820 schedule();
821 next_scan_yield = jiffies + jiffies_scan_yield;
822 }
823}
824
825/*
826 * Memory scanning is a long process and it needs to be interruptable. This
827 * function checks whether such interrupt condition occured.
828 */
829static int scan_should_stop(void)
830{
831 if (!atomic_read(&kmemleak_enabled))
832 return 1;
833
834 /*
835 * This function may be called from either process or kthread context,
836 * hence the need to check for both stop conditions.
837 */
838 if (current->mm)
839 return signal_pending(current);
840 else
841 return kthread_should_stop();
842
843 return 0;
844}
845
846/*
847 * Scan a memory block (exclusive range) for valid pointers and add those
848 * found to the gray list.
849 */
850static void scan_block(void *_start, void *_end,
851 struct kmemleak_object *scanned)
852{
853 unsigned long *ptr;
854 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
855 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
856
857 for (ptr = start; ptr < end; ptr++) {
858 unsigned long flags;
859 unsigned long pointer = *ptr;
860 struct kmemleak_object *object;
861
862 if (scan_should_stop())
863 break;
864
865 /*
866 * When scanning a memory block with a corresponding
867 * kmemleak_object, the CPU yielding is handled in the calling
868 * code since it holds the object->lock to avoid the block
869 * freeing.
870 */
871 if (!scanned)
872 scan_yield();
873
874 object = find_and_get_object(pointer, 1);
875 if (!object)
876 continue;
877 if (object == scanned) {
878 /* self referenced, ignore */
879 put_object(object);
880 continue;
881 }
882
883 /*
884 * Avoid the lockdep recursive warning on object->lock being
885 * previously acquired in scan_object(). These locks are
886 * enclosed by scan_mutex.
887 */
888 spin_lock_irqsave_nested(&object->lock, flags,
889 SINGLE_DEPTH_NESTING);
890 if (!color_white(object)) {
891 /* non-orphan, ignored or new */
892 spin_unlock_irqrestore(&object->lock, flags);
893 put_object(object);
894 continue;
895 }
896
897 /*
898 * Increase the object's reference count (number of pointers
899 * to the memory block). If this count reaches the required
900 * minimum, the object's color will become gray and it will be
901 * added to the gray_list.
902 */
903 object->count++;
904 if (color_gray(object))
905 list_add_tail(&object->gray_list, &gray_list);
906 else
907 put_object(object);
908 spin_unlock_irqrestore(&object->lock, flags);
909 }
910}
911
912/*
913 * Scan a memory block corresponding to a kmemleak_object. A condition is
914 * that object->use_count >= 1.
915 */
916static void scan_object(struct kmemleak_object *object)
917{
918 struct kmemleak_scan_area *area;
919 struct hlist_node *elem;
920 unsigned long flags;
921
922 /*
923 * Once the object->lock is aquired, the corresponding memory block
924 * cannot be freed (the same lock is aquired in delete_object).
925 */
926 spin_lock_irqsave(&object->lock, flags);
927 if (object->flags & OBJECT_NO_SCAN)
928 goto out;
929 if (!(object->flags & OBJECT_ALLOCATED))
930 /* already freed object */
931 goto out;
932 if (hlist_empty(&object->area_list))
933 scan_block((void *)object->pointer,
934 (void *)(object->pointer + object->size), object);
935 else
936 hlist_for_each_entry(area, elem, &object->area_list, node)
937 scan_block((void *)(object->pointer + area->offset),
938 (void *)(object->pointer + area->offset
939 + area->length), object);
940out:
941 spin_unlock_irqrestore(&object->lock, flags);
942}
943
944/*
945 * Scan data sections and all the referenced memory blocks allocated via the
946 * kernel's standard allocators. This function must be called with the
947 * scan_mutex held.
948 */
949static void kmemleak_scan(void)
950{
951 unsigned long flags;
952 struct kmemleak_object *object, *tmp;
953 struct task_struct *task;
954 int i;
955
956 /* prepare the kmemleak_object's */
957 rcu_read_lock();
958 list_for_each_entry_rcu(object, &object_list, object_list) {
959 spin_lock_irqsave(&object->lock, flags);
960#ifdef DEBUG
961 /*
962 * With a few exceptions there should be a maximum of
963 * 1 reference to any object at this point.
964 */
965 if (atomic_read(&object->use_count) > 1) {
966 pr_debug("kmemleak: object->use_count = %d\n",
967 atomic_read(&object->use_count));
968 dump_object_info(object);
969 }
970#endif
971 /* reset the reference count (whiten the object) */
972 object->count = 0;
973 if (color_gray(object) && get_object(object))
974 list_add_tail(&object->gray_list, &gray_list);
975
976 spin_unlock_irqrestore(&object->lock, flags);
977 }
978 rcu_read_unlock();
979
980 /* data/bss scanning */
981 scan_block(_sdata, _edata, NULL);
982 scan_block(__bss_start, __bss_stop, NULL);
983
984#ifdef CONFIG_SMP
985 /* per-cpu sections scanning */
986 for_each_possible_cpu(i)
987 scan_block(__per_cpu_start + per_cpu_offset(i),
988 __per_cpu_end + per_cpu_offset(i), NULL);
989#endif
990
991 /*
992 * Struct page scanning for each node. The code below is not yet safe
993 * with MEMORY_HOTPLUG.
994 */
995 for_each_online_node(i) {
996 pg_data_t *pgdat = NODE_DATA(i);
997 unsigned long start_pfn = pgdat->node_start_pfn;
998 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
999 unsigned long pfn;
1000
1001 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1002 struct page *page;
1003
1004 if (!pfn_valid(pfn))
1005 continue;
1006 page = pfn_to_page(pfn);
1007 /* only scan if page is in use */
1008 if (page_count(page) == 0)
1009 continue;
1010 scan_block(page, page + 1, NULL);
1011 }
1012 }
1013
1014 /*
1015 * Scanning the task stacks may introduce false negatives and it is
1016 * not enabled by default.
1017 */
1018 if (kmemleak_stack_scan) {
1019 read_lock(&tasklist_lock);
1020 for_each_process(task)
1021 scan_block(task_stack_page(task),
1022 task_stack_page(task) + THREAD_SIZE, NULL);
1023 read_unlock(&tasklist_lock);
1024 }
1025
1026 /*
1027 * Scan the objects already referenced from the sections scanned
1028 * above. More objects will be referenced and, if there are no memory
1029 * leaks, all the objects will be scanned. The list traversal is safe
1030 * for both tail additions and removals from inside the loop. The
1031 * kmemleak objects cannot be freed from outside the loop because their
1032 * use_count was increased.
1033 */
1034 object = list_entry(gray_list.next, typeof(*object), gray_list);
1035 while (&object->gray_list != &gray_list) {
1036 scan_yield();
1037
1038 /* may add new objects to the list */
1039 if (!scan_should_stop())
1040 scan_object(object);
1041
1042 tmp = list_entry(object->gray_list.next, typeof(*object),
1043 gray_list);
1044
1045 /* remove the object from the list and release it */
1046 list_del(&object->gray_list);
1047 put_object(object);
1048
1049 object = tmp;
1050 }
1051 WARN_ON(!list_empty(&gray_list));
1052}
1053
1054/*
1055 * Thread function performing automatic memory scanning. Unreferenced objects
1056 * at the end of a memory scan are reported but only the first time.
1057 */
1058static int kmemleak_scan_thread(void *arg)
1059{
1060 static int first_run = 1;
1061
1062 pr_info("kmemleak: Automatic memory scanning thread started\n");
1063
1064 /*
1065 * Wait before the first scan to allow the system to fully initialize.
1066 */
1067 if (first_run) {
1068 first_run = 0;
1069 ssleep(SECS_FIRST_SCAN);
1070 }
1071
1072 while (!kthread_should_stop()) {
1073 struct kmemleak_object *object;
1074 signed long timeout = jiffies_scan_wait;
1075
1076 mutex_lock(&scan_mutex);
1077
1078 kmemleak_scan();
1079 reported_leaks = 0;
1080
1081 rcu_read_lock();
1082 list_for_each_entry_rcu(object, &object_list, object_list) {
1083 unsigned long flags;
1084
1085 if (reported_leaks >= REPORTS_NR)
1086 break;
1087 spin_lock_irqsave(&object->lock, flags);
1088 if (!(object->flags & OBJECT_REPORTED) &&
1089 unreferenced_object(object)) {
1090 print_unreferenced(NULL, object);
1091 object->flags |= OBJECT_REPORTED;
1092 reported_leaks++;
1093 } else if ((object->flags & OBJECT_REPORTED) &&
1094 referenced_object(object)) {
1095 print_referenced(object);
1096 object->flags &= ~OBJECT_REPORTED;
1097 }
1098 spin_unlock_irqrestore(&object->lock, flags);
1099 }
1100 rcu_read_unlock();
1101
1102 mutex_unlock(&scan_mutex);
1103 /* wait before the next scan */
1104 while (timeout && !kthread_should_stop())
1105 timeout = schedule_timeout_interruptible(timeout);
1106 }
1107
1108 pr_info("kmemleak: Automatic memory scanning thread ended\n");
1109
1110 return 0;
1111}
1112
1113/*
1114 * Start the automatic memory scanning thread. This function must be called
1115 * with the kmemleak_mutex held.
1116 */
1117void start_scan_thread(void)
1118{
1119 if (scan_thread)
1120 return;
1121 scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
1122 if (IS_ERR(scan_thread)) {
1123 pr_warning("kmemleak: Failed to create the scan thread\n");
1124 scan_thread = NULL;
1125 }
1126}
1127
1128/*
1129 * Stop the automatic memory scanning thread. This function must be called
1130 * with the kmemleak_mutex held.
1131 */
1132void stop_scan_thread(void)
1133{
1134 if (scan_thread) {
1135 kthread_stop(scan_thread);
1136 scan_thread = NULL;
1137 }
1138}
1139
1140/*
1141 * Iterate over the object_list and return the first valid object at or after
1142 * the required position with its use_count incremented. The function triggers
1143 * a memory scanning when the pos argument points to the first position.
1144 */
1145static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1146{
1147 struct kmemleak_object *object;
1148 loff_t n = *pos;
1149
1150 if (!n) {
1151 kmemleak_scan();
1152 reported_leaks = 0;
1153 }
1154 if (reported_leaks >= REPORTS_NR)
1155 return NULL;
1156
1157 rcu_read_lock();
1158 list_for_each_entry_rcu(object, &object_list, object_list) {
1159 if (n-- > 0)
1160 continue;
1161 if (get_object(object))
1162 goto out;
1163 }
1164 object = NULL;
1165out:
1166 rcu_read_unlock();
1167 return object;
1168}
1169
1170/*
1171 * Return the next object in the object_list. The function decrements the
1172 * use_count of the previous object and increases that of the next one.
1173 */
1174static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1175{
1176 struct kmemleak_object *prev_obj = v;
1177 struct kmemleak_object *next_obj = NULL;
1178 struct list_head *n = &prev_obj->object_list;
1179
1180 ++(*pos);
1181 if (reported_leaks >= REPORTS_NR)
1182 goto out;
1183
1184 rcu_read_lock();
1185 list_for_each_continue_rcu(n, &object_list) {
1186 next_obj = list_entry(n, struct kmemleak_object, object_list);
1187 if (get_object(next_obj))
1188 break;
1189 }
1190 rcu_read_unlock();
1191out:
1192 put_object(prev_obj);
1193 return next_obj;
1194}
1195
/*
 * Drop the reference held on the last object handed out, if there is one.
 */
static void kmemleak_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		return;

	put_object(v);
}
1204
1205/*
1206 * Print the information for an unreferenced object to the seq file.
1207 */
1208static int kmemleak_seq_show(struct seq_file *seq, void *v)
1209{
1210 struct kmemleak_object *object = v;
1211 unsigned long flags;
1212
1213 spin_lock_irqsave(&object->lock, flags);
1214 if (!unreferenced_object(object))
1215 goto out;
1216 print_unreferenced(seq, object);
1217 reported_leaks++;
1218out:
1219 spin_unlock_irqrestore(&object->lock, flags);
1220 return 0;
1221}
1222
1223static const struct seq_operations kmemleak_seq_ops = {
1224 .start = kmemleak_seq_start,
1225 .next = kmemleak_seq_next,
1226 .stop = kmemleak_seq_stop,
1227 .show = kmemleak_seq_show,
1228};
1229
1230static int kmemleak_open(struct inode *inode, struct file *file)
1231{
1232 int ret = 0;
1233
1234 if (!atomic_read(&kmemleak_enabled))
1235 return -EBUSY;
1236
1237 ret = mutex_lock_interruptible(&kmemleak_mutex);
1238 if (ret < 0)
1239 goto out;
1240 if (file->f_mode & FMODE_READ) {
1241 ret = mutex_lock_interruptible(&scan_mutex);
1242 if (ret < 0)
1243 goto kmemleak_unlock;
1244 ret = seq_open(file, &kmemleak_seq_ops);
1245 if (ret < 0)
1246 goto scan_unlock;
1247 }
1248 return ret;
1249
1250scan_unlock:
1251 mutex_unlock(&scan_mutex);
1252kmemleak_unlock:
1253 mutex_unlock(&kmemleak_mutex);
1254out:
1255 return ret;
1256}
1257
1258static int kmemleak_release(struct inode *inode, struct file *file)
1259{
1260 int ret = 0;
1261
1262 if (file->f_mode & FMODE_READ) {
1263 seq_release(inode, file);
1264 mutex_unlock(&scan_mutex);
1265 }
1266 mutex_unlock(&kmemleak_mutex);
1267
1268 return ret;
1269}
1270
1271/*
1272 * File write operation to configure kmemleak at run-time. The following
1273 * commands can be written to the /sys/kernel/debug/kmemleak file:
1274 * off - disable kmemleak (irreversible)
1275 * stack=on - enable the task stacks scanning
1276 * stack=off - disable the tasks stacks scanning
1277 * scan=on - start the automatic memory scanning thread
1278 * scan=off - stop the automatic memory scanning thread
1279 * scan=... - set the automatic memory scanning period in seconds (0 to
1280 * disable it)
1281 */
1282static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1283 size_t size, loff_t *ppos)
1284{
1285 char buf[64];
1286 int buf_size;
1287
1288 if (!atomic_read(&kmemleak_enabled))
1289 return -EBUSY;
1290
1291 buf_size = min(size, (sizeof(buf) - 1));
1292 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1293 return -EFAULT;
1294 buf[buf_size] = 0;
1295
1296 if (strncmp(buf, "off", 3) == 0)
1297 kmemleak_disable();
1298 else if (strncmp(buf, "stack=on", 8) == 0)
1299 kmemleak_stack_scan = 1;
1300 else if (strncmp(buf, "stack=off", 9) == 0)
1301 kmemleak_stack_scan = 0;
1302 else if (strncmp(buf, "scan=on", 7) == 0)
1303 start_scan_thread();
1304 else if (strncmp(buf, "scan=off", 8) == 0)
1305 stop_scan_thread();
1306 else if (strncmp(buf, "scan=", 5) == 0) {
1307 unsigned long secs;
1308 int err;
1309
1310 err = strict_strtoul(buf + 5, 0, &secs);
1311 if (err < 0)
1312 return err;
1313 stop_scan_thread();
1314 if (secs) {
1315 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1316 start_scan_thread();
1317 }
1318 } else
1319 return -EINVAL;
1320
1321 /* ignore the rest of the buffer, only one command at a time */
1322 *ppos += size;
1323 return size;
1324}
1325
1326static const struct file_operations kmemleak_fops = {
1327 .owner = THIS_MODULE,
1328 .open = kmemleak_open,
1329 .read = seq_read,
1330 .write = kmemleak_write,
1331 .llseek = seq_lseek,
1332 .release = kmemleak_release,
1333};
1334
1335/*
1336 * Perform the freeing of the kmemleak internal objects after waiting for any
1337 * current memory scan to complete.
1338 */
1339static int kmemleak_cleanup_thread(void *arg)
1340{
1341 struct kmemleak_object *object;
1342
1343 mutex_lock(&kmemleak_mutex);
1344 stop_scan_thread();
1345 mutex_unlock(&kmemleak_mutex);
1346
1347 mutex_lock(&scan_mutex);
1348 rcu_read_lock();
1349 list_for_each_entry_rcu(object, &object_list, object_list)
1350 delete_object(object->pointer);
1351 rcu_read_unlock();
1352 mutex_unlock(&scan_mutex);
1353
1354 return 0;
1355}
1356
1357/*
1358 * Start the clean-up thread.
1359 */
1360static void kmemleak_cleanup(void)
1361{
1362 struct task_struct *cleanup_thread;
1363
1364 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1365 "kmemleak-clean");
1366 if (IS_ERR(cleanup_thread))
1367 pr_warning("kmemleak: Failed to create the clean-up thread\n");
1368}
1369
1370/*
1371 * Disable kmemleak. No memory allocation/freeing will be traced once this
1372 * function is called. Disabling kmemleak is an irreversible operation.
1373 */
1374static void kmemleak_disable(void)
1375{
1376 /* atomically check whether it was already invoked */
1377 if (atomic_cmpxchg(&kmemleak_error, 0, 1))
1378 return;
1379
1380 /* stop any memory operation tracing */
1381 atomic_set(&kmemleak_early_log, 0);
1382 atomic_set(&kmemleak_enabled, 0);
1383
1384 /* check whether it is too early for a kernel thread */
1385 if (atomic_read(&kmemleak_initialized))
1386 kmemleak_cleanup();
1387
1388 pr_info("Kernel memory leak detector disabled\n");
1389}
1390
1391/*
1392 * Allow boot-time kmemleak disabling (enabled by default).
1393 */
1394static int kmemleak_boot_config(char *str)
1395{
1396 if (!str)
1397 return -EINVAL;
1398 if (strcmp(str, "off") == 0)
1399 kmemleak_disable();
1400 else if (strcmp(str, "on") != 0)
1401 return -EINVAL;
1402 return 0;
1403}
1404early_param("kmemleak", kmemleak_boot_config);
1405
1406/*
1407 * Kkmemleak initialization.
1408 */
1409void __init kmemleak_init(void)
1410{
1411 int i;
1412 unsigned long flags;
1413
1414 jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
1415 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1416 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1417
1418 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1419 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1420 INIT_PRIO_TREE_ROOT(&object_tree_root);
1421
1422 /* the kernel is still in UP mode, so disabling the IRQs is enough */
1423 local_irq_save(flags);
1424 if (!atomic_read(&kmemleak_error)) {
1425 atomic_set(&kmemleak_enabled, 1);
1426 atomic_set(&kmemleak_early_log, 0);
1427 }
1428 local_irq_restore(flags);
1429
1430 /*
1431 * This is the point where tracking allocations is safe. Automatic
1432 * scanning is started during the late initcall. Add the early logged
1433 * callbacks to the kmemleak infrastructure.
1434 */
1435 for (i = 0; i < crt_early_log; i++) {
1436 struct early_log *log = &early_log[i];
1437
1438 switch (log->op_type) {
1439 case KMEMLEAK_ALLOC:
1440 kmemleak_alloc(log->ptr, log->size, log->min_count,
1441 GFP_KERNEL);
1442 break;
1443 case KMEMLEAK_FREE:
1444 kmemleak_free(log->ptr);
1445 break;
1446 case KMEMLEAK_NOT_LEAK:
1447 kmemleak_not_leak(log->ptr);
1448 break;
1449 case KMEMLEAK_IGNORE:
1450 kmemleak_ignore(log->ptr);
1451 break;
1452 case KMEMLEAK_SCAN_AREA:
1453 kmemleak_scan_area(log->ptr, log->offset, log->length,
1454 GFP_KERNEL);
1455 break;
1456 case KMEMLEAK_NO_SCAN:
1457 kmemleak_no_scan(log->ptr);
1458 break;
1459 default:
1460 WARN_ON(1);
1461 }
1462 }
1463}
1464
1465/*
1466 * Late initialization function.
1467 */
1468static int __init kmemleak_late_init(void)
1469{
1470 struct dentry *dentry;
1471
1472 atomic_set(&kmemleak_initialized, 1);
1473
1474 if (atomic_read(&kmemleak_error)) {
1475 /*
1476 * Some error occured and kmemleak was disabled. There is a
1477 * small chance that kmemleak_disable() was called immediately
1478 * after setting kmemleak_initialized and we may end up with
1479 * two clean-up threads but serialized by scan_mutex.
1480 */
1481 kmemleak_cleanup();
1482 return -ENOMEM;
1483 }
1484
1485 dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
1486 &kmemleak_fops);
1487 if (!dentry)
1488 pr_warning("kmemleak: Failed to create the debugfs kmemleak "
1489 "file\n");
1490 mutex_lock(&kmemleak_mutex);
1491 start_scan_thread();
1492 mutex_unlock(&kmemleak_mutex);
1493
1494 pr_info("Kernel memory leak detector initialized\n");
1495
1496 return 0;
1497}
1498late_initcall(kmemleak_late_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2b43fa1aa3c8..34579b23ebd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1222,6 +1223,8 @@ munmap_back:
1222 if (correct_wcount) 1223 if (correct_wcount)
1223 atomic_inc(&inode->i_writecount); 1224 atomic_inc(&inode->i_writecount);
1224out: 1225out:
1226 perf_counter_mmap(vma);
1227
1225 mm->total_vm += len >> PAGE_SHIFT; 1228 mm->total_vm += len >> PAGE_SHIFT;
1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1229 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1227 if (vm_flags & VM_LOCKED) { 1230 if (vm_flags & VM_LOCKED) {
@@ -2308,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,
2308 2311
2309 mm->total_vm += len >> PAGE_SHIFT; 2312 mm->total_vm += len >> PAGE_SHIFT;
2310 2313
2314 perf_counter_mmap(vma);
2315
2311 return 0; 2316 return 0;
2312} 2317}
2313 2318
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b76fb4..d80311baeb2d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 301 if (error)
301 goto out; 302 goto out;
303 perf_counter_mmap(vma);
302 nstart = tmp; 304 nstart = tmp;
303 305
304 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 474c7e9dd51a..17d5f539a9aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,6 +46,7 @@
46#include <linux/page-isolation.h> 46#include <linux/page-isolation.h>
47#include <linux/page_cgroup.h> 47#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 48#include <linux/debugobjects.h>
49#include <linux/kmemleak.h>
49 50
50#include <asm/tlbflush.h> 51#include <asm/tlbflush.h>
51#include <asm/div64.h> 52#include <asm/div64.h>
@@ -4546,6 +4547,16 @@ void *__init alloc_large_system_hash(const char *tablename,
4546 if (_hash_mask) 4547 if (_hash_mask)
4547 *_hash_mask = (1 << log2qty) - 1; 4548 *_hash_mask = (1 << log2qty) - 1;
4548 4549
4550 /*
4551 * If hashdist is set, the table allocation is done with __vmalloc()
4552 * which invokes the kmemleak_alloc() callback. This function may also
4553 * be called before the slab and kmemleak are initialised when
4554 * kmemleak simply buffers the request to be executed later
4555 * (GFP_ATOMIC flag ignored in this case).
4556 */
4557 if (!hashdist)
4558 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4559
4549 return table; 4560 return table;
4550} 4561}
4551 4562
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c991df..3dd4a909a1de 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,6 +47,8 @@ static int __init alloc_node_page_cgroup(int nid)
47 struct page_cgroup *base, *pc; 47 struct page_cgroup *base, *pc;
48 unsigned long table_size; 48 unsigned long table_size;
49 unsigned long start_pfn, nr_pages, index; 49 unsigned long start_pfn, nr_pages, index;
50 struct page *page;
51 unsigned int order;
50 52
51 start_pfn = NODE_DATA(nid)->node_start_pfn; 53 start_pfn = NODE_DATA(nid)->node_start_pfn;
52 nr_pages = NODE_DATA(nid)->node_spanned_pages; 54 nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -55,11 +57,13 @@ static int __init alloc_node_page_cgroup(int nid)
55 return 0; 57 return 0;
56 58
57 table_size = sizeof(struct page_cgroup) * nr_pages; 59 table_size = sizeof(struct page_cgroup) * nr_pages;
58 60 order = get_order(table_size);
59 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 61 page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
60 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 62 if (!page)
61 if (!base) 63 page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
64 if (!page)
62 return -ENOMEM; 65 return -ENOMEM;
66 base = page_address(page);
63 for (index = 0; index < nr_pages; index++) { 67 for (index = 0; index < nr_pages; index++) {
64 pc = base + index; 68 pc = base + index;
65 __init_page_cgroup(pc, start_pfn + index); 69 __init_page_cgroup(pc, start_pfn + index);
diff --git a/mm/slab.c b/mm/slab.c
index f85831da9080..f46b65d124e5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,6 +107,7 @@
107#include <linux/string.h> 107#include <linux/string.h>
108#include <linux/uaccess.h> 108#include <linux/uaccess.h>
109#include <linux/nodemask.h> 109#include <linux/nodemask.h>
110#include <linux/kmemleak.h>
110#include <linux/mempolicy.h> 111#include <linux/mempolicy.h>
111#include <linux/mutex.h> 112#include <linux/mutex.h>
112#include <linux/fault-inject.h> 113#include <linux/fault-inject.h>
@@ -178,13 +179,13 @@
178 SLAB_STORE_USER | \ 179 SLAB_STORE_USER | \
179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 181 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
181 SLAB_DEBUG_OBJECTS) 182 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
182#else 183#else
183# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 184# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
184 SLAB_CACHE_DMA | \ 185 SLAB_CACHE_DMA | \
185 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 186 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
186 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 187 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
187 SLAB_DEBUG_OBJECTS) 188 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
188#endif 189#endif
189 190
190/* 191/*
@@ -315,7 +316,7 @@ static int drain_freelist(struct kmem_cache *cache,
315 struct kmem_list3 *l3, int tofree); 316 struct kmem_list3 *l3, int tofree);
316static void free_block(struct kmem_cache *cachep, void **objpp, int len, 317static void free_block(struct kmem_cache *cachep, void **objpp, int len,
317 int node); 318 int node);
318static int enable_cpucache(struct kmem_cache *cachep); 319static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
319static void cache_reap(struct work_struct *unused); 320static void cache_reap(struct work_struct *unused);
320 321
321/* 322/*
@@ -958,12 +959,20 @@ static void __cpuinit start_cpu_timer(int cpu)
958} 959}
959 960
960static struct array_cache *alloc_arraycache(int node, int entries, 961static struct array_cache *alloc_arraycache(int node, int entries,
961 int batchcount) 962 int batchcount, gfp_t gfp)
962{ 963{
963 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 964 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
964 struct array_cache *nc = NULL; 965 struct array_cache *nc = NULL;
965 966
966 nc = kmalloc_node(memsize, GFP_KERNEL, node); 967 nc = kmalloc_node(memsize, gfp, node);
968 /*
969 * The array_cache structures contain pointers to free objects.
970 * However, when such objects are allocated or transferred to another
971 * cache the pointers are not cleared and they could be counted as
972 * valid references during a kmemleak scan. Therefore, kmemleak must
973 * not scan such objects.
974 */
975 kmemleak_no_scan(nc);
967 if (nc) { 976 if (nc) {
968 nc->avail = 0; 977 nc->avail = 0;
969 nc->limit = entries; 978 nc->limit = entries;
@@ -1003,7 +1012,7 @@ static int transfer_objects(struct array_cache *to,
1003#define drain_alien_cache(cachep, alien) do { } while (0) 1012#define drain_alien_cache(cachep, alien) do { } while (0)
1004#define reap_alien(cachep, l3) do { } while (0) 1013#define reap_alien(cachep, l3) do { } while (0)
1005 1014
1006static inline struct array_cache **alloc_alien_cache(int node, int limit) 1015static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1007{ 1016{
1008 return (struct array_cache **)BAD_ALIEN_MAGIC; 1017 return (struct array_cache **)BAD_ALIEN_MAGIC;
1009} 1018}
@@ -1034,7 +1043,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1034static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 1043static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1035static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1044static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1036 1045
1037static struct array_cache **alloc_alien_cache(int node, int limit) 1046static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1038{ 1047{
1039 struct array_cache **ac_ptr; 1048 struct array_cache **ac_ptr;
1040 int memsize = sizeof(void *) * nr_node_ids; 1049 int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1051,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
1042 1051
1043 if (limit > 1) 1052 if (limit > 1)
1044 limit = 12; 1053 limit = 12;
1045 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 1054 ac_ptr = kmalloc_node(memsize, gfp, node);
1046 if (ac_ptr) { 1055 if (ac_ptr) {
1047 for_each_node(i) { 1056 for_each_node(i) {
1048 if (i == node || !node_online(i)) { 1057 if (i == node || !node_online(i)) {
1049 ac_ptr[i] = NULL; 1058 ac_ptr[i] = NULL;
1050 continue; 1059 continue;
1051 } 1060 }
1052 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 1061 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1053 if (!ac_ptr[i]) { 1062 if (!ac_ptr[i]) {
1054 for (i--; i >= 0; i--) 1063 for (i--; i >= 0; i--)
1055 kfree(ac_ptr[i]); 1064 kfree(ac_ptr[i]);
@@ -1282,20 +1291,20 @@ static int __cpuinit cpuup_prepare(long cpu)
1282 struct array_cache **alien = NULL; 1291 struct array_cache **alien = NULL;
1283 1292
1284 nc = alloc_arraycache(node, cachep->limit, 1293 nc = alloc_arraycache(node, cachep->limit,
1285 cachep->batchcount); 1294 cachep->batchcount, GFP_KERNEL);
1286 if (!nc) 1295 if (!nc)
1287 goto bad; 1296 goto bad;
1288 if (cachep->shared) { 1297 if (cachep->shared) {
1289 shared = alloc_arraycache(node, 1298 shared = alloc_arraycache(node,
1290 cachep->shared * cachep->batchcount, 1299 cachep->shared * cachep->batchcount,
1291 0xbaadf00d); 1300 0xbaadf00d, GFP_KERNEL);
1292 if (!shared) { 1301 if (!shared) {
1293 kfree(nc); 1302 kfree(nc);
1294 goto bad; 1303 goto bad;
1295 } 1304 }
1296 } 1305 }
1297 if (use_alien_caches) { 1306 if (use_alien_caches) {
1298 alien = alloc_alien_cache(node, cachep->limit); 1307 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1299 if (!alien) { 1308 if (!alien) {
1300 kfree(shared); 1309 kfree(shared);
1301 kfree(nc); 1310 kfree(nc);
@@ -1399,10 +1408,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1399{ 1408{
1400 struct kmem_list3 *ptr; 1409 struct kmem_list3 *ptr;
1401 1410
1402 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1411 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1403 BUG_ON(!ptr); 1412 BUG_ON(!ptr);
1404 1413
1405 local_irq_disable();
1406 memcpy(ptr, list, sizeof(struct kmem_list3)); 1414 memcpy(ptr, list, sizeof(struct kmem_list3));
1407 /* 1415 /*
1408 * Do not assume that spinlocks can be initialized via memcpy: 1416 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1419,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1411 1419
1412 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1420 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1413 cachep->nodelists[nodeid] = ptr; 1421 cachep->nodelists[nodeid] = ptr;
1414 local_irq_enable();
1415} 1422}
1416 1423
1417/* 1424/*
@@ -1575,9 +1582,8 @@ void __init kmem_cache_init(void)
1575 { 1582 {
1576 struct array_cache *ptr; 1583 struct array_cache *ptr;
1577 1584
1578 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1585 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1579 1586
1580 local_irq_disable();
1581 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1587 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1582 memcpy(ptr, cpu_cache_get(&cache_cache), 1588 memcpy(ptr, cpu_cache_get(&cache_cache),
1583 sizeof(struct arraycache_init)); 1589 sizeof(struct arraycache_init));
@@ -1587,11 +1593,9 @@ void __init kmem_cache_init(void)
1587 spin_lock_init(&ptr->lock); 1593 spin_lock_init(&ptr->lock);
1588 1594
1589 cache_cache.array[smp_processor_id()] = ptr; 1595 cache_cache.array[smp_processor_id()] = ptr;
1590 local_irq_enable();
1591 1596
1592 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1597 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1593 1598
1594 local_irq_disable();
1595 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1599 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1596 != &initarray_generic.cache); 1600 != &initarray_generic.cache);
1597 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1601 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1607,6 @@ void __init kmem_cache_init(void)
1603 1607
1604 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1608 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1605 ptr; 1609 ptr;
1606 local_irq_enable();
1607 } 1610 }
1608 /* 5) Replace the bootstrap kmem_list3's */ 1611 /* 5) Replace the bootstrap kmem_list3's */
1609 { 1612 {
@@ -1627,7 +1630,7 @@ void __init kmem_cache_init(void)
1627 struct kmem_cache *cachep; 1630 struct kmem_cache *cachep;
1628 mutex_lock(&cache_chain_mutex); 1631 mutex_lock(&cache_chain_mutex);
1629 list_for_each_entry(cachep, &cache_chain, next) 1632 list_for_each_entry(cachep, &cache_chain, next)
1630 if (enable_cpucache(cachep)) 1633 if (enable_cpucache(cachep, GFP_NOWAIT))
1631 BUG(); 1634 BUG();
1632 mutex_unlock(&cache_chain_mutex); 1635 mutex_unlock(&cache_chain_mutex);
1633 } 1636 }
@@ -2064,10 +2067,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2064 return left_over; 2067 return left_over;
2065} 2068}
2066 2069
2067static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) 2070static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2068{ 2071{
2069 if (g_cpucache_up == FULL) 2072 if (g_cpucache_up == FULL)
2070 return enable_cpucache(cachep); 2073 return enable_cpucache(cachep, gfp);
2071 2074
2072 if (g_cpucache_up == NONE) { 2075 if (g_cpucache_up == NONE) {
2073 /* 2076 /*
@@ -2089,7 +2092,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2089 g_cpucache_up = PARTIAL_AC; 2092 g_cpucache_up = PARTIAL_AC;
2090 } else { 2093 } else {
2091 cachep->array[smp_processor_id()] = 2094 cachep->array[smp_processor_id()] =
2092 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2095 kmalloc(sizeof(struct arraycache_init), gfp);
2093 2096
2094 if (g_cpucache_up == PARTIAL_AC) { 2097 if (g_cpucache_up == PARTIAL_AC) {
2095 set_up_list3s(cachep, SIZE_L3); 2098 set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2156,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2153{ 2156{
2154 size_t left_over, slab_size, ralign; 2157 size_t left_over, slab_size, ralign;
2155 struct kmem_cache *cachep = NULL, *pc; 2158 struct kmem_cache *cachep = NULL, *pc;
2159 gfp_t gfp;
2156 2160
2157 /* 2161 /*
2158 * Sanity checks... these are all serious usage bugs. 2162 * Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2172,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2168 * We use cache_chain_mutex to ensure a consistent view of 2172 * We use cache_chain_mutex to ensure a consistent view of
2169 * cpu_online_mask as well. Please see cpuup_callback 2173 * cpu_online_mask as well. Please see cpuup_callback
2170 */ 2174 */
2171 get_online_cpus(); 2175 if (slab_is_available()) {
2172 mutex_lock(&cache_chain_mutex); 2176 get_online_cpus();
2177 mutex_lock(&cache_chain_mutex);
2178 }
2173 2179
2174 list_for_each_entry(pc, &cache_chain, next) { 2180 list_for_each_entry(pc, &cache_chain, next) {
2175 char tmp; 2181 char tmp;
@@ -2278,8 +2284,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2278 */ 2284 */
2279 align = ralign; 2285 align = ralign;
2280 2286
2287 if (slab_is_available())
2288 gfp = GFP_KERNEL;
2289 else
2290 gfp = GFP_NOWAIT;
2291
2281 /* Get cache's description obj. */ 2292 /* Get cache's description obj. */
2282 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2293 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2283 if (!cachep) 2294 if (!cachep)
2284 goto oops; 2295 goto oops;
2285 2296
@@ -2382,7 +2393,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2382 cachep->ctor = ctor; 2393 cachep->ctor = ctor;
2383 cachep->name = name; 2394 cachep->name = name;
2384 2395
2385 if (setup_cpu_cache(cachep)) { 2396 if (setup_cpu_cache(cachep, gfp)) {
2386 __kmem_cache_destroy(cachep); 2397 __kmem_cache_destroy(cachep);
2387 cachep = NULL; 2398 cachep = NULL;
2388 goto oops; 2399 goto oops;
@@ -2394,8 +2405,10 @@ oops:
2394 if (!cachep && (flags & SLAB_PANIC)) 2405 if (!cachep && (flags & SLAB_PANIC))
2395 panic("kmem_cache_create(): failed to create slab `%s'\n", 2406 panic("kmem_cache_create(): failed to create slab `%s'\n",
2396 name); 2407 name);
2397 mutex_unlock(&cache_chain_mutex); 2408 if (slab_is_available()) {
2398 put_online_cpus(); 2409 mutex_unlock(&cache_chain_mutex);
2410 put_online_cpus();
2411 }
2399 return cachep; 2412 return cachep;
2400} 2413}
2401EXPORT_SYMBOL(kmem_cache_create); 2414EXPORT_SYMBOL(kmem_cache_create);
@@ -2621,6 +2634,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2621 /* Slab management obj is off-slab. */ 2634 /* Slab management obj is off-slab. */
2622 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2635 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2623 local_flags, nodeid); 2636 local_flags, nodeid);
2637 /*
2638 * If the first object in the slab is leaked (it's allocated
2639 * but no one has a reference to it), we want to make sure
2640 * kmemleak does not treat the ->s_mem pointer as a reference
2641 * to the object. Otherwise we will not report the leak.
2642 */
2643 kmemleak_scan_area(slabp, offsetof(struct slab, list),
2644 sizeof(struct list_head), local_flags);
2624 if (!slabp) 2645 if (!slabp)
2625 return NULL; 2646 return NULL;
2626 } else { 2647 } else {
@@ -3141,6 +3162,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3141 STATS_INC_ALLOCMISS(cachep); 3162 STATS_INC_ALLOCMISS(cachep);
3142 objp = cache_alloc_refill(cachep, flags); 3163 objp = cache_alloc_refill(cachep, flags);
3143 } 3164 }
3165 /*
3166 * To avoid a false negative, if an object that is in one of the
3167 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3168 * treat the array pointers as a reference to the object.
3169 */
3170 kmemleak_erase(&ac->entry[ac->avail]);
3144 return objp; 3171 return objp;
3145} 3172}
3146 3173
@@ -3360,6 +3387,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3360 out: 3387 out:
3361 local_irq_restore(save_flags); 3388 local_irq_restore(save_flags);
3362 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3389 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3390 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3391 flags);
3363 3392
3364 if (unlikely((flags & __GFP_ZERO) && ptr)) 3393 if (unlikely((flags & __GFP_ZERO) && ptr))
3365 memset(ptr, 0, obj_size(cachep)); 3394 memset(ptr, 0, obj_size(cachep));
@@ -3415,6 +3444,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3415 objp = __do_cache_alloc(cachep, flags); 3444 objp = __do_cache_alloc(cachep, flags);
3416 local_irq_restore(save_flags); 3445 local_irq_restore(save_flags);
3417 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3446 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3447 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3448 flags);
3418 prefetchw(objp); 3449 prefetchw(objp);
3419 3450
3420 if (unlikely((flags & __GFP_ZERO) && objp)) 3451 if (unlikely((flags & __GFP_ZERO) && objp))
@@ -3530,6 +3561,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3530 struct array_cache *ac = cpu_cache_get(cachep); 3561 struct array_cache *ac = cpu_cache_get(cachep);
3531 3562
3532 check_irq_off(); 3563 check_irq_off();
3564 kmemleak_free_recursive(objp, cachep->flags);
3533 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3565 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3534 3566
3535 /* 3567 /*
@@ -3802,7 +3834,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3802/* 3834/*
3803 * This initializes kmem_list3 or resizes various caches for all nodes. 3835 * This initializes kmem_list3 or resizes various caches for all nodes.
3804 */ 3836 */
3805static int alloc_kmemlist(struct kmem_cache *cachep) 3837static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3806{ 3838{
3807 int node; 3839 int node;
3808 struct kmem_list3 *l3; 3840 struct kmem_list3 *l3;
@@ -3812,7 +3844,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3812 for_each_online_node(node) { 3844 for_each_online_node(node) {
3813 3845
3814 if (use_alien_caches) { 3846 if (use_alien_caches) {
3815 new_alien = alloc_alien_cache(node, cachep->limit); 3847 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3816 if (!new_alien) 3848 if (!new_alien)
3817 goto fail; 3849 goto fail;
3818 } 3850 }
@@ -3821,7 +3853,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3821 if (cachep->shared) { 3853 if (cachep->shared) {
3822 new_shared = alloc_arraycache(node, 3854 new_shared = alloc_arraycache(node,
3823 cachep->shared*cachep->batchcount, 3855 cachep->shared*cachep->batchcount,
3824 0xbaadf00d); 3856 0xbaadf00d, gfp);
3825 if (!new_shared) { 3857 if (!new_shared) {
3826 free_alien_cache(new_alien); 3858 free_alien_cache(new_alien);
3827 goto fail; 3859 goto fail;
@@ -3850,7 +3882,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3850 free_alien_cache(new_alien); 3882 free_alien_cache(new_alien);
3851 continue; 3883 continue;
3852 } 3884 }
3853 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3885 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3854 if (!l3) { 3886 if (!l3) {
3855 free_alien_cache(new_alien); 3887 free_alien_cache(new_alien);
3856 kfree(new_shared); 3888 kfree(new_shared);
@@ -3906,18 +3938,18 @@ static void do_ccupdate_local(void *info)
3906 3938
3907/* Always called with the cache_chain_mutex held */ 3939/* Always called with the cache_chain_mutex held */
3908static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3940static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3909 int batchcount, int shared) 3941 int batchcount, int shared, gfp_t gfp)
3910{ 3942{
3911 struct ccupdate_struct *new; 3943 struct ccupdate_struct *new;
3912 int i; 3944 int i;
3913 3945
3914 new = kzalloc(sizeof(*new), GFP_KERNEL); 3946 new = kzalloc(sizeof(*new), gfp);
3915 if (!new) 3947 if (!new)
3916 return -ENOMEM; 3948 return -ENOMEM;
3917 3949
3918 for_each_online_cpu(i) { 3950 for_each_online_cpu(i) {
3919 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3951 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3920 batchcount); 3952 batchcount, gfp);
3921 if (!new->new[i]) { 3953 if (!new->new[i]) {
3922 for (i--; i >= 0; i--) 3954 for (i--; i >= 0; i--)
3923 kfree(new->new[i]); 3955 kfree(new->new[i]);
@@ -3944,11 +3976,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3944 kfree(ccold); 3976 kfree(ccold);
3945 } 3977 }
3946 kfree(new); 3978 kfree(new);
3947 return alloc_kmemlist(cachep); 3979 return alloc_kmemlist(cachep, gfp);
3948} 3980}
3949 3981
3950/* Called with cache_chain_mutex held always */ 3982/* Called with cache_chain_mutex held always */
3951static int enable_cpucache(struct kmem_cache *cachep) 3983static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3952{ 3984{
3953 int err; 3985 int err;
3954 int limit, shared; 3986 int limit, shared;
@@ -3994,7 +4026,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
3994 if (limit > 32) 4026 if (limit > 32)
3995 limit = 32; 4027 limit = 32;
3996#endif 4028#endif
3997 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 4029 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
3998 if (err) 4030 if (err)
3999 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4031 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4000 cachep->name, -err); 4032 cachep->name, -err);
@@ -4300,7 +4332,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4300 res = 0; 4332 res = 0;
4301 } else { 4333 } else {
4302 res = do_tune_cpucache(cachep, limit, 4334 res = do_tune_cpucache(cachep, limit,
4303 batchcount, shared); 4335 batchcount, shared,
4336 GFP_KERNEL);
4304 } 4337 }
4305 break; 4338 break;
4306 } 4339 }
diff --git a/mm/slob.c b/mm/slob.c
index 9b1737b0787b..12f261499925 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -67,6 +67,7 @@
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemtrace.h> 69#include <linux/kmemtrace.h>
70#include <linux/kmemleak.h>
70#include <asm/atomic.h> 71#include <asm/atomic.h>
71 72
72/* 73/*
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
509 size, PAGE_SIZE << order, gfp, node); 510 size, PAGE_SIZE << order, gfp, node);
510 } 511 }
511 512
513 kmemleak_alloc(ret, size, 1, gfp);
512 return ret; 514 return ret;
513} 515}
514EXPORT_SYMBOL(__kmalloc_node); 516EXPORT_SYMBOL(__kmalloc_node);
@@ -521,6 +523,7 @@ void kfree(const void *block)
521 523
522 if (unlikely(ZERO_OR_NULL_PTR(block))) 524 if (unlikely(ZERO_OR_NULL_PTR(block)))
523 return; 525 return;
526 kmemleak_free(block);
524 527
525 sp = slob_page(block); 528 sp = slob_page(block);
526 if (is_slob_page(sp)) { 529 if (is_slob_page(sp)) {
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
584 } else if (flags & SLAB_PANIC) 587 } else if (flags & SLAB_PANIC)
585 panic("Cannot create slab cache %s\n", name); 588 panic("Cannot create slab cache %s\n", name);
586 589
590 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
587 return c; 591 return c;
588} 592}
589EXPORT_SYMBOL(kmem_cache_create); 593EXPORT_SYMBOL(kmem_cache_create);
590 594
591void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
592{ 596{
597 kmemleak_free(c);
593 slob_free(c, sizeof(struct kmem_cache)); 598 slob_free(c, sizeof(struct kmem_cache));
594} 599}
595EXPORT_SYMBOL(kmem_cache_destroy); 600EXPORT_SYMBOL(kmem_cache_destroy);
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
613 if (c->ctor) 618 if (c->ctor)
614 c->ctor(b); 619 c->ctor(b);
615 620
621 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
616 return b; 622 return b;
617} 623}
618EXPORT_SYMBOL(kmem_cache_alloc_node); 624EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head)
635 641
636void kmem_cache_free(struct kmem_cache *c, void *b) 642void kmem_cache_free(struct kmem_cache *c, void *b)
637{ 643{
644 kmemleak_free_recursive(b, c->flags);
638 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 645 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
639 struct slob_rcu *slob_rcu; 646 struct slob_rcu *slob_rcu;
640 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 647 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6fe36c..3964d3ce4c15 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
20#include <linux/kmemtrace.h> 20#include <linux/kmemtrace.h>
21#include <linux/cpu.h> 21#include <linux/cpu.h>
22#include <linux/cpuset.h> 22#include <linux/cpuset.h>
23#include <linux/kmemleak.h>
23#include <linux/mempolicy.h> 24#include <linux/mempolicy.h>
24#include <linux/ctype.h> 25#include <linux/ctype.h>
25#include <linux/debugobjects.h> 26#include <linux/debugobjects.h>
@@ -143,7 +144,7 @@
143 * Set of flags that will prevent slab merging 144 * Set of flags that will prevent slab merging
144 */ 145 */
145#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 146#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
146 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 147 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
147 148
148#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 149#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
149 SLAB_CACHE_DMA) 150 SLAB_CACHE_DMA)
@@ -1617,6 +1618,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1617 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1618 if (unlikely((gfpflags & __GFP_ZERO) && object))
1618 memset(object, 0, objsize); 1619 memset(object, 0, objsize);
1619 1620
1621 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1620 return object; 1622 return object;
1621} 1623}
1622 1624
@@ -1746,6 +1748,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
1746 struct kmem_cache_cpu *c; 1748 struct kmem_cache_cpu *c;
1747 unsigned long flags; 1749 unsigned long flags;
1748 1750
1751 kmemleak_free_recursive(x, s->flags);
1749 local_irq_save(flags); 1752 local_irq_save(flags);
1750 c = get_cpu_slab(s, smp_processor_id()); 1753 c = get_cpu_slab(s, smp_processor_id());
1751 debug_check_no_locks_freed(object, c->objsize); 1754 debug_check_no_locks_freed(object, c->objsize);
@@ -2557,13 +2560,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2557 if (gfp_flags & SLUB_DMA) 2560 if (gfp_flags & SLUB_DMA)
2558 flags = SLAB_CACHE_DMA; 2561 flags = SLAB_CACHE_DMA;
2559 2562
2560 down_write(&slub_lock); 2563 /*
2564 * This function is called with IRQs disabled during early-boot on
2565 * single CPU so there's no need to take slub_lock here.
2566 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2567 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2568 flags, NULL))
2563 goto panic; 2569 goto panic;
2564 2570
2565 list_add(&s->list, &slab_caches); 2571 list_add(&s->list, &slab_caches);
2566 up_write(&slub_lock); 2572
2567 if (sysfs_slab_add(s)) 2573 if (sysfs_slab_add(s))
2568 goto panic; 2574 goto panic;
2569 return s; 2575 return s;
@@ -3021,7 +3027,7 @@ void __init kmem_cache_init(void)
3021 * kmem_cache_open for slab_state == DOWN. 3027 * kmem_cache_open for slab_state == DOWN.
3022 */ 3028 */
3023 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3029 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
3024 sizeof(struct kmem_cache_node), GFP_KERNEL); 3030 sizeof(struct kmem_cache_node), GFP_NOWAIT);
3025 kmalloc_caches[0].refcount = -1; 3031 kmalloc_caches[0].refcount = -1;
3026 caches++; 3032 caches++;
3027 3033
@@ -3034,16 +3040,16 @@ void __init kmem_cache_init(void)
3034 /* Caches that are not of the two-to-the-power-of size */ 3040 /* Caches that are not of the two-to-the-power-of size */
3035 if (KMALLOC_MIN_SIZE <= 64) { 3041 if (KMALLOC_MIN_SIZE <= 64) {
3036 create_kmalloc_cache(&kmalloc_caches[1], 3042 create_kmalloc_cache(&kmalloc_caches[1],
3037 "kmalloc-96", 96, GFP_KERNEL); 3043 "kmalloc-96", 96, GFP_NOWAIT);
3038 caches++; 3044 caches++;
3039 create_kmalloc_cache(&kmalloc_caches[2], 3045 create_kmalloc_cache(&kmalloc_caches[2],
3040 "kmalloc-192", 192, GFP_KERNEL); 3046 "kmalloc-192", 192, GFP_NOWAIT);
3041 caches++; 3047 caches++;
3042 } 3048 }
3043 3049
3044 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3050 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3045 create_kmalloc_cache(&kmalloc_caches[i], 3051 create_kmalloc_cache(&kmalloc_caches[i],
3046 "kmalloc", 1 << i, GFP_KERNEL); 3052 "kmalloc", 1 << i, GFP_NOWAIT);
3047 caches++; 3053 caches++;
3048 } 3054 }
3049 3055
@@ -3080,7 +3086,7 @@ void __init kmem_cache_init(void)
3080 /* Provide the correct kmalloc names now that the caches are up */ 3086 /* Provide the correct kmalloc names now that the caches are up */
3081 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) 3087 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3082 kmalloc_caches[i]. name = 3088 kmalloc_caches[i]. name =
3083 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3089 kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3084 3090
3085#ifdef CONFIG_SMP 3091#ifdef CONFIG_SMP
3086 register_cpu_notifier(&slab_notifier); 3092 register_cpu_notifier(&slab_notifier);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716ea38c9..f8189a4b3e13 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,8 +23,8 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
27#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h>
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -1032,7 +1032,7 @@ void __init vmalloc_init(void)
1032 1032
1033 /* Import existing vmlist entries. */ 1033 /* Import existing vmlist entries. */
1034 for (tmp = vmlist; tmp; tmp = tmp->next) { 1034 for (tmp = vmlist; tmp; tmp = tmp->next) {
1035 va = alloc_bootmem(sizeof(struct vmap_area)); 1035 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1036 va->flags = tmp->flags | VM_VM_AREA; 1036 va->flags = tmp->flags | VM_VM_AREA;
1037 va->va_start = (unsigned long)tmp->addr; 1037 va->va_start = (unsigned long)tmp->addr;
1038 va->va_end = va->va_start + tmp->size; 1038 va->va_end = va->va_start + tmp->size;
@@ -1327,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1327void vfree(const void *addr) 1327void vfree(const void *addr)
1328{ 1328{
1329 BUG_ON(in_interrupt()); 1329 BUG_ON(in_interrupt());
1330
1331 kmemleak_free(addr);
1332
1330 __vunmap(addr, 1); 1333 __vunmap(addr, 1);
1331} 1334}
1332EXPORT_SYMBOL(vfree); 1335EXPORT_SYMBOL(vfree);
@@ -1439,8 +1442,17 @@ fail:
1439 1442
1440void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1443void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1441{ 1444{
1442 return __vmalloc_area_node(area, gfp_mask, prot, -1, 1445 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1443 __builtin_return_address(0)); 1446 __builtin_return_address(0));
1447
1448 /*
1449 * A ref_count = 3 is needed because the vm_struct and vmap_area
1450 * structures allocated in the __get_vm_area_node() function contain
1451 * references to the virtual address of the vmalloc'ed block.
1452 */
1453 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1454
1455 return addr;
1444} 1456}
1445 1457
1446/** 1458/**
@@ -1459,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1459 int node, void *caller) 1471 int node, void *caller)
1460{ 1472{
1461 struct vm_struct *area; 1473 struct vm_struct *area;
1474 void *addr;
1475 unsigned long real_size = size;
1462 1476
1463 size = PAGE_ALIGN(size); 1477 size = PAGE_ALIGN(size);
1464 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1478 if (!size || (size >> PAGE_SHIFT) > num_physpages)
@@ -1470,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1470 if (!area) 1484 if (!area)
1471 return NULL; 1485 return NULL;
1472 1486
1473 return __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1487 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1488
1489 /*
1490 * A ref_count = 3 is needed because the vm_struct and vmap_area
1491 * structures allocated in the __get_vm_area_node() function contain
1492 * references to the virtual address of the vmalloc'ed block.
1493 */
1494 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1495
1496 return addr;
1474} 1497}
1475 1498
1476void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
new file mode 100644
index 000000000000..d69a759a1046
--- /dev/null
+++ b/tools/perf/.gitignore
@@ -0,0 +1,16 @@
1PERF-BUILD-OPTIONS
2PERF-CFLAGS
3PERF-GUI-VARS
4PERF-VERSION-FILE
5perf
6perf-help
7perf-record
8perf-report
9perf-stat
10perf-top
11perf*.1
12perf*.xml
13common-cmds.h
14tags
15TAGS
16cscope*
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
new file mode 100644
index 000000000000..5457192e1b41
--- /dev/null
+++ b/tools/perf/Documentation/Makefile
@@ -0,0 +1,300 @@
1MAN1_TXT= \
2 $(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
3 $(wildcard perf-*.txt)) \
4 perf.txt
5MAN5_TXT=
6MAN7_TXT=
7
8MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
9MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
10MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
11
12DOC_HTML=$(MAN_HTML)
13
14ARTICLES =
15# with their own formatting rules.
16SP_ARTICLES =
17API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
18SP_ARTICLES += $(API_DOCS)
19SP_ARTICLES += technical/api-index
20
21DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
22
23DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
24DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
25DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
26
27prefix?=$(HOME)
28bindir?=$(prefix)/bin
29htmldir?=$(prefix)/share/doc/perf-doc
30pdfdir?=$(prefix)/share/doc/perf-doc
31mandir?=$(prefix)/share/man
32man1dir=$(mandir)/man1
33man5dir=$(mandir)/man5
34man7dir=$(mandir)/man7
35# DESTDIR=
36
37ASCIIDOC=asciidoc
38ASCIIDOC_EXTRA =
39MANPAGE_XSL = manpage-normal.xsl
40XMLTO_EXTRA =
41INSTALL?=install
42RM ?= rm -f
43DOC_REF = origin/man
44HTML_REF = origin/html
45
46infodir?=$(prefix)/share/info
47MAKEINFO=makeinfo
48INSTALL_INFO=install-info
49DOCBOOK2X_TEXI=docbook2x-texi
50DBLATEX=dblatex
51ifndef PERL_PATH
52 PERL_PATH = /usr/bin/perl
53endif
54
55-include ../config.mak.autogen
56-include ../config.mak
57
58#
59# For asciidoc ...
60# -7.1.2, no extra settings are needed.
61# 8.0-, set ASCIIDOC8.
62#
63
64#
65# For docbook-xsl ...
66# -1.68.1, set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
67# 1.69.0, no extra settings are needed?
68# 1.69.1-1.71.0, set DOCBOOK_SUPPRESS_SP?
69# 1.71.1, no extra settings are needed?
70# 1.72.0, set DOCBOOK_XSL_172.
71# 1.73.0-, set ASCIIDOC_NO_ROFF
72#
73
74#
75# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
76# of 'the ".ft C" problem' in your generated manpages, and you
77# instead ended up with weird characters around callouts, try
78# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
79#
80
81ifdef ASCIIDOC8
82ASCIIDOC_EXTRA += -a asciidoc7compatible
83endif
84ifdef DOCBOOK_XSL_172
85ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
86MANPAGE_XSL = manpage-1.72.xsl
87else
88 ifdef ASCIIDOC_NO_ROFF
89 # docbook-xsl after 1.72 needs the regular XSL, but will not
90 # pass-thru raw roff codes from asciidoc.conf, so turn them off.
91 ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
92 endif
93endif
94ifdef MAN_BOLD_LITERAL
95XMLTO_EXTRA += -m manpage-bold-literal.xsl
96endif
97ifdef DOCBOOK_SUPPRESS_SP
98XMLTO_EXTRA += -m manpage-suppress-sp.xsl
99endif
100
101SHELL_PATH ?= $(SHELL)
102# Shell quote;
103SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
104
105#
106# Please note that there is a minor bug in asciidoc.
107# The version after 6.0.3 _will_ include the patch found here:
108# http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
109#
110# Until that version is released you may have to apply the patch
111# yourself - yes, all 6 characters of it!
112#
113
114QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
115QUIET_SUBDIR1 =
116
117ifneq ($(findstring $(MAKEFLAGS),w),w)
118PRINT_DIR = --no-print-directory
119else # "make -w"
120NO_SUBDIR = :
121endif
122
123ifneq ($(findstring $(MAKEFLAGS),s),s)
124ifndef V
125 QUIET_ASCIIDOC = @echo ' ' ASCIIDOC $@;
126 QUIET_XMLTO = @echo ' ' XMLTO $@;
127 QUIET_DB2TEXI = @echo ' ' DB2TEXI $@;
128 QUIET_MAKEINFO = @echo ' ' MAKEINFO $@;
129 QUIET_DBLATEX = @echo ' ' DBLATEX $@;
130 QUIET_XSLTPROC = @echo ' ' XSLTPROC $@;
131 QUIET_GEN = @echo ' ' GEN $@;
132 QUIET_STDERR = 2> /dev/null
133 QUIET_SUBDIR0 = +@subdir=
134 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
135 $(MAKE) $(PRINT_DIR) -C $$subdir
136 export V
137endif
138endif
139
140all: html man
141
142html: $(DOC_HTML)
143
144$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
145
146man: man1 man5 man7
147man1: $(DOC_MAN1)
148man5: $(DOC_MAN5)
149man7: $(DOC_MAN7)
150
151info: perf.info perfman.info
152
153pdf: user-manual.pdf
154
155install: install-man
156
157install-man: man
158 $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
159# $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
160# $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
161 $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
162# $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
163# $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
164
165install-info: info
166 $(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
167 $(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
168 if test -r $(DESTDIR)$(infodir)/dir; then \
169 $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
170 $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
171 else \
172 echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
173 fi
174
175install-pdf: pdf
176 $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
177 $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
178
179install-html: html
180 '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
181
182../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
183 $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
184
185-include ../PERF-VERSION-FILE
186
187#
188# Determine "include::" file references in asciidoc files.
189#
190doc.dep : $(wildcard *.txt) build-docdep.perl
191 $(QUIET_GEN)$(RM) $@+ $@ && \
192 $(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
193 mv $@+ $@
194
195-include doc.dep
196
197cmds_txt = cmds-ancillaryinterrogators.txt \
198 cmds-ancillarymanipulators.txt \
199 cmds-mainporcelain.txt \
200 cmds-plumbinginterrogators.txt \
201 cmds-plumbingmanipulators.txt \
202 cmds-synchingrepositories.txt \
203 cmds-synchelpers.txt \
204 cmds-purehelpers.txt \
205 cmds-foreignscminterface.txt
206
207$(cmds_txt): cmd-list.made
208
209cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
210 $(QUIET_GEN)$(RM) $@ && \
211 $(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
212 date >$@
213
214clean:
215 $(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
216 $(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
217 $(RM) howto-index.txt howto/*.html doc.dep
218 $(RM) technical/api-*.html technical/api-index.txt
219 $(RM) $(cmds_txt) *.made
220
221$(MAN_HTML): %.html : %.txt
222 $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
223 $(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
224 $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
225 mv $@+ $@
226
227%.1 %.5 %.7 : %.xml
228 $(QUIET_XMLTO)$(RM) $@ && \
229 xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
230
231%.xml : %.txt
232 $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
233 $(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
234 $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
235 mv $@+ $@
236
237XSLT = docbook.xsl
238XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
239
240user-manual.html: user-manual.xml
241 $(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
242
243perf.info: user-manual.texi
244 $(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
245
246user-manual.texi: user-manual.xml
247 $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
248 $(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
249 $(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
250 rm $@++ && \
251 mv $@+ $@
252
253user-manual.pdf: user-manual.xml
254 $(QUIET_DBLATEX)$(RM) $@+ $@ && \
255 $(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
256 mv $@+ $@
257
258perfman.texi: $(MAN_XML) cat-texi.perl
259 $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
260 ($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
261 --to-stdout $(xml) &&) true) > $@++ && \
262 $(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
263 rm $@++ && \
264 mv $@+ $@
265
266perfman.info: perfman.texi
267 $(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
268
269$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
270 $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
271 $(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
272 mv $@+ $@
273
274howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
275 $(QUIET_GEN)$(RM) $@+ $@ && \
276 '$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
277 mv $@+ $@
278
279$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
280 $(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
281
282WEBDOC_DEST = /pub/software/tools/perf/docs
283
284$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
285 $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
286 sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
287 mv $@+ $@
288
289install-webdoc : html
290 '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
291
292quick-install: quick-install-man
293
294quick-install-man:
295 '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
296
297quick-install-html:
298 '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
299
300.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/tools/perf/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf
new file mode 100644
index 000000000000..356b23a40339
--- /dev/null
+++ b/tools/perf/Documentation/asciidoc.conf
@@ -0,0 +1,91 @@
1## linkperf: macro
2#
3# Usage: linkperf:command[manpage-section]
4#
5# Note, {0} is the manpage section, while {target} is the command.
6#
7# Show PERF link as: <command>(<section>); if section is defined, else just show
8# the command.
9
10[macros]
11(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
12
13[attributes]
14asterisk=&#42;
15plus=&#43;
16caret=&#94;
17startsb=&#91;
18endsb=&#93;
19tilde=&#126;
20
21ifdef::backend-docbook[]
22[linkperf-inlinemacro]
23{0%{target}}
24{0#<citerefentry>}
25{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
26{0#</citerefentry>}
27endif::backend-docbook[]
28
29ifdef::backend-docbook[]
30ifndef::perf-asciidoc-no-roff[]
31# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
32# v1.72 breaks with this because it replaces dots not in roff requests.
33[listingblock]
34<example><title>{title}</title>
35<literallayout>
36ifdef::doctype-manpage[]
37&#10;.ft C&#10;
38endif::doctype-manpage[]
39|
40ifdef::doctype-manpage[]
41&#10;.ft&#10;
42endif::doctype-manpage[]
43</literallayout>
44{title#}</example>
45endif::perf-asciidoc-no-roff[]
46
47ifdef::perf-asciidoc-no-roff[]
48ifdef::doctype-manpage[]
49# The following two small workarounds insert a simple paragraph after screen
50[listingblock]
51<example><title>{title}</title>
52<literallayout>
53|
54</literallayout><simpara></simpara>
55{title#}</example>
56
57[verseblock]
58<formalpara{id? id="{id}"}><title>{title}</title><para>
59{title%}<literallayout{id? id="{id}"}>
60{title#}<literallayout>
61|
62</literallayout>
63{title#}</para></formalpara>
64{title%}<simpara></simpara>
65endif::doctype-manpage[]
66endif::perf-asciidoc-no-roff[]
67endif::backend-docbook[]
68
69ifdef::doctype-manpage[]
70ifdef::backend-docbook[]
71[header]
72template::[header-declarations]
73<refentry>
74<refmeta>
75<refentrytitle>{mantitle}</refentrytitle>
76<manvolnum>{manvolnum}</manvolnum>
77<refmiscinfo class="source">perf</refmiscinfo>
78<refmiscinfo class="version">{perf_version}</refmiscinfo>
79<refmiscinfo class="manual">perf Manual</refmiscinfo>
80</refmeta>
81<refnamediv>
82 <refname>{manname}</refname>
83 <refpurpose>{manpurpose}</refpurpose>
84</refnamediv>
85endif::backend-docbook[]
86endif::doctype-manpage[]
87
88ifdef::backend-xhtml11[]
89[linkperf-inlinemacro]
90<a href="{target}.html">{target}{0?({0})}</a>
91endif::backend-xhtml11[]
diff --git a/tools/perf/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl
new file mode 100644
index 000000000000..b4d315cb8c47
--- /dev/null
+++ b/tools/perf/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
1<!-- manpage-1.72.xsl:
2 special settings for manpages rendered from asciidoc+docbook
3 handles peculiarities in docbook-xsl 1.72.0 -->
4<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
5 version="1.0">
6
7<xsl:import href="manpage-base.xsl"/>
8
9<!-- these are the special values for the roff control characters
10 needed for docbook-xsl 1.72.0 -->
11<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
12<xsl:param name="git.docbook.dot" >&#x2302;</xsl:param>
13
14</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl
new file mode 100644
index 000000000000..a264fa616093
--- /dev/null
+++ b/tools/perf/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
1<!-- manpage-base.xsl:
2 special formatting for manpages rendered from asciidoc+docbook -->
3<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
4 version="1.0">
5
6<!-- these params silence some output from xmlto -->
7<xsl:param name="man.output.quietly" select="1"/>
8<xsl:param name="refentry.meta.get.quietly" select="1"/>
9
10<!-- convert asciidoc callouts to man page format;
11 git.docbook.backslash and git.docbook.dot params
12 must be supplied by another XSL file or other means -->
13<xsl:template match="co">
14 <xsl:value-of select="concat(
15 $git.docbook.backslash,'fB(',
16 substring-after(@id,'-'),')',
17 $git.docbook.backslash,'fR')"/>
18</xsl:template>
19<xsl:template match="calloutlist">
20 <xsl:value-of select="$git.docbook.dot"/>
21 <xsl:text>sp&#10;</xsl:text>
22 <xsl:apply-templates/>
23 <xsl:text>&#10;</xsl:text>
24</xsl:template>
25<xsl:template match="callout">
26 <xsl:value-of select="concat(
27 $git.docbook.backslash,'fB',
28 substring-after(@arearefs,'-'),
29 '. ',$git.docbook.backslash,'fR')"/>
30 <xsl:apply-templates/>
31 <xsl:value-of select="$git.docbook.dot"/>
32 <xsl:text>br&#10;</xsl:text>
33</xsl:template>
34
35</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl
new file mode 100644
index 000000000000..608eb5df6281
--- /dev/null
+++ b/tools/perf/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
1<!-- manpage-bold-literal.xsl:
2 special formatting for manpages rendered from asciidoc+docbook -->
3<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
4 version="1.0">
5
6<!-- render literal text as bold (instead of plain or monospace);
7 this makes literal text easier to distinguish in manpages
8 viewed on a tty -->
9<xsl:template match="literal">
10 <xsl:value-of select="$git.docbook.backslash"/>
11 <xsl:text>fB</xsl:text>
12 <xsl:apply-templates/>
13 <xsl:value-of select="$git.docbook.backslash"/>
14 <xsl:text>fR</xsl:text>
15</xsl:template>
16
17</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl
new file mode 100644
index 000000000000..a48f5b11f3dc
--- /dev/null
+++ b/tools/perf/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
1<!-- manpage-normal.xsl:
2 special settings for manpages rendered from asciidoc+docbook
3 handles anything we want to keep away from docbook-xsl 1.72.0 -->
4<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
5 version="1.0">
6
7<xsl:import href="manpage-base.xsl"/>
8
9<!-- these are the normal values for the roff control characters -->
10<xsl:param name="git.docbook.backslash">\</xsl:param>
11<xsl:param name="git.docbook.dot" >.</xsl:param>
12
13</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl
new file mode 100644
index 000000000000..a63c7632a87d
--- /dev/null
+++ b/tools/perf/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
1<!-- manpage-suppress-sp.xsl:
2 special settings for manpages rendered from asciidoc+docbook
3 handles erroneous, inline .sp in manpage output of some
4 versions of docbook-xsl -->
5<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
6 version="1.0">
7
8<!-- attempt to work around spurious .sp at the tail of the line
9 that some versions of docbook stylesheets seem to add -->
10<xsl:template match="simpara">
11 <xsl:variable name="content">
12 <xsl:apply-templates/>
13 </xsl:variable>
14 <xsl:value-of select="normalize-space($content)"/>
15 <xsl:if test="not(ancestor::authorblurb) and
16 not(ancestor::personblurb)">
17 <xsl:text>&#10;&#10;</xsl:text>
18 </xsl:if>
19</xsl:template>
20
21</xsl:stylesheet>
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
new file mode 100644
index 000000000000..c9dcade06831
--- /dev/null
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -0,0 +1,29 @@
1perf-annotate(1)
2================
3
4NAME
5----
6perf-annotate - Read perf.data (created by perf record) and display annotated code
7
8SYNOPSIS
9--------
10[verse]
11'perf annotate' [-i <file> | --input=file] symbol_name
12
13DESCRIPTION
14-----------
15This command reads the input file and displays an annotated version of the
16code. If the object file has debug symbols then the source code will be
17displayed alongside assembly code.
18
19If there is no debug info in the object, then annotated assembly is displayed.
20
21OPTIONS
22-------
23-i::
24--input=::
25 Input file name. (default: perf.data)
26
27SEE ALSO
28--------
29linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt
new file mode 100644
index 000000000000..514391818d1f
--- /dev/null
+++ b/tools/perf/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
1perf-help(1)
2============
3
4NAME
5----
6perf-help - display help information about perf
7
8SYNOPSIS
9--------
10'perf help' [-a|--all] [COMMAND]
11
12DESCRIPTION
13-----------
14
15With no options and no COMMAND given, the synopsis of the 'perf'
16command and a list of the most commonly used perf commands are printed
17on the standard output.
18
19If the option '--all' or '-a' is given, then all available commands are
20printed on the standard output.
21
22If a perf command is named, a manual page for that command is brought
23up. The 'man' program is used by default for this purpose, but this
24can be overridden by other options or configuration variables.
25
26Note that `perf --help ...` is identical to `perf help ...` because the
27former is internally converted into the latter.
28
29OPTIONS
30-------
31-a::
32--all::
33 Prints all the available commands on the standard output. This
34 option supersedes any other option.
35
36PERF
37----
38Part of the linkperf:perf[1] suite
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
new file mode 100644
index 000000000000..8290b9422668
--- /dev/null
+++ b/tools/perf/Documentation/perf-list.txt
@@ -0,0 +1,25 @@
1perf-list(1)
2============
3
4NAME
5----
6perf-list - List all symbolic event types
7
8SYNOPSIS
9--------
10[verse]
11'perf list'
12
13DESCRIPTION
14-----------
15This command displays the symbolic event types which can be selected in the
16various perf commands with the -e option.
17
18OPTIONS
19-------
20None
21
22SEE ALSO
23--------
24linkperf:perf-stat[1], linkperf:perf-top[1],
25linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
new file mode 100644
index 000000000000..1dbc1eeb4c01
--- /dev/null
+++ b/tools/perf/Documentation/perf-record.txt
@@ -0,0 +1,42 @@
1perf-record(1)
2==============
3
4NAME
5----
6perf-record - Run a command and record its profile into perf.data
7
8SYNOPSIS
9--------
10[verse]
11'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
13
14DESCRIPTION
15-----------
16This command runs a command and gathers a performance counter profile
17from it, into perf.data - without displaying anything.
18
19This file can then be inspected later on, using 'perf report'.
20
21
22OPTIONS
23-------
24<command>...::
25 Any command you can specify in a shell.
26
27-e::
28--event=::
29 Select the PMU event. Selection can be a symbolic event name
30 (use 'perf list' to list all events) or a raw PMU
31 event (eventsel+umask) in the form of rNNN where NNN is a
32 hexadecimal event descriptor.
33
34-a::
35 system-wide collection
36
37-l::
38 scale counter values
39
40SEE ALSO
41--------
42linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
new file mode 100644
index 000000000000..52d3fc6846a9
--- /dev/null
+++ b/tools/perf/Documentation/perf-report.txt
@@ -0,0 +1,26 @@
1perf-report(1)
2==============
3
4NAME
5----
6perf-report - Read perf.data (created by perf record) and display the profile
7
8SYNOPSIS
9--------
10[verse]
11'perf report' [-i <file> | --input=file]
12
13DESCRIPTION
14-----------
15This command displays the performance counter profile information recorded
16via perf record.
17
18OPTIONS
19-------
20-i::
21--input=::
22 Input file name. (default: perf.data)
23
24SEE ALSO
25--------
26linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
new file mode 100644
index 000000000000..c368a72721d7
--- /dev/null
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -0,0 +1,66 @@
1perf-stat(1)
2============
3
4NAME
5----
6perf-stat - Run a command and gather performance counter statistics
7
8SYNOPSIS
9--------
10[verse]
11'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
13
14DESCRIPTION
15-----------
16This command runs a command and gathers performance counter statistics
17from it.
18
19
20OPTIONS
21-------
22<command>...::
23 Any command you can specify in a shell.
24
25
26-e::
27--event=::
28 Select the PMU event. Selection can be a symbolic event name
29 (use 'perf list' to list all events) or a raw PMU
30 event (eventsel+umask) in the form of rNNN where NNN is a
31 hexadecimal event descriptor.
32
33-i::
34--inherit::
35 child tasks inherit counters
36-p::
37--pid=<pid>::
38 stat events on existing pid
39
40-a::
41 system-wide collection
42
43-l::
44 scale counter values
45
46EXAMPLES
47--------
48
49$ perf stat -- make -j
50
51 Performance counter stats for 'make -j':
52
53 8117.370256 task clock ticks # 11.281 CPU utilization factor
54 678 context switches # 0.000 M/sec
55 133 CPU migrations # 0.000 M/sec
56 235724 pagefaults # 0.029 M/sec
57 24821162526 CPU cycles # 3057.784 M/sec
58 18687303457 instructions # 2302.138 M/sec
59 172158895 cache references # 21.209 M/sec
60 27075259 cache misses # 3.335 M/sec
61
62 Wall-clock time elapsed: 719.554352 msecs
63
64SEE ALSO
65--------
66linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
new file mode 100644
index 000000000000..539d01289725
--- /dev/null
+++ b/tools/perf/Documentation/perf-top.txt
@@ -0,0 +1,39 @@
1perf-top(1)
2===========
3
4NAME
5----
6perf-top - Run a command and profile it
7
8SYNOPSIS
9--------
10[verse]
11'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 Select the PMU event. Selection can be a symbolic event name
27 (use 'perf list' to list all events) or a raw PMU
28 event (eventsel+umask) in the form of rNNN where NNN is a
29 hexadecimal event descriptor.
30
31-a::
32 system-wide collection
33
34-l::
35 scale counter values
36
37SEE ALSO
38--------
39linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
new file mode 100644
index 000000000000..69c832557199
--- /dev/null
+++ b/tools/perf/Documentation/perf.txt
@@ -0,0 +1,24 @@
1perf(1)
2=======
3
4NAME
5----
6perf - Performance analysis tools for Linux
7
8SYNOPSIS
9--------
10[verse]
11'perf' [--version] [--help] COMMAND [ARGS]
12
13DESCRIPTION
14-----------
15Performance counters for Linux are a new kernel-based subsystem
16that provide a framework for all things performance analysis. It
17covers hardware level (CPU/PMU, Performance Monitoring Unit) features
18and software features (software counters, tracepoints) as well.
19
20SEE ALSO
21--------
22linkperf:perf-stat[1], linkperf:perf-top[1],
23linkperf:perf-record[1], linkperf:perf-report[1],
24linkperf:perf-list[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
new file mode 100644
index 000000000000..0cbd5d6874ec
--- /dev/null
+++ b/tools/perf/Makefile
@@ -0,0 +1,929 @@
1# The default target of this Makefile is...
2all::
3
4# Define V=1 to have a more verbose compile.
5#
6# Define SNPRINTF_RETURNS_BOGUS if you are on a system whose snprintf()
7# or vsnprintf() return -1 instead of number of characters which would
8# have been written to the final string if enough space had been available.
9#
10# Define FREAD_READS_DIRECTORIES if you are on a system which succeeds
11# when attempting to read from an fopen'ed directory.
12#
13# Define NO_OPENSSL environment variable if you do not have OpenSSL.
14# This also implies MOZILLA_SHA1.
15#
16# Define CURLDIR=/foo/bar if your curl header and library files are in
17# /foo/bar/include and /foo/bar/lib directories.
18#
19# Define EXPATDIR=/foo/bar if your expat header and library files are in
20# /foo/bar/include and /foo/bar/lib directories.
21#
22# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
23#
24# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
25# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
26#
27# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et al.)
28# do not support the 'size specifiers' introduced by C99, namely ll, hh,
29# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
30# some C compilers supported these specifiers prior to C99 as an extension.
31#
32# Define NO_STRCASESTR if you don't have strcasestr.
33#
34# Define NO_MEMMEM if you don't have memmem.
35#
36# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
37# If your compiler also does not support long long or does not have
38# strtoull, define NO_STRTOULL.
39#
40# Define NO_SETENV if you don't have setenv in the C library.
41#
42# Define NO_UNSETENV if you don't have unsetenv in the C library.
43#
44# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
45#
46# Define NO_SYS_SELECT_H if you don't have sys/select.h.
47#
48# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
49# Enable it on Windows. By default, symrefs are still used.
50#
51# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
52# tests. These tests take up a significant amount of the total test time
53# but are not needed unless you plan to talk to SVN repos.
54#
55# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
56# installed in /sw, but don't want PERF to link against any libraries
57# installed there. If defined you may specify your own (or Fink's)
58# include directories and library directories by defining CFLAGS
59# and LDFLAGS appropriately.
60#
61# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
62# have DarwinPorts installed in /opt/local, but don't want PERF to
63# link against any libraries installed there. If defined you may
64# specify your own (or DarwinPort's) include directories and
65# library directories by defining CFLAGS and LDFLAGS appropriately.
66#
67# Define PPC_SHA1 environment variable when running make to make use of
68# a bundled SHA1 routine optimized for PowerPC.
69#
70# Define ARM_SHA1 environment variable when running make to make use of
71# a bundled SHA1 routine optimized for ARM.
72#
73# Define MOZILLA_SHA1 environment variable when running make to make use of
74# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
75# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
76# choice) has very fast version optimized for i586.
77#
78# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
79#
80# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
81#
82# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
83# Patrick Mauritz).
84#
85# Define NO_MMAP if you want to avoid mmap.
86#
87# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
88#
89# Define NO_PREAD if you have a problem with pread() system call (e.g.
90# cygwin.dll before v1.5.22).
91#
92# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
93# generally faster on your platform than accessing the working directory.
94#
95# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
96# the executable mode bit, but doesn't really do so.
97#
98# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
99#
100# Define NO_SOCKADDR_STORAGE if your platform does not have struct
101# sockaddr_storage.
102#
103# Define NO_ICONV if your libc does not properly support iconv.
104#
105# Define OLD_ICONV if your library has an old iconv(), where the second
106# (input buffer pointer) parameter is declared with type (const char **).
107#
108# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
109#
110# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
111# that tells runtime paths to dynamic libraries;
112# "-Wl,-rpath=/path/lib" is used instead.
113#
114# Define USE_NSEC below if you want perf to care about sub-second file mtimes
115# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
116# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
117# randomly break unless your underlying filesystem supports those sub-second
118# times (my ext3 doesn't).
119#
120# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
121# "st_ctim"
122#
123# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
124# available. This automatically turns USE_NSEC off.
125#
126# Define USE_STDEV below if you want perf to care about the underlying device
127# change being considered an inode change from the update-index perspective.
128#
129# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
130# field that counts the on-disk footprint in 512-byte blocks.
131#
132# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
133#
134# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
135#
136# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
137# MakeMaker (e.g. using ActiveState under Cygwin).
138#
139# Define NO_PERL if you do not want Perl scripts or libraries at all.
140#
141# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
142# is a simplified version of the merge sort used in glibc. This is
143# recommended if Git triggers O(n^2) behavior in your platform's qsort().
144#
145# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
146# your external grep (e.g., if your system lacks grep, if its grep is
147# broken, or spawning external process is slower than built-in grep perf has).
148
149PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
150 @$(SHELL_PATH) util/PERF-VERSION-GEN
151-include PERF-VERSION-FILE
152
153uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
154uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
155uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
156uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
157uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
158uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
159
160# CFLAGS and LDFLAGS are for the users to override from the command line.
161
162CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
163LDFLAGS = -lpthread -lrt -lelf
164ALL_CFLAGS = $(CFLAGS)
165ALL_LDFLAGS = $(LDFLAGS)
166STRIP ?= strip
167
168# Among the variables below, these:
169# perfexecdir
170# template_dir
171# mandir
172# infodir
173# htmldir
174# ETC_PERFCONFIG (but not sysconfdir)
175# can be specified as a relative path some/where/else;
176# this is interpreted as relative to $(prefix) and "perf" at
177# runtime figures out where they are based on the path to the executable.
178# This can help installing the suite in a relocatable way.
179
180prefix = $(HOME)
181bindir_relative = bin
182bindir = $(prefix)/$(bindir_relative)
183mandir = share/man
184infodir = share/info
185perfexecdir = libexec/perf-core
186sharedir = $(prefix)/share
187template_dir = share/perf-core/templates
188htmldir = share/doc/perf-doc
189ifeq ($(prefix),/usr)
190sysconfdir = /etc
191ETC_PERFCONFIG = $(sysconfdir)/perfconfig
192else
193sysconfdir = $(prefix)/etc
194ETC_PERFCONFIG = etc/perfconfig
195endif
196lib = lib
197# DESTDIR=
198
199export prefix bindir sharedir sysconfdir
200
201CC = gcc
202AR = ar
203RM = rm -f
204TAR = tar
205FIND = find
206INSTALL = install
207RPMBUILD = rpmbuild
208PTHREAD_LIBS = -lpthread
209
210# sparse is architecture-neutral, which means that we need to tell it
211# explicitly what architecture to check for. Fix this up for yours..
212SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
213
214
215
216### --- END CONFIGURATION SECTION ---
217
218# Those must not be GNU-specific; they are shared with perl/ which may
219# be built by a different compiler. (Note that this is an artifact now
220# but it still might be nice to keep that distinction.)
221BASIC_CFLAGS =
222BASIC_LDFLAGS =
223
224# Guard against environment variables
225BUILTIN_OBJS =
226BUILT_INS =
227COMPAT_CFLAGS =
228COMPAT_OBJS =
229LIB_H =
230LIB_OBJS =
231SCRIPT_PERL =
232SCRIPT_SH =
233TEST_PROGRAMS =
234
235#
236# No scripts right now:
237#
238
239# SCRIPT_SH += perf-am.sh
240
241#
242# No Perl scripts right now:
243#
244
245# SCRIPT_PERL += perf-add--interactive.perl
246
247SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
248 $(patsubst %.perl,%,$(SCRIPT_PERL))
249
250# Empty...
251EXTRA_PROGRAMS =
252
253# ... and all the rest that could be moved out of bindir to perfexecdir
254PROGRAMS += $(EXTRA_PROGRAMS)
255
256#
257# Single 'perf' binary right now:
258#
259PROGRAMS += perf
260
261# List built-in command $C whose implementation cmd_$C() is not in
262# builtin-$C.o but is linked in as part of some other command.
263#
264# None right now:
265#
266# BUILT_INS += perf-init $X
267
268# what 'all' will build and 'install' will install, in perfexecdir
269ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
270
271# what 'all' will build but not install in perfexecdir
272OTHER_PROGRAMS = perf$X
273
274# Set paths to tools early so that they can be used for version tests.
275ifndef SHELL_PATH
276 SHELL_PATH = /bin/sh
277endif
278ifndef PERL_PATH
279 PERL_PATH = /usr/bin/perl
280endif
281
282export PERL_PATH
283
284LIB_FILE=libperf.a
285
286LIB_H += ../../include/linux/perf_counter.h
287LIB_H += perf.h
288LIB_H += util/list.h
289LIB_H += util/rbtree.h
290LIB_H += util/levenshtein.h
291LIB_H += util/parse-options.h
292LIB_H += util/parse-events.h
293LIB_H += util/quote.h
294LIB_H += util/util.h
295LIB_H += util/help.h
296LIB_H += util/strbuf.h
297LIB_H += util/string.h
298LIB_H += util/run-command.h
299LIB_H += util/sigchain.h
300LIB_H += util/symbol.h
301LIB_H += util/color.h
302
303LIB_OBJS += util/abspath.o
304LIB_OBJS += util/alias.o
305LIB_OBJS += util/config.o
306LIB_OBJS += util/ctype.o
307LIB_OBJS += util/environment.o
308LIB_OBJS += util/exec_cmd.o
309LIB_OBJS += util/help.o
310LIB_OBJS += util/levenshtein.o
311LIB_OBJS += util/parse-options.o
312LIB_OBJS += util/parse-events.o
313LIB_OBJS += util/path.o
314LIB_OBJS += util/rbtree.o
315LIB_OBJS += util/run-command.o
316LIB_OBJS += util/quote.o
317LIB_OBJS += util/strbuf.o
318LIB_OBJS += util/string.o
319LIB_OBJS += util/usage.o
320LIB_OBJS += util/wrapper.o
321LIB_OBJS += util/sigchain.o
322LIB_OBJS += util/symbol.o
323LIB_OBJS += util/color.o
324LIB_OBJS += util/pager.o
325
326BUILTIN_OBJS += builtin-annotate.o
327BUILTIN_OBJS += builtin-help.o
328BUILTIN_OBJS += builtin-list.o
329BUILTIN_OBJS += builtin-record.o
330BUILTIN_OBJS += builtin-report.o
331BUILTIN_OBJS += builtin-stat.o
332BUILTIN_OBJS += builtin-top.o
333
334PERFLIBS = $(LIB_FILE)
335EXTLIBS =
336
337#
338# Platform specific tweaks
339#
340
341# We choose to avoid "if .. else if .. else .. endif endif"
342# because maintaining the nesting to match is a pain. If
343# we had "elif" things would have been much nicer...
344
345-include config.mak.autogen
346-include config.mak
347
348ifeq ($(uname_S),Darwin)
349 ifndef NO_FINK
350 ifeq ($(shell test -d /sw/lib && echo y),y)
351 BASIC_CFLAGS += -I/sw/include
352 BASIC_LDFLAGS += -L/sw/lib
353 endif
354 endif
355 ifndef NO_DARWIN_PORTS
356 ifeq ($(shell test -d /opt/local/lib && echo y),y)
357 BASIC_CFLAGS += -I/opt/local/include
358 BASIC_LDFLAGS += -L/opt/local/lib
359 endif
360 endif
361 PTHREAD_LIBS =
362endif
363
364ifndef CC_LD_DYNPATH
365 ifdef NO_R_TO_GCC_LINKER
366 # Some gcc does not accept and pass -R to the linker to specify
367 # the runtime dynamic library path.
368 CC_LD_DYNPATH = -Wl,-rpath,
369 else
370 CC_LD_DYNPATH = -R
371 endif
372endif
373
374ifdef ZLIB_PATH
375 BASIC_CFLAGS += -I$(ZLIB_PATH)/include
376 EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
377endif
378EXTLIBS += -lz
379
380ifdef NEEDS_SOCKET
381 EXTLIBS += -lsocket
382endif
383ifdef NEEDS_NSL
384 EXTLIBS += -lnsl
385endif
386ifdef NO_D_TYPE_IN_DIRENT
387 BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
388endif
389ifdef NO_D_INO_IN_DIRENT
390 BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
391endif
392ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
393 BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
394endif
395ifdef USE_NSEC
396 BASIC_CFLAGS += -DUSE_NSEC
397endif
398ifdef USE_ST_TIMESPEC
399 BASIC_CFLAGS += -DUSE_ST_TIMESPEC
400endif
401ifdef NO_NSEC
402 BASIC_CFLAGS += -DNO_NSEC
403endif
404ifdef NO_C99_FORMAT
405 BASIC_CFLAGS += -DNO_C99_FORMAT
406endif
407ifdef SNPRINTF_RETURNS_BOGUS
408 COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
409 COMPAT_OBJS += compat/snprintf.o
410endif
411ifdef FREAD_READS_DIRECTORIES
412 COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
413 COMPAT_OBJS += compat/fopen.o
414endif
415ifdef NO_SYMLINK_HEAD
416 BASIC_CFLAGS += -DNO_SYMLINK_HEAD
417endif
418ifdef NO_STRCASESTR
419 COMPAT_CFLAGS += -DNO_STRCASESTR
420 COMPAT_OBJS += compat/strcasestr.o
421endif
422ifdef NO_STRTOUMAX
423 COMPAT_CFLAGS += -DNO_STRTOUMAX
424 COMPAT_OBJS += compat/strtoumax.o
425endif
426ifdef NO_STRTOULL
427 COMPAT_CFLAGS += -DNO_STRTOULL
428endif
429ifdef NO_SETENV
430 COMPAT_CFLAGS += -DNO_SETENV
431 COMPAT_OBJS += compat/setenv.o
432endif
433ifdef NO_MKDTEMP
434 COMPAT_CFLAGS += -DNO_MKDTEMP
435 COMPAT_OBJS += compat/mkdtemp.o
436endif
437ifdef NO_UNSETENV
438 COMPAT_CFLAGS += -DNO_UNSETENV
439 COMPAT_OBJS += compat/unsetenv.o
440endif
441ifdef NO_SYS_SELECT_H
442 BASIC_CFLAGS += -DNO_SYS_SELECT_H
443endif
444ifdef NO_MMAP
445 COMPAT_CFLAGS += -DNO_MMAP
446 COMPAT_OBJS += compat/mmap.o
447else
448 ifdef USE_WIN32_MMAP
449 COMPAT_CFLAGS += -DUSE_WIN32_MMAP
450 COMPAT_OBJS += compat/win32mmap.o
451 endif
452endif
453ifdef NO_PREAD
454 COMPAT_CFLAGS += -DNO_PREAD
455 COMPAT_OBJS += compat/pread.o
456endif
457ifdef NO_FAST_WORKING_DIRECTORY
458 BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
459endif
460ifdef NO_TRUSTABLE_FILEMODE
461 BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
462endif
463ifdef NO_IPV6
464 BASIC_CFLAGS += -DNO_IPV6
465endif
466ifdef NO_UINTMAX_T
467 BASIC_CFLAGS += -Duintmax_t=uint32_t
468endif
469ifdef NO_SOCKADDR_STORAGE
470ifdef NO_IPV6
471 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
472else
473 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
474endif
475endif
476ifdef NO_INET_NTOP
477 LIB_OBJS += compat/inet_ntop.o
478endif
479ifdef NO_INET_PTON
480 LIB_OBJS += compat/inet_pton.o
481endif
482
483ifdef NO_ICONV
484 BASIC_CFLAGS += -DNO_ICONV
485endif
486
487ifdef OLD_ICONV
488 BASIC_CFLAGS += -DOLD_ICONV
489endif
490
491ifdef NO_DEFLATE_BOUND
492 BASIC_CFLAGS += -DNO_DEFLATE_BOUND
493endif
494
495ifdef PPC_SHA1
496 SHA1_HEADER = "ppc/sha1.h"
497 LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
498else
499ifdef ARM_SHA1
500 SHA1_HEADER = "arm/sha1.h"
501 LIB_OBJS += arm/sha1.o arm/sha1_arm.o
502else
503ifdef MOZILLA_SHA1
504 SHA1_HEADER = "mozilla-sha1/sha1.h"
505 LIB_OBJS += mozilla-sha1/sha1.o
506else
507 SHA1_HEADER = <openssl/sha.h>
508 EXTLIBS += $(LIB_4_CRYPTO)
509endif
510endif
511endif
512ifdef NO_PERL_MAKEMAKER
513 export NO_PERL_MAKEMAKER
514endif
515ifdef NO_HSTRERROR
516 COMPAT_CFLAGS += -DNO_HSTRERROR
517 COMPAT_OBJS += compat/hstrerror.o
518endif
519ifdef NO_MEMMEM
520 COMPAT_CFLAGS += -DNO_MEMMEM
521 COMPAT_OBJS += compat/memmem.o
522endif
523ifdef INTERNAL_QSORT
524 COMPAT_CFLAGS += -DINTERNAL_QSORT
525 COMPAT_OBJS += compat/qsort.o
526endif
527ifdef RUNTIME_PREFIX
528 COMPAT_CFLAGS += -DRUNTIME_PREFIX
529endif
530
531ifdef DIR_HAS_BSD_GROUP_SEMANTICS
532 COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
533endif
534ifdef NO_EXTERNAL_GREP
535 BASIC_CFLAGS += -DNO_EXTERNAL_GREP
536endif
537
538ifeq ($(PERL_PATH),)
539NO_PERL=NoThanks
540endif
541
542QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
543QUIET_SUBDIR1 =
544
545ifneq ($(findstring w,$(firstword -$(MAKEFLAGS))),w) # findstring is (FIND,IN); first word of MAKEFLAGS holds the single-letter flags
546PRINT_DIR = --no-print-directory
547else # "make -w"
548NO_SUBDIR = :
549endif
550
551ifneq ($(findstring s,$(firstword -$(MAKEFLAGS))),s) # not "make -s": findstring is (FIND,IN), short flags live in the first word
552ifndef V
553 QUIET_CC = @echo ' ' CC $@;
554 QUIET_AR = @echo ' ' AR $@;
555 QUIET_LINK = @echo ' ' LINK $@;
556 QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
557 QUIET_GEN = @echo ' ' GEN $@;
558 QUIET_SUBDIR0 = +@subdir=
559 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
560 $(MAKE) $(PRINT_DIR) -C $$subdir
561 export V
562 export QUIET_GEN
563 export QUIET_BUILT_IN
564endif
565endif
566
567ifdef ASCIIDOC8
568 export ASCIIDOC8
569endif
570
571# Shell quote (do not use $(call) to accommodate ancient setups);
572
573SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
574ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
575
576DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
577bindir_SQ = $(subst ','\'',$(bindir))
578bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
579mandir_SQ = $(subst ','\'',$(mandir))
580infodir_SQ = $(subst ','\'',$(infodir))
581perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
582template_dir_SQ = $(subst ','\'',$(template_dir))
583htmldir_SQ = $(subst ','\'',$(htmldir))
584prefix_SQ = $(subst ','\'',$(prefix))
585
586SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
587PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
588
589LIBS = $(PERFLIBS) $(EXTLIBS)
590
591BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
592 $(COMPAT_CFLAGS)
593LIB_OBJS += $(COMPAT_OBJS)
594
595ALL_CFLAGS += $(BASIC_CFLAGS)
596ALL_LDFLAGS += $(BASIC_LDFLAGS)
597
598export TAR INSTALL DESTDIR SHELL_PATH
599
600
601### Build rules
602
603SHELL = $(SHELL_PATH)
604
605all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
606ifneq (,$X)
607 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
608endif
609
610all::
611
612please_set_SHELL_PATH_to_a_more_modern_shell:
613 @$$(:)
614
615shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
616
617strip: $(PROGRAMS) perf$X
618 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
619
620perf.o: perf.c common-cmds.h PERF-CFLAGS
621 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
622 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
623 $(ALL_CFLAGS) -c $(filter %.c,$^)
624
625perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
626 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
627 $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
628
629builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
630 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
631 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
632 '-DPERF_MAN_PATH="$(mandir_SQ)"' \
633 '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
634
635$(BUILT_INS): perf$X
636 $(QUIET_BUILT_IN)$(RM) $@ && \
637 ln perf$X $@ 2>/dev/null || \
638 ln -s perf$X $@ 2>/dev/null || \
639 cp perf$X $@
640
641common-cmds.h: util/generate-cmdlist.sh command-list.txt
642
643common-cmds.h: $(wildcard Documentation/perf-*.txt)
644 $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
645
646$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
647 $(QUIET_GEN)$(RM) $@ $@+ && \
648 sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
649 -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
650 -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
651 -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
652 -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
653 $@.sh >$@+ && \
654 chmod +x $@+ && \
655 mv $@+ $@
656
657configure: configure.ac
658 $(QUIET_GEN)$(RM) $@ $<+ && \
659 sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
660 $< > $<+ && \
661 autoconf -o $@ $<+ && \
662 $(RM) $<+
663
664# These can record PERF_VERSION
665perf.o perf.spec \
666 $(patsubst %.sh,%,$(SCRIPT_SH)) \
667 $(patsubst %.perl,%,$(SCRIPT_PERL)) \
668 : PERF-VERSION-FILE
669
670%.o: %.c PERF-CFLAGS
671 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
672%.s: %.c PERF-CFLAGS
673 $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
674%.o: %.S
675 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
676
677util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
678 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
679 '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
680 '-DBINDIR="$(bindir_relative_SQ)"' \
681 '-DPREFIX="$(prefix_SQ)"' \
682 $<
683
684builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
685 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
686
687util/config.o: util/config.c PERF-CFLAGS
688 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
689
690perf-%$X: %.o $(PERFLIBS)
691 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
692
693$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
694$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
695builtin-revert.o wt-status.o: wt-status.h
696
697$(LIB_FILE): $(LIB_OBJS)
698 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
699
700doc:
701 $(MAKE) -C Documentation all
702
703man:
704 $(MAKE) -C Documentation man
705
706html:
707 $(MAKE) -C Documentation html
708
709info:
710 $(MAKE) -C Documentation info
711
712pdf:
713 $(MAKE) -C Documentation pdf
714
715TAGS:
716 $(RM) TAGS
717 $(FIND) . -name '*.[hcS]' -print | xargs etags -a
718
719tags:
720 $(RM) tags
721 $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
722
723cscope:
724 $(RM) cscope*
725 $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
726
727### Detect prefix changes
728TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
729 $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
730
731PERF-CFLAGS: .FORCE-PERF-CFLAGS
732 @FLAGS='$(TRACK_CFLAGS)'; \
733 if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
734 echo 1>&2 " * new build flags or prefix"; \
735 echo "$$FLAGS" >PERF-CFLAGS; \
736 fi
737
738# We need to apply sq twice, once to protect from the shell
739# that runs PERF-BUILD-OPTIONS, and then again to protect it
740# and the first level quoting from the shell that runs "echo".
741PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
742 @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
743 @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
744 @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
745 @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
746
747### Testing rules
748
749#
750# None right now:
751#
752# TEST_PROGRAMS += test-something$X
753
754all:: $(TEST_PROGRAMS)
755
756# GNU make supports exporting all variables by "export" without parameters.
757# However, the environment gets quite big, and some programs have problems
758# with that.
759
760export NO_SVN_TESTS
761
762check: common-cmds.h # runs sparse on *.c and */*.c with ALL_CFLAGS and SPARSE_FLAGS; stops at the first failing file
763 if sparse; \
764 then \
765 for i in *.c */*.c; \
766 do \
767 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
768 done; \
769 else \
770 echo 1>&2 "Did you mean 'make test'?"; \
771 exit 1; \
772 fi
773
774remove-dashes:
775 ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
776
777### Installation rules
778
779ifneq ($(filter /%,$(firstword $(template_dir))),)
780template_instdir = $(template_dir)
781else
782template_instdir = $(prefix)/$(template_dir)
783endif
784export template_instdir
785
786ifneq ($(filter /%,$(firstword $(perfexecdir))),)
787perfexec_instdir = $(perfexecdir)
788else
789perfexec_instdir = $(prefix)/$(perfexecdir)
790endif
791perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
792export perfexec_instdir
793
794install: all
795 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
796 $(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
797ifdef BUILT_INS
798 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
799 $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
800ifneq (,$X)
801 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
802endif
803endif
804
805install-doc:
806 $(MAKE) -C Documentation install
807
808install-man:
809 $(MAKE) -C Documentation install-man
810
811install-html:
812 $(MAKE) -C Documentation install-html
813
814install-info:
815 $(MAKE) -C Documentation install-info
816
817install-pdf:
818 $(MAKE) -C Documentation install-pdf
819
820quick-install-doc:
821 $(MAKE) -C Documentation quick-install
822
823quick-install-man:
824 $(MAKE) -C Documentation quick-install-man
825
826quick-install-html:
827 $(MAKE) -C Documentation quick-install-html
828
829
830### Maintainer's dist rules
831#
832# None right now
833#
834#
835# perf.spec: perf.spec.in
836# sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
837# mv $@+ $@
838#
839# PERF_TARNAME=perf-$(PERF_VERSION)
840# dist: perf.spec perf-archive$(X) configure
841# ./perf-archive --format=tar \
842# --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
843# @mkdir -p $(PERF_TARNAME)
844# @cp perf.spec configure $(PERF_TARNAME)
845# @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
846# $(TAR) rf $(PERF_TARNAME).tar \
847# $(PERF_TARNAME)/perf.spec \
848# $(PERF_TARNAME)/configure \
849# $(PERF_TARNAME)/version
850# @$(RM) -r $(PERF_TARNAME)
851# gzip -f -9 $(PERF_TARNAME).tar
852#
853# htmldocs = perf-htmldocs-$(PERF_VERSION)
854# manpages = perf-manpages-$(PERF_VERSION)
855# dist-doc:
856# $(RM) -r .doc-tmp-dir
857# mkdir .doc-tmp-dir
858# $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
859# cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
860# gzip -n -9 -f $(htmldocs).tar
861# :
862# $(RM) -r .doc-tmp-dir
863# mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
864# $(MAKE) -C Documentation DESTDIR=./ \
865# man1dir=../.doc-tmp-dir/man1 \
866# man5dir=../.doc-tmp-dir/man5 \
867# man7dir=../.doc-tmp-dir/man7 \
868# install
869# cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
870# gzip -n -9 -f $(manpages).tar
871# $(RM) -r .doc-tmp-dir
872#
873# rpm: dist
874# $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
875
876### Cleaning rules
877
878distclean: clean
879# $(RM) configure
880
881clean:
882 $(RM) *.o */*.o $(LIB_FILE)
883 $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
884 $(RM) $(TEST_PROGRAMS)
885 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
886 $(RM) -r autom4te.cache
887 $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
888 $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
889 $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
890 $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
891 $(MAKE) -C Documentation/ clean
892 $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
893
894.PHONY: all install clean strip
895.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
896.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
897.PHONY: .FORCE-PERF-BUILD-OPTIONS
898
899### Make sure built-ins do not have dups and listed in perf.c
900#
901check-builtins::
902 ./check-builtins.sh
903
904### Test suite coverage testing
905#
906# None right now
907#
908# .PHONY: coverage coverage-clean coverage-build coverage-report
909#
910# coverage:
911# $(MAKE) coverage-build
912# $(MAKE) coverage-report
913#
914# coverage-clean:
915# rm -f *.gcda *.gcno
916#
917# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
918# COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
919#
920# coverage-build: coverage-clean
921# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
922# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
923# -j1 test
924#
925# coverage-report:
926# gcov -b *.c */*.c
927# grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
928# | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
929# | tee coverage-untested-functions
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
new file mode 100644
index 000000000000..b1ed5f766cb3
--- /dev/null
+++ b/tools/perf/builtin-annotate.c
@@ -0,0 +1,1356 @@
1/*
2 * builtin-annotate.c
3 *
4 * Builtin annotate command: Analyze the perf.data input file,
5 * look up and read DSOs and symbol information and display
6 * a histogram of results, along various sorting keys.
7 */
8#include "builtin.h"
9
10#include "util/util.h"
11
12#include "util/color.h"
13#include "util/list.h"
14#include "util/cache.h"
15#include "util/rbtree.h"
16#include "util/symbol.h"
17#include "util/string.h"
18
19#include "perf.h"
20
21#include "util/parse-options.h"
22#include "util/parse-events.h"
23
/* Bits for show_mask: which sample origins are accounted. */
#define SHOW_KERNEL	1
#define SHOW_USER	2
#define SHOW_HV		4

/* Input file to read; settable with -i/--input. */
static char const	*input_name = "perf.data";
/* Kernel image path; settable with -k/--vmlinux. */
static char		*vmlinux = "vmlinux";

static char		default_sort_order[] = "comm,symbol";
static char		*sort_order = default_sort_order;

/* File descriptor of the opened input file. */
static int		input;
static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;

/* Set by -D/--dump-raw-trace; gates every dprintf(). */
static int		dump_trace;
#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)

static int		verbose;

static unsigned long	page_size;
/* Number of pages of the input file that are mmap'ed at a time. */
static unsigned long	mmap_window = 32;
44
45struct ip_event {
46 struct perf_event_header header;
47 __u64 ip;
48 __u32 pid, tid;
49};
50
51struct mmap_event {
52 struct perf_event_header header;
53 __u32 pid, tid;
54 __u64 start;
55 __u64 len;
56 __u64 pgoff;
57 char filename[PATH_MAX];
58};
59
60struct comm_event {
61 struct perf_event_header header;
62 __u32 pid, tid;
63 char comm[16];
64};
65
66struct fork_event {
67 struct perf_event_header header;
68 __u32 pid, ppid;
69};
70
71struct period_event {
72 struct perf_event_header header;
73 __u64 time;
74 __u64 id;
75 __u64 sample_period;
76};
77
78typedef union event_union {
79 struct perf_event_header header;
80 struct ip_event ip;
81 struct mmap_event mmap;
82 struct comm_event comm;
83 struct fork_event fork;
84 struct period_event period;
85} event_t;
86
87static LIST_HEAD(dsos);
88static struct dso *kernel_dso;
89static struct dso *vdso;
90
91
92static void dsos__add(struct dso *dso)
93{
94 list_add_tail(&dso->node, &dsos);
95}
96
97static struct dso *dsos__find(const char *name)
98{
99 struct dso *pos;
100
101 list_for_each_entry(pos, &dsos, node)
102 if (strcmp(pos->name, name) == 0)
103 return pos;
104 return NULL;
105}
106
107static struct dso *dsos__findnew(const char *name)
108{
109 struct dso *dso = dsos__find(name);
110 int nr;
111
112 if (dso)
113 return dso;
114
115 dso = dso__new(name, 0);
116 if (!dso)
117 goto out_delete_dso;
118
119 nr = dso__load(dso, NULL, verbose);
120 if (nr < 0) {
121 if (verbose)
122 fprintf(stderr, "Failed to open: %s\n", name);
123 goto out_delete_dso;
124 }
125 if (!nr && verbose) {
126 fprintf(stderr,
127 "No symbols found in: %s, maybe install a debug package?\n",
128 name);
129 }
130
131 dsos__add(dso);
132
133 return dso;
134
135out_delete_dso:
136 dso__delete(dso);
137 return NULL;
138}
139
140static void dsos__fprintf(FILE *fp)
141{
142 struct dso *pos;
143
144 list_for_each_entry(pos, &dsos, node)
145 dso__fprintf(pos, fp);
146}
147
148static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
149{
150 return dso__find_symbol(kernel_dso, ip);
151}
152
153static int load_kernel(void)
154{
155 int err;
156
157 kernel_dso = dso__new("[kernel]", 0);
158 if (!kernel_dso)
159 return -1;
160
161 err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
162 if (err) {
163 dso__delete(kernel_dso);
164 kernel_dso = NULL;
165 } else
166 dsos__add(kernel_dso);
167
168 vdso = dso__new("[vdso]", 0);
169 if (!vdso)
170 return -1;
171
172 vdso->find_symbol = vdso__find_symbol;
173
174 dsos__add(vdso);
175
176 return err;
177}
178
179struct map {
180 struct list_head node;
181 __u64 start;
182 __u64 end;
183 __u64 pgoff;
184 __u64 (*map_ip)(struct map *, __u64);
185 struct dso *dso;
186};
187
188static __u64 map__map_ip(struct map *map, __u64 ip)
189{
190 return ip - map->start + map->pgoff;
191}
192
193static __u64 vdso__map_ip(struct map *map, __u64 ip)
194{
195 return ip;
196}
197
198static struct map *map__new(struct mmap_event *event)
199{
200 struct map *self = malloc(sizeof(*self));
201
202 if (self != NULL) {
203 const char *filename = event->filename;
204
205 self->start = event->start;
206 self->end = event->start + event->len;
207 self->pgoff = event->pgoff;
208
209 self->dso = dsos__findnew(filename);
210 if (self->dso == NULL)
211 goto out_delete;
212
213 if (self->dso == vdso)
214 self->map_ip = vdso__map_ip;
215 else
216 self->map_ip = map__map_ip;
217 }
218 return self;
219out_delete:
220 free(self);
221 return NULL;
222}
223
224static struct map *map__clone(struct map *self)
225{
226 struct map *map = malloc(sizeof(*self));
227
228 if (!map)
229 return NULL;
230
231 memcpy(map, self, sizeof(*self));
232
233 return map;
234}
235
236static int map__overlap(struct map *l, struct map *r)
237{
238 if (l->start > r->start) {
239 struct map *t = l;
240 l = r;
241 r = t;
242 }
243
244 if (l->end > r->start)
245 return 1;
246
247 return 0;
248}
249
250static size_t map__fprintf(struct map *self, FILE *fp)
251{
252 return fprintf(fp, " %Lx-%Lx %Lx %s\n",
253 self->start, self->end, self->pgoff, self->dso->name);
254}
255
256
257struct thread {
258 struct rb_node rb_node;
259 struct list_head maps;
260 pid_t pid;
261 char *comm;
262};
263
264static struct thread *thread__new(pid_t pid)
265{
266 struct thread *self = malloc(sizeof(*self));
267
268 if (self != NULL) {
269 self->pid = pid;
270 self->comm = malloc(32);
271 if (self->comm)
272 snprintf(self->comm, 32, ":%d", self->pid);
273 INIT_LIST_HEAD(&self->maps);
274 }
275
276 return self;
277}
278
279static int thread__set_comm(struct thread *self, const char *comm)
280{
281 if (self->comm)
282 free(self->comm);
283 self->comm = strdup(comm);
284 return self->comm ? 0 : -ENOMEM;
285}
286
287static size_t thread__fprintf(struct thread *self, FILE *fp)
288{
289 struct map *pos;
290 size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
291
292 list_for_each_entry(pos, &self->maps, node)
293 ret += map__fprintf(pos, fp);
294
295 return ret;
296}
297
298
299static struct rb_root threads;
300static struct thread *last_match;
301
302static struct thread *threads__findnew(pid_t pid)
303{
304 struct rb_node **p = &threads.rb_node;
305 struct rb_node *parent = NULL;
306 struct thread *th;
307
308 /*
309 * Font-end cache - PID lookups come in blocks,
310 * so most of the time we dont have to look up
311 * the full rbtree:
312 */
313 if (last_match && last_match->pid == pid)
314 return last_match;
315
316 while (*p != NULL) {
317 parent = *p;
318 th = rb_entry(parent, struct thread, rb_node);
319
320 if (th->pid == pid) {
321 last_match = th;
322 return th;
323 }
324
325 if (pid < th->pid)
326 p = &(*p)->rb_left;
327 else
328 p = &(*p)->rb_right;
329 }
330
331 th = thread__new(pid);
332 if (th != NULL) {
333 rb_link_node(&th->rb_node, parent, p);
334 rb_insert_color(&th->rb_node, &threads);
335 last_match = th;
336 }
337
338 return th;
339}
340
341static void thread__insert_map(struct thread *self, struct map *map)
342{
343 struct map *pos, *tmp;
344
345 list_for_each_entry_safe(pos, tmp, &self->maps, node) {
346 if (map__overlap(pos, map)) {
347 list_del_init(&pos->node);
348 /* XXX leaks dsos */
349 free(pos);
350 }
351 }
352
353 list_add_tail(&map->node, &self->maps);
354}
355
356static int thread__fork(struct thread *self, struct thread *parent)
357{
358 struct map *map;
359
360 if (self->comm)
361 free(self->comm);
362 self->comm = strdup(parent->comm);
363 if (!self->comm)
364 return -ENOMEM;
365
366 list_for_each_entry(map, &parent->maps, node) {
367 struct map *new = map__clone(map);
368 if (!new)
369 return -ENOMEM;
370 thread__insert_map(self, new);
371 }
372
373 return 0;
374}
375
376static struct map *thread__find_map(struct thread *self, __u64 ip)
377{
378 struct map *pos;
379
380 if (self == NULL)
381 return NULL;
382
383 list_for_each_entry(pos, &self->maps, node)
384 if (ip >= pos->start && ip <= pos->end)
385 return pos;
386
387 return NULL;
388}
389
390static size_t threads__fprintf(FILE *fp)
391{
392 size_t ret = 0;
393 struct rb_node *nd;
394
395 for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
396 struct thread *pos = rb_entry(nd, struct thread, rb_node);
397
398 ret += thread__fprintf(pos, fp);
399 }
400
401 return ret;
402}
403
404/*
405 * histogram, sorted on item, collects counts
406 */
407
408static struct rb_root hist;
409
410struct hist_entry {
411 struct rb_node rb_node;
412
413 struct thread *thread;
414 struct map *map;
415 struct dso *dso;
416 struct symbol *sym;
417 __u64 ip;
418 char level;
419
420 uint32_t count;
421};
422
423/*
424 * configurable sorting bits
425 */
426
427struct sort_entry {
428 struct list_head list;
429
430 char *header;
431
432 int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
433 int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
434 size_t (*print)(FILE *fp, struct hist_entry *);
435};
436
437/* --sort pid */
438
439static int64_t
440sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
441{
442 return right->thread->pid - left->thread->pid;
443}
444
445static size_t
446sort__thread_print(FILE *fp, struct hist_entry *self)
447{
448 return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
449}
450
451static struct sort_entry sort_thread = {
452 .header = " Command: Pid",
453 .cmp = sort__thread_cmp,
454 .print = sort__thread_print,
455};
456
457/* --sort comm */
458
459static int64_t
460sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
461{
462 return right->thread->pid - left->thread->pid;
463}
464
465static int64_t
466sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
467{
468 char *comm_l = left->thread->comm;
469 char *comm_r = right->thread->comm;
470
471 if (!comm_l || !comm_r) {
472 if (!comm_l && !comm_r)
473 return 0;
474 else if (!comm_l)
475 return -1;
476 else
477 return 1;
478 }
479
480 return strcmp(comm_l, comm_r);
481}
482
483static size_t
484sort__comm_print(FILE *fp, struct hist_entry *self)
485{
486 return fprintf(fp, "%16s", self->thread->comm);
487}
488
489static struct sort_entry sort_comm = {
490 .header = " Command",
491 .cmp = sort__comm_cmp,
492 .collapse = sort__comm_collapse,
493 .print = sort__comm_print,
494};
495
496/* --sort dso */
497
498static int64_t
499sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
500{
501 struct dso *dso_l = left->dso;
502 struct dso *dso_r = right->dso;
503
504 if (!dso_l || !dso_r) {
505 if (!dso_l && !dso_r)
506 return 0;
507 else if (!dso_l)
508 return -1;
509 else
510 return 1;
511 }
512
513 return strcmp(dso_l->name, dso_r->name);
514}
515
516static size_t
517sort__dso_print(FILE *fp, struct hist_entry *self)
518{
519 if (self->dso)
520 return fprintf(fp, "%-25s", self->dso->name);
521
522 return fprintf(fp, "%016llx ", (__u64)self->ip);
523}
524
525static struct sort_entry sort_dso = {
526 .header = "Shared Object ",
527 .cmp = sort__dso_cmp,
528 .print = sort__dso_print,
529};
530
531/* --sort symbol */
532
533static int64_t
534sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
535{
536 __u64 ip_l, ip_r;
537
538 if (left->sym == right->sym)
539 return 0;
540
541 ip_l = left->sym ? left->sym->start : left->ip;
542 ip_r = right->sym ? right->sym->start : right->ip;
543
544 return (int64_t)(ip_r - ip_l);
545}
546
547static size_t
548sort__sym_print(FILE *fp, struct hist_entry *self)
549{
550 size_t ret = 0;
551
552 if (verbose)
553 ret += fprintf(fp, "%#018llx ", (__u64)self->ip);
554
555 if (self->sym) {
556 ret += fprintf(fp, "[%c] %s",
557 self->dso == kernel_dso ? 'k' : '.', self->sym->name);
558 } else {
559 ret += fprintf(fp, "%#016llx", (__u64)self->ip);
560 }
561
562 return ret;
563}
564
565static struct sort_entry sort_sym = {
566 .header = "Symbol",
567 .cmp = sort__sym_cmp,
568 .print = sort__sym_print,
569};
570
571static int sort__need_collapse = 0;
572
573struct sort_dimension {
574 char *name;
575 struct sort_entry *entry;
576 int taken;
577};
578
579static struct sort_dimension sort_dimensions[] = {
580 { .name = "pid", .entry = &sort_thread, },
581 { .name = "comm", .entry = &sort_comm, },
582 { .name = "dso", .entry = &sort_dso, },
583 { .name = "symbol", .entry = &sort_sym, },
584};
585
586static LIST_HEAD(hist_entry__sort_list);
587
588static int sort_dimension__add(char *tok)
589{
590 int i;
591
592 for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
593 struct sort_dimension *sd = &sort_dimensions[i];
594
595 if (sd->taken)
596 continue;
597
598 if (strncasecmp(tok, sd->name, strlen(tok)))
599 continue;
600
601 if (sd->entry->collapse)
602 sort__need_collapse = 1;
603
604 list_add_tail(&sd->entry->list, &hist_entry__sort_list);
605 sd->taken = 1;
606
607 return 0;
608 }
609
610 return -ESRCH;
611}
612
613static int64_t
614hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
615{
616 struct sort_entry *se;
617 int64_t cmp = 0;
618
619 list_for_each_entry(se, &hist_entry__sort_list, list) {
620 cmp = se->cmp(left, right);
621 if (cmp)
622 break;
623 }
624
625 return cmp;
626}
627
628static int64_t
629hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
630{
631 struct sort_entry *se;
632 int64_t cmp = 0;
633
634 list_for_each_entry(se, &hist_entry__sort_list, list) {
635 int64_t (*f)(struct hist_entry *, struct hist_entry *);
636
637 f = se->collapse ?: se->cmp;
638
639 cmp = f(left, right);
640 if (cmp)
641 break;
642 }
643
644 return cmp;
645}
646
647/*
648 * collect histogram counts
649 */
650static void hist_hit(struct hist_entry *he, __u64 ip)
651{
652 unsigned int sym_size, offset;
653 struct symbol *sym = he->sym;
654
655 he->count++;
656
657 if (!sym || !sym->hist)
658 return;
659
660 sym_size = sym->end - sym->start;
661 offset = ip - sym->start;
662
663 if (offset >= sym_size)
664 return;
665
666 sym->hist_sum++;
667 sym->hist[offset]++;
668
669 if (verbose >= 3)
670 printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
671 (void *)(unsigned long)he->sym->start,
672 he->sym->name,
673 (void *)(unsigned long)ip, ip - he->sym->start,
674 sym->hist[offset]);
675}
676
677static int
678hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
679 struct symbol *sym, __u64 ip, char level)
680{
681 struct rb_node **p = &hist.rb_node;
682 struct rb_node *parent = NULL;
683 struct hist_entry *he;
684 struct hist_entry entry = {
685 .thread = thread,
686 .map = map,
687 .dso = dso,
688 .sym = sym,
689 .ip = ip,
690 .level = level,
691 .count = 1,
692 };
693 int cmp;
694
695 while (*p != NULL) {
696 parent = *p;
697 he = rb_entry(parent, struct hist_entry, rb_node);
698
699 cmp = hist_entry__cmp(&entry, he);
700
701 if (!cmp) {
702 hist_hit(he, ip);
703
704 return 0;
705 }
706
707 if (cmp < 0)
708 p = &(*p)->rb_left;
709 else
710 p = &(*p)->rb_right;
711 }
712
713 he = malloc(sizeof(*he));
714 if (!he)
715 return -ENOMEM;
716 *he = entry;
717 rb_link_node(&he->rb_node, parent, p);
718 rb_insert_color(&he->rb_node, &hist);
719
720 return 0;
721}
722
723static void hist_entry__free(struct hist_entry *he)
724{
725 free(he);
726}
727
728/*
729 * collapse the histogram
730 */
731
732static struct rb_root collapse_hists;
733
734static void collapse__insert_entry(struct hist_entry *he)
735{
736 struct rb_node **p = &collapse_hists.rb_node;
737 struct rb_node *parent = NULL;
738 struct hist_entry *iter;
739 int64_t cmp;
740
741 while (*p != NULL) {
742 parent = *p;
743 iter = rb_entry(parent, struct hist_entry, rb_node);
744
745 cmp = hist_entry__collapse(iter, he);
746
747 if (!cmp) {
748 iter->count += he->count;
749 hist_entry__free(he);
750 return;
751 }
752
753 if (cmp < 0)
754 p = &(*p)->rb_left;
755 else
756 p = &(*p)->rb_right;
757 }
758
759 rb_link_node(&he->rb_node, parent, p);
760 rb_insert_color(&he->rb_node, &collapse_hists);
761}
762
763static void collapse__resort(void)
764{
765 struct rb_node *next;
766 struct hist_entry *n;
767
768 if (!sort__need_collapse)
769 return;
770
771 next = rb_first(&hist);
772 while (next) {
773 n = rb_entry(next, struct hist_entry, rb_node);
774 next = rb_next(&n->rb_node);
775
776 rb_erase(&n->rb_node, &hist);
777 collapse__insert_entry(n);
778 }
779}
780
781/*
782 * reverse the map, sort on count.
783 */
784
785static struct rb_root output_hists;
786
787static void output__insert_entry(struct hist_entry *he)
788{
789 struct rb_node **p = &output_hists.rb_node;
790 struct rb_node *parent = NULL;
791 struct hist_entry *iter;
792
793 while (*p != NULL) {
794 parent = *p;
795 iter = rb_entry(parent, struct hist_entry, rb_node);
796
797 if (he->count > iter->count)
798 p = &(*p)->rb_left;
799 else
800 p = &(*p)->rb_right;
801 }
802
803 rb_link_node(&he->rb_node, parent, p);
804 rb_insert_color(&he->rb_node, &output_hists);
805}
806
807static void output__resort(void)
808{
809 struct rb_node *next;
810 struct hist_entry *n;
811 struct rb_root *tree = &hist;
812
813 if (sort__need_collapse)
814 tree = &collapse_hists;
815
816 next = rb_first(tree);
817
818 while (next) {
819 n = rb_entry(next, struct hist_entry, rb_node);
820 next = rb_next(&n->rb_node);
821
822 rb_erase(&n->rb_node, tree);
823 output__insert_entry(n);
824 }
825}
826
/* Create the thread for pid 0 and name it "[idle]"; exits on failure. */
static void register_idle_thread(void)
{
	struct thread *idle = threads__findnew(0);

	if (idle != NULL && thread__set_comm(idle, "[idle]") == 0)
		return;

	fprintf(stderr, "problem inserting idle task.\n");
	exit(-1);
}
837
/* Event counters, printed by __cmd_annotate() through dprintf(). */
static unsigned long total;		/* sample (overflow) events */
static unsigned long total_mmap;
static unsigned long total_comm;
static unsigned long total_fork;
static unsigned long total_unknown;	/* events that could not be processed */
843
844static int
845process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
846{
847 char level;
848 int show = 0;
849 struct dso *dso = NULL;
850 struct thread *thread = threads__findnew(event->ip.pid);
851 __u64 ip = event->ip.ip;
852 struct map *map = NULL;
853
854 dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
855 (void *)(offset + head),
856 (void *)(long)(event->header.size),
857 event->header.misc,
858 event->ip.pid,
859 (void *)(long)ip);
860
861 dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
862
863 if (thread == NULL) {
864 fprintf(stderr, "problem processing %d event, skipping it.\n",
865 event->header.type);
866 return -1;
867 }
868
869 if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
870 show = SHOW_KERNEL;
871 level = 'k';
872
873 dso = kernel_dso;
874
875 dprintf(" ...... dso: %s\n", dso->name);
876
877 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
878
879 show = SHOW_USER;
880 level = '.';
881
882 map = thread__find_map(thread, ip);
883 if (map != NULL) {
884 ip = map->map_ip(map, ip);
885 dso = map->dso;
886 } else {
887 /*
888 * If this is outside of all known maps,
889 * and is a negative address, try to look it
890 * up in the kernel dso, as it might be a
891 * vsyscall (which executes in user-mode):
892 */
893 if ((long long)ip < 0)
894 dso = kernel_dso;
895 }
896 dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
897
898 } else {
899 show = SHOW_HV;
900 level = 'H';
901 dprintf(" ...... dso: [hypervisor]\n");
902 }
903
904 if (show & show_mask) {
905 struct symbol *sym = NULL;
906
907 if (dso)
908 sym = dso->find_symbol(dso, ip);
909
910 if (hist_entry__add(thread, map, dso, sym, ip, level)) {
911 fprintf(stderr,
912 "problem incrementing symbol count, skipping event\n");
913 return -1;
914 }
915 }
916 total++;
917
918 return 0;
919}
920
921static int
922process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
923{
924 struct thread *thread = threads__findnew(event->mmap.pid);
925 struct map *map = map__new(&event->mmap);
926
927 dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
928 (void *)(offset + head),
929 (void *)(long)(event->header.size),
930 event->mmap.pid,
931 (void *)(long)event->mmap.start,
932 (void *)(long)event->mmap.len,
933 (void *)(long)event->mmap.pgoff,
934 event->mmap.filename);
935
936 if (thread == NULL || map == NULL) {
937 dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
938 return 0;
939 }
940
941 thread__insert_map(thread, map);
942 total_mmap++;
943
944 return 0;
945}
946
947static int
948process_comm_event(event_t *event, unsigned long offset, unsigned long head)
949{
950 struct thread *thread = threads__findnew(event->comm.pid);
951
952 dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
953 (void *)(offset + head),
954 (void *)(long)(event->header.size),
955 event->comm.comm, event->comm.pid);
956
957 if (thread == NULL ||
958 thread__set_comm(thread, event->comm.comm)) {
959 dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
960 return -1;
961 }
962 total_comm++;
963
964 return 0;
965}
966
967static int
968process_fork_event(event_t *event, unsigned long offset, unsigned long head)
969{
970 struct thread *thread = threads__findnew(event->fork.pid);
971 struct thread *parent = threads__findnew(event->fork.ppid);
972
973 dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
974 (void *)(offset + head),
975 (void *)(long)(event->header.size),
976 event->fork.pid, event->fork.ppid);
977
978 if (!thread || !parent || thread__fork(thread, parent)) {
979 dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
980 return -1;
981 }
982 total_fork++;
983
984 return 0;
985}
986
987static int
988process_period_event(event_t *event, unsigned long offset, unsigned long head)
989{
990 dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
991 (void *)(offset + head),
992 (void *)(long)(event->header.size),
993 event->period.time,
994 event->period.id,
995 event->period.sample_period);
996
997 return 0;
998}
999
1000static int
1001process_event(event_t *event, unsigned long offset, unsigned long head)
1002{
1003 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
1004 return process_overflow_event(event, offset, head);
1005
1006 switch (event->header.type) {
1007 case PERF_EVENT_MMAP:
1008 return process_mmap_event(event, offset, head);
1009
1010 case PERF_EVENT_COMM:
1011 return process_comm_event(event, offset, head);
1012
1013 case PERF_EVENT_FORK:
1014 return process_fork_event(event, offset, head);
1015
1016 case PERF_EVENT_PERIOD:
1017 return process_period_event(event, offset, head);
1018 /*
1019 * We dont process them right now but they are fine:
1020 */
1021
1022 case PERF_EVENT_THROTTLE:
1023 case PERF_EVENT_UNTHROTTLE:
1024 return 0;
1025
1026 default:
1027 return -1;
1028 }
1029
1030 return 0;
1031}
1032
1033static int
1034parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
1035{
1036 char *line = NULL, *tmp, *tmp2;
1037 unsigned int offset;
1038 size_t line_len;
1039 __u64 line_ip;
1040 int ret;
1041 char *c;
1042
1043 if (getline(&line, &line_len, file) < 0)
1044 return -1;
1045 if (!line)
1046 return -1;
1047
1048 c = strchr(line, '\n');
1049 if (c)
1050 *c = 0;
1051
1052 line_ip = -1;
1053 offset = 0;
1054 ret = -2;
1055
1056 /*
1057 * Strip leading spaces:
1058 */
1059 tmp = line;
1060 while (*tmp) {
1061 if (*tmp != ' ')
1062 break;
1063 tmp++;
1064 }
1065
1066 if (*tmp) {
1067 /*
1068 * Parse hexa addresses followed by ':'
1069 */
1070 line_ip = strtoull(tmp, &tmp2, 16);
1071 if (*tmp2 != ':')
1072 line_ip = -1;
1073 }
1074
1075 if (line_ip != -1) {
1076 unsigned int hits = 0;
1077 double percent = 0.0;
1078 char *color = PERF_COLOR_NORMAL;
1079
1080 offset = line_ip - start;
1081 if (offset < len)
1082 hits = sym->hist[offset];
1083
1084 if (sym->hist_sum)
1085 percent = 100.0 * hits / sym->hist_sum;
1086
1087 /*
1088 * We color high-overhead entries in red, mid-overhead
1089 * entries in green - and keep the low overhead places
1090 * normal:
1091 */
1092 if (percent >= 5.0)
1093 color = PERF_COLOR_RED;
1094 else {
1095 if (percent > 0.5)
1096 color = PERF_COLOR_GREEN;
1097 }
1098
1099 color_fprintf(stdout, color, " %7.2f", percent);
1100 printf(" : ");
1101 color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
1102 } else {
1103 if (!*line)
1104 printf(" :\n");
1105 else
1106 printf(" : %s\n", line);
1107 }
1108
1109 return 0;
1110}
1111
1112static void annotate_sym(struct dso *dso, struct symbol *sym)
1113{
1114 char *filename = dso->name;
1115 __u64 start, end, len;
1116 char command[PATH_MAX*2];
1117 FILE *file;
1118
1119 if (!filename)
1120 return;
1121 if (dso == kernel_dso)
1122 filename = vmlinux;
1123
1124 printf("\n------------------------------------------------\n");
1125 printf(" Percent | Source code & Disassembly of %s\n", filename);
1126 printf("------------------------------------------------\n");
1127
1128 if (verbose >= 2)
1129 printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
1130
1131 start = sym->obj_start;
1132 if (!start)
1133 start = sym->start;
1134
1135 end = start + sym->end - sym->start + 1;
1136 len = sym->end - sym->start;
1137
1138 sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
1139
1140 if (verbose >= 3)
1141 printf("doing: %s\n", command);
1142
1143 file = popen(command, "r");
1144 if (!file)
1145 return;
1146
1147 while (!feof(file)) {
1148 if (parse_line(file, sym, start, len) < 0)
1149 break;
1150 }
1151
1152 pclose(file);
1153}
1154
1155static void find_annotations(void)
1156{
1157 struct rb_node *nd;
1158 struct dso *dso;
1159 int count = 0;
1160
1161 list_for_each_entry(dso, &dsos, node) {
1162
1163 for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
1164 struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
1165
1166 if (sym->hist) {
1167 annotate_sym(dso, sym);
1168 count++;
1169 }
1170 }
1171 }
1172
1173 if (!count)
1174 printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
1175}
1176
1177static int __cmd_annotate(void)
1178{
1179 int ret, rc = EXIT_FAILURE;
1180 unsigned long offset = 0;
1181 unsigned long head = 0;
1182 struct stat stat;
1183 event_t *event;
1184 uint32_t size;
1185 char *buf;
1186
1187 register_idle_thread();
1188
1189 input = open(input_name, O_RDONLY);
1190 if (input < 0) {
1191 perror("failed to open file");
1192 exit(-1);
1193 }
1194
1195 ret = fstat(input, &stat);
1196 if (ret < 0) {
1197 perror("failed to stat file");
1198 exit(-1);
1199 }
1200
1201 if (!stat.st_size) {
1202 fprintf(stderr, "zero-sized file, nothing to do!\n");
1203 exit(0);
1204 }
1205
1206 if (load_kernel() < 0) {
1207 perror("failed to load kernel symbols");
1208 return EXIT_FAILURE;
1209 }
1210
1211remap:
1212 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
1213 MAP_SHARED, input, offset);
1214 if (buf == MAP_FAILED) {
1215 perror("failed to mmap file");
1216 exit(-1);
1217 }
1218
1219more:
1220 event = (event_t *)(buf + head);
1221
1222 size = event->header.size;
1223 if (!size)
1224 size = 8;
1225
1226 if (head + event->header.size >= page_size * mmap_window) {
1227 unsigned long shift = page_size * (head / page_size);
1228 int ret;
1229
1230 ret = munmap(buf, page_size * mmap_window);
1231 assert(ret == 0);
1232
1233 offset += shift;
1234 head -= shift;
1235 goto remap;
1236 }
1237
1238 size = event->header.size;
1239
1240 dprintf("%p [%p]: event: %d\n",
1241 (void *)(offset + head),
1242 (void *)(long)event->header.size,
1243 event->header.type);
1244
1245 if (!size || process_event(event, offset, head) < 0) {
1246
1247 dprintf("%p [%p]: skipping unknown header type: %d\n",
1248 (void *)(offset + head),
1249 (void *)(long)(event->header.size),
1250 event->header.type);
1251
1252 total_unknown++;
1253
1254 /*
1255 * assume we lost track of the stream, check alignment, and
1256 * increment a single u64 in the hope to catch on again 'soon'.
1257 */
1258
1259 if (unlikely(head & 7))
1260 head &= ~7ULL;
1261
1262 size = 8;
1263 }
1264
1265 head += size;
1266
1267 if (offset + head < stat.st_size)
1268 goto more;
1269
1270 rc = EXIT_SUCCESS;
1271 close(input);
1272
1273 dprintf(" IP events: %10ld\n", total);
1274 dprintf(" mmap events: %10ld\n", total_mmap);
1275 dprintf(" comm events: %10ld\n", total_comm);
1276 dprintf(" fork events: %10ld\n", total_fork);
1277 dprintf(" unknown events: %10ld\n", total_unknown);
1278
1279 if (dump_trace)
1280 return 0;
1281
1282 if (verbose >= 3)
1283 threads__fprintf(stdout);
1284
1285 if (verbose >= 2)
1286 dsos__fprintf(stdout);
1287
1288 collapse__resort();
1289 output__resort();
1290
1291 find_annotations();
1292
1293 return rc;
1294}
1295
1296static const char * const annotate_usage[] = {
1297 "perf annotate [<options>] <command>",
1298 NULL
1299};
1300
1301static const struct option options[] = {
1302 OPT_STRING('i', "input", &input_name, "file",
1303 "input file name"),
1304 OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
1305 "symbol to annotate"),
1306 OPT_BOOLEAN('v', "verbose", &verbose,
1307 "be more verbose (show symbol address, etc)"),
1308 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1309 "dump raw trace in ASCII"),
1310 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
1311 OPT_END()
1312};
1313
1314static void setup_sorting(void)
1315{
1316 char *tmp, *tok, *str = strdup(sort_order);
1317
1318 for (tok = strtok_r(str, ", ", &tmp);
1319 tok; tok = strtok_r(NULL, ", ", &tmp)) {
1320 if (sort_dimension__add(tok) < 0) {
1321 error("Unknown --sort key: `%s'", tok);
1322 usage_with_options(annotate_usage, options);
1323 }
1324 }
1325
1326 free(str);
1327}
1328
1329int cmd_annotate(int argc, const char **argv, const char *prefix)
1330{
1331 symbol__init();
1332
1333 page_size = getpagesize();
1334
1335 argc = parse_options(argc, argv, options, annotate_usage, 0);
1336
1337 setup_sorting();
1338
1339 if (argc) {
1340 /*
1341 * Special case: if there's an argument left then assume tha
1342 * it's a symbol filter:
1343 */
1344 if (argc > 1)
1345 usage_with_options(annotate_usage, options);
1346
1347 sym_hist_filter = argv[0];
1348 }
1349
1350 if (!sym_hist_filter)
1351 usage_with_options(annotate_usage, options);
1352
1353 setup_pager();
1354
1355 return __cmd_annotate();
1356}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
new file mode 100644
index 000000000000..0f32dc3f3c4c
--- /dev/null
+++ b/tools/perf/builtin-help.c
@@ -0,0 +1,461 @@
1/*
2 * builtin-help.c
3 *
4 * Builtin help command
5 */
6#include "util/cache.h"
7#include "builtin.h"
8#include "util/exec_cmd.h"
9#include "common-cmds.h"
10#include "util/parse-options.h"
11#include "util/run-command.h"
12#include "util/help.h"
13
14static struct man_viewer_list {
15 struct man_viewer_list *next;
16 char name[FLEX_ARRAY];
17} *man_viewer_list;
18
19static struct man_viewer_info_list {
20 struct man_viewer_info_list *next;
21 const char *info;
22 char name[FLEX_ARRAY];
23} *man_viewer_info_list;
24
25enum help_format {
26 HELP_FORMAT_MAN,
27 HELP_FORMAT_INFO,
28 HELP_FORMAT_WEB,
29};
30
31static int show_all = 0;
32static enum help_format help_format = HELP_FORMAT_MAN;
33static struct option builtin_help_options[] = {
34 OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
35 OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
36 OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
37 HELP_FORMAT_WEB),
38 OPT_SET_INT('i', "info", &help_format, "show info page",
39 HELP_FORMAT_INFO),
40 OPT_END(),
41};
42
43static const char * const builtin_help_usage[] = {
44 "perf help [--all] [--man|--web|--info] [command]",
45 NULL
46};
47
48static enum help_format parse_help_format(const char *format)
49{
50 if (!strcmp(format, "man"))
51 return HELP_FORMAT_MAN;
52 if (!strcmp(format, "info"))
53 return HELP_FORMAT_INFO;
54 if (!strcmp(format, "web") || !strcmp(format, "html"))
55 return HELP_FORMAT_WEB;
56 die("unrecognized help format '%s'", format);
57}
58
59static const char *get_man_viewer_info(const char *name)
60{
61 struct man_viewer_info_list *viewer;
62
63 for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
64 {
65 if (!strcasecmp(name, viewer->name))
66 return viewer->info;
67 }
68 return NULL;
69}
70
71static int check_emacsclient_version(void)
72{
73 struct strbuf buffer = STRBUF_INIT;
74 struct child_process ec_process;
75 const char *argv_ec[] = { "emacsclient", "--version", NULL };
76 int version;
77
78 /* emacsclient prints its version number on stderr */
79 memset(&ec_process, 0, sizeof(ec_process));
80 ec_process.argv = argv_ec;
81 ec_process.err = -1;
82 ec_process.stdout_to_stderr = 1;
83 if (start_command(&ec_process)) {
84 fprintf(stderr, "Failed to start emacsclient.\n");
85 return -1;
86 }
87 strbuf_read(&buffer, ec_process.err, 20);
88 close(ec_process.err);
89
90 /*
91 * Don't bother checking return value, because "emacsclient --version"
92 * seems to always exits with code 1.
93 */
94 finish_command(&ec_process);
95
96 if (prefixcmp(buffer.buf, "emacsclient")) {
97 fprintf(stderr, "Failed to parse emacsclient version.\n");
98 strbuf_release(&buffer);
99 return -1;
100 }
101
102 strbuf_remove(&buffer, 0, strlen("emacsclient"));
103 version = atoi(buffer.buf);
104
105 if (version < 22) {
106 fprintf(stderr,
107 "emacsclient version '%d' too old (< 22).\n",
108 version);
109 strbuf_release(&buffer);
110 return -1;
111 }
112
113 strbuf_release(&buffer);
114 return 0;
115}
116
117static void exec_woman_emacs(const char* path, const char *page)
118{
119 if (!check_emacsclient_version()) {
120 /* This works only with emacsclient version >= 22. */
121 struct strbuf man_page = STRBUF_INIT;
122
123 if (!path)
124 path = "emacsclient";
125 strbuf_addf(&man_page, "(woman \"%s\")", page);
126 execlp(path, "emacsclient", "-e", man_page.buf, NULL);
127 warning("failed to exec '%s': %s", path, strerror(errno));
128 }
129}
130
131static void exec_man_konqueror(const char* path, const char *page)
132{
133 const char *display = getenv("DISPLAY");
134 if (display && *display) {
135 struct strbuf man_page = STRBUF_INIT;
136 const char *filename = "kfmclient";
137
138 /* It's simpler to launch konqueror using kfmclient. */
139 if (path) {
140 const char *file = strrchr(path, '/');
141 if (file && !strcmp(file + 1, "konqueror")) {
142 char *new = strdup(path);
143 char *dest = strrchr(new, '/');
144
145 /* strlen("konqueror") == strlen("kfmclient") */
146 strcpy(dest + 1, "kfmclient");
147 path = new;
148 }
149 if (file)
150 filename = file;
151 } else
152 path = "kfmclient";
153 strbuf_addf(&man_page, "man:%s(1)", page);
154 execlp(path, filename, "newTab", man_page.buf, NULL);
155 warning("failed to exec '%s': %s", path, strerror(errno));
156 }
157}
158
/*
 * Show 'page' with man(1). 'path' is the binary to run; NULL selects
 * "man" from $PATH. Returns only when the exec failed, after a warning.
 */
static void exec_man_man(const char* path, const char *page)
{
	if (!path)
		path = "man";
	/* The variadic terminator must be a pointer, not a bare 0. */
	execlp(path, "man", page, (char *)NULL);
	warning("failed to exec '%s': %s", path, strerror(errno));
}
166
167static void exec_man_cmd(const char *cmd, const char *page)
168{
169 struct strbuf shell_cmd = STRBUF_INIT;
170 strbuf_addf(&shell_cmd, "%s %s", cmd, page);
171 execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
172 warning("failed to exec '%s': %s", cmd, strerror(errno));
173}
174
175static void add_man_viewer(const char *name)
176{
177 struct man_viewer_list **p = &man_viewer_list;
178 size_t len = strlen(name);
179
180 while (*p)
181 p = &((*p)->next);
182 *p = calloc(1, (sizeof(**p) + len + 1));
183 strncpy((*p)->name, name, len);
184}
185
/*
 * Tell whether the first 'len' bytes of 'name' spell one of the viewers
 * with built-in support ("man", "woman", "konqueror"), ignoring case.
 *
 * The length has to match as well: comparing only 'len' bytes would
 * accept any prefix of a viewer name, including the empty string.
 *
 * Returns 1 for a supported viewer, 0 otherwise.
 */
static int supported_man_viewer(const char *name, size_t len)
{
	static const char * const viewers[] = { "man", "woman", "konqueror" };
	size_t i;

	for (i = 0; i < sizeof(viewers) / sizeof(viewers[0]); i++) {
		if (strlen(viewers[i]) == len &&
		    !strncasecmp(viewers[i], name, len))
			return 1;
	}
	return 0;
}
192
193static void do_add_man_viewer_info(const char *name,
194 size_t len,
195 const char *value)
196{
197 struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
198
199 strncpy(new->name, name, len);
200 new->info = strdup(value);
201 new->next = man_viewer_info_list;
202 man_viewer_info_list = new;
203}
204
/*
 * Handle a "man.<tool>.path" setting: record 'value' as the path of the
 * viewer named by the first 'len' bytes of 'name'. Only the built-in
 * viewers take a path; any other tool just gets a warning.
 *
 * Always returns 0.
 */
static int add_man_viewer_path(const char *name,
			       size_t len,
			       const char *value)
{
	if (supported_man_viewer(name, len))
		do_add_man_viewer_info(name, len, value);
	else
		/* 'name' still carries the ".path" suffix: print the tool only */
		warning("'%.*s': path for unsupported man viewer.\n"
			"Please consider using 'man.<tool>.cmd' instead.",
			(int)len, name);

	return 0;
}
218
/*
 * Handle a "man.<tool>.cmd" setting: record 'value' as the command of
 * the viewer named by the first 'len' bytes of 'name'. The built-in
 * viewers are configured through ".path", so they just get a warning.
 *
 * Always returns 0.
 */
static int add_man_viewer_cmd(const char *name,
			      size_t len,
			      const char *value)
{
	if (supported_man_viewer(name, len))
		/* 'name' still carries the ".cmd" suffix: print the tool only */
		warning("'%.*s': cmd for supported man viewer.\n"
			"Please consider using 'man.<tool>.path' instead.",
			(int)len, name);
	else
		do_add_man_viewer_info(name, len, value);

	return 0;
}
232
/*
 * Handle a "man.<tool>.<subkey>" configuration variable.
 *
 * 'var' is the full variable name; the leading "man." (4 bytes) is
 * skipped and the text after the last '.' selects the handler: ".path"
 * or ".cmd". Any other sub key only triggers a warning.
 *
 * Returns the handler's result, the result of error() or
 * config_error_nonbool() for malformed input, and 0 for an unknown
 * sub key.
 */
static int add_man_viewer_info(const char *var, const char *value)
{
	const char *name = var + 4;
	const char *subkey = strrchr(name, '.');
	int is_path, is_cmd;

	if (subkey == NULL)
		return error("Config with no key for man viewer: %s", name);

	is_path = strcmp(subkey, ".path") == 0;
	is_cmd = !is_path && strcmp(subkey, ".cmd") == 0;

	if (!is_path && !is_cmd) {
		warning("'%s': unsupported man viewer sub key.", subkey);
		return 0;
	}

	/* Both sub keys need a value. */
	if (value == NULL)
		return config_error_nonbool(var);

	if (is_path)
		return add_man_viewer_path(name, subkey - name, value);
	return add_man_viewer_cmd(name, subkey - name, value);
}
255
256static int perf_help_config(const char *var, const char *value, void *cb)
257{
258 if (!strcmp(var, "help.format")) {
259 if (!value)
260 return config_error_nonbool(var);
261 help_format = parse_help_format(value);
262 return 0;
263 }
264 if (!strcmp(var, "man.viewer")) {
265 if (!value)
266 return config_error_nonbool(var);
267 add_man_viewer(value);
268 return 0;
269 }
270 if (!prefixcmp(var, "man."))
271 return add_man_viewer_info(var, value);
272
273 return perf_default_config(var, value, cb);
274}
275
276static struct cmdnames main_cmds, other_cmds;
277
278void list_common_cmds_help(void)
279{
280 int i, longest = 0;
281
282 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
283 if (longest < strlen(common_cmds[i].name))
284 longest = strlen(common_cmds[i].name);
285 }
286
287 puts(" The most commonly used perf commands are:");
288 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
289 printf(" %s ", common_cmds[i].name);
290 mput_char(' ', longest - strlen(common_cmds[i].name));
291 puts(common_cmds[i].help);
292 }
293}
294
295static int is_perf_command(const char *s)
296{
297 return is_in_cmdlist(&main_cmds, s) ||
298 is_in_cmdlist(&other_cmds, s);
299}
300
/*
 * Return a newly allocated string made of 'prefix' followed by 'cmd'.
 * The caller owns the result. Dies when memory cannot be allocated.
 */
static const char *prepend(const char *prefix, const char *cmd)
{
	size_t pre_len = strlen(prefix);
	size_t cmd_len = strlen(cmd);
	char *p = malloc(pre_len + cmd_len + 1);

	if (!p)
		die("Not enough memory to build the page name");

	memcpy(p, prefix, pre_len);
	/* "+ 1" copies cmd's terminating NUL as well */
	memcpy(p + pre_len, cmd, cmd_len + 1);
	return p;
}
310
/*
 * Translate a command name into the name of its manual page.
 *
 * NULL maps to "perf", a name that already starts with "perf" is
 * returned unchanged, and anything else gets a "perf-" prefix (a freshly
 * allocated string). Known and unknown commands are treated alike: the
 * two branches that used to tell them apart were identical.
 */
static const char *cmd_to_page(const char *perf_cmd)
{
	if (!perf_cmd)
		return "perf";
	if (!prefixcmp(perf_cmd, "perf"))
		return perf_cmd;
	return prepend("perf-", perf_cmd);
}
322
323static void setup_man_path(void)
324{
325 struct strbuf new_path = STRBUF_INIT;
326 const char *old_path = getenv("MANPATH");
327
328 /* We should always put ':' after our path. If there is no
329 * old_path, the ':' at the end will let 'man' to try
330 * system-wide paths after ours to find the manual page. If
331 * there is old_path, we need ':' as delimiter. */
332 strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
333 strbuf_addch(&new_path, ':');
334 if (old_path)
335 strbuf_addstr(&new_path, old_path);
336
337 setenv("MANPATH", new_path.buf, 1);
338
339 strbuf_release(&new_path);
340}
341
/*
 * Try to display 'page' with the viewer called 'name'.
 *
 * The built-in viewers ("man", "woman", "konqueror", compared without
 * regard to case) get the configured info string as their path. Any
 * other viewer needs a configured command, otherwise a warning is
 * printed.
 */
static void exec_viewer(const char *name, const char *page)
{
	const char *info = get_man_viewer_info(name);

	if (strcasecmp(name, "man") == 0) {
		exec_man_man(info, page);
		return;
	}
	if (strcasecmp(name, "woman") == 0) {
		exec_woman_emacs(info, page);
		return;
	}
	if (strcasecmp(name, "konqueror") == 0) {
		exec_man_konqueror(info, page);
		return;
	}

	if (info != NULL)
		exec_man_cmd(info, page);
	else
		warning("'%s': unknown man viewer.", name);
}
357
358static void show_man_page(const char *perf_cmd)
359{
360 struct man_viewer_list *viewer;
361 const char *page = cmd_to_page(perf_cmd);
362 const char *fallback = getenv("PERF_MAN_VIEWER");
363
364 setup_man_path();
365 for (viewer = man_viewer_list; viewer; viewer = viewer->next)
366 {
367 exec_viewer(viewer->name, page); /* will return when unable */
368 }
369 if (fallback)
370 exec_viewer(fallback, page);
371 exec_viewer("man", page);
372 die("no man viewer handled the request");
373}
374
375static void show_info_page(const char *perf_cmd)
376{
377 const char *page = cmd_to_page(perf_cmd);
378 setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
379 execlp("info", "info", "perfman", page, NULL);
380}
381
382static void get_html_page_path(struct strbuf *page_path, const char *page)
383{
384 struct stat st;
385 const char *html_path = system_path(PERF_HTML_PATH);
386
387 /* Check that we have a perf documentation directory. */
388 if (stat(mkpath("%s/perf.html", html_path), &st)
389 || !S_ISREG(st.st_mode))
390 die("'%s': not a documentation directory.", html_path);
391
392 strbuf_init(page_path, 0);
393 strbuf_addf(page_path, "%s/%s.html", html_path, page);
394}
395
/*
 * Default HTML opener: hand 'path' to the "web--browse" helper script
 * through execl_perf_cmd(). It is only compiled when no open_html macro
 * has been defined (a platform-specific override, see for example
 * compat/mingw.h).
 */
#ifndef open_html
static void open_html(const char *path)
{
	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
}
#endif
407
408static void show_html_page(const char *perf_cmd)
409{
410 const char *page = cmd_to_page(perf_cmd);
411 struct strbuf page_path; /* it leaks but we exec bellow */
412
413 get_html_page_path(&page_path, page);
414
415 open_html(page_path.buf);
416}
417
418int cmd_help(int argc, const char **argv, const char *prefix)
419{
420 const char *alias;
421 load_command_list("perf-", &main_cmds, &other_cmds);
422
423 perf_config(perf_help_config, NULL);
424
425 argc = parse_options(argc, argv, builtin_help_options,
426 builtin_help_usage, 0);
427
428 if (show_all) {
429 printf("\n usage: %s\n\n", perf_usage_string);
430 list_commands("perf commands", &main_cmds, &other_cmds);
431 printf(" %s\n\n", perf_more_info_string);
432 return 0;
433 }
434
435 if (!argv[0]) {
436 printf("\n usage: %s\n\n", perf_usage_string);
437 list_common_cmds_help();
438 printf("\n %s\n\n", perf_more_info_string);
439 return 0;
440 }
441
442 alias = alias_lookup(argv[0]);
443 if (alias && !is_perf_command(argv[0])) {
444 printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
445 return 0;
446 }
447
448 switch (help_format) {
449 case HELP_FORMAT_MAN:
450 show_man_page(argv[0]);
451 break;
452 case HELP_FORMAT_INFO:
453 show_info_page(argv[0]);
454 break;
455 case HELP_FORMAT_WEB:
456 show_html_page(argv[0]);
457 break;
458 }
459
460 return 0;
461}
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
new file mode 100644
index 000000000000..fe60e37c96ef
--- /dev/null
+++ b/tools/perf/builtin-list.c
@@ -0,0 +1,20 @@
1/*
2 * builtin-list.c
3 *
4 * Builtin list command: list all event types
5 *
6 * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
8 */
9#include "builtin.h"
10
11#include "perf.h"
12
13#include "util/parse-options.h"
14#include "util/parse-events.h"
15
/*
 * Entry point of "perf list": print the known events via print_events().
 * None of the arguments are looked at. Always returns 0.
 */
int cmd_list(int argc, const char **argv, const char *prefix)
{
	int status = 0;

	print_events();

	return status;
}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
new file mode 100644
index 000000000000..29259e74dcfa
--- /dev/null
+++ b/tools/perf/builtin-record.c
@@ -0,0 +1,582 @@
1/*
2 * builtin-record.c
3 *
4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
7 */
8#include "builtin.h"
9
10#include "perf.h"
11
12#include "util/util.h"
13#include "util/parse-options.h"
14#include "util/parse-events.h"
15#include "util/string.h"
16
17#include <unistd.h>
18#include <sched.h>
19
20#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1)
21#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask))
22
23static int fd[MAX_NR_CPUS][MAX_COUNTERS];
24
25static long default_interval = 100000;
26
27static int nr_cpus = 0;
28static unsigned int page_size;
29static unsigned int mmap_pages = 128;
30static int freq = 0;
31static int output;
32static const char *output_name = "perf.data";
33static int group = 0;
34static unsigned int realtime_prio = 0;
35static int system_wide = 0;
36static pid_t target_pid = -1;
37static int inherit = 1;
38static int force = 0;
39static int append_file = 0;
40static int verbose = 0;
41
42static long samples;
43static struct timeval last_read;
44static struct timeval this_read;
45
46static __u64 bytes_written;
47
48static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
49
50static int nr_poll;
51static int nr_cpu;
52
53struct mmap_event {
54 struct perf_event_header header;
55 __u32 pid;
56 __u32 tid;
57 __u64 start;
58 __u64 len;
59 __u64 pgoff;
60 char filename[PATH_MAX];
61};
62
63struct comm_event {
64 struct perf_event_header header;
65 __u32 pid;
66 __u32 tid;
67 char comm[16];
68};
69
70
71struct mmap_data {
72 int counter;
73 void *base;
74 unsigned int mask;
75 unsigned int prev;
76};
77
78static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
79
80static unsigned int mmap_read_head(struct mmap_data *md)
81{
82 struct perf_counter_mmap_page *pc = md->base;
83 int head;
84
85 head = pc->data_head;
86 rmb();
87
88 return head;
89}
90
91static void mmap_read(struct mmap_data *md)
92{
93 unsigned int head = mmap_read_head(md);
94 unsigned int old = md->prev;
95 unsigned char *data = md->base + page_size;
96 unsigned long size;
97 void *buf;
98 int diff;
99
100 gettimeofday(&this_read, NULL);
101
102 /*
103 * If we're further behind than half the buffer, there's a chance
104 * the writer will bite our tail and mess up the samples under us.
105 *
106 * If we somehow ended up ahead of the head, we got messed up.
107 *
108 * In either case, truncate and restart at head.
109 */
110 diff = head - old;
111 if (diff > md->mask / 2 || diff < 0) {
112 struct timeval iv;
113 unsigned long msecs;
114
115 timersub(&this_read, &last_read, &iv);
116 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
117
118 fprintf(stderr, "WARNING: failed to keep up with mmap data."
119 " Last read %lu msecs ago.\n", msecs);
120
121 /*
122 * head points to a known good entry, start there.
123 */
124 old = head;
125 }
126
127 last_read = this_read;
128
129 if (old != head)
130 samples++;
131
132 size = head - old;
133
134 if ((old & md->mask) + size != (head & md->mask)) {
135 buf = &data[old & md->mask];
136 size = md->mask + 1 - (old & md->mask);
137 old += size;
138
139 while (size) {
140 int ret = write(output, buf, size);
141
142 if (ret < 0)
143 die("failed to write");
144
145 size -= ret;
146 buf += ret;
147
148 bytes_written += ret;
149 }
150 }
151
152 buf = &data[old & md->mask];
153 size = head - old;
154 old += size;
155
156 while (size) {
157 int ret = write(output, buf, size);
158
159 if (ret < 0)
160 die("failed to write");
161
162 size -= ret;
163 buf += ret;
164
165 bytes_written += ret;
166 }
167
168 md->prev = old;
169}
170
171static volatile int done = 0;
172static volatile int signr = -1;
173
174static void sig_handler(int sig)
175{
176 done = 1;
177 signr = sig;
178}
179
180static void sig_atexit(void)
181{
182 if (signr == -1)
183 return;
184
185 signal(signr, SIG_DFL);
186 kill(getpid(), signr);
187}
188
189static void pid_synthesize_comm_event(pid_t pid, int full)
190{
191 struct comm_event comm_ev;
192 char filename[PATH_MAX];
193 char bf[BUFSIZ];
194 int fd, ret;
195 size_t size;
196 char *field, *sep;
197 DIR *tasks;
198 struct dirent dirent, *next;
199
200 snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
201
202 fd = open(filename, O_RDONLY);
203 if (fd < 0) {
204 fprintf(stderr, "couldn't open %s\n", filename);
205 exit(EXIT_FAILURE);
206 }
207 if (read(fd, bf, sizeof(bf)) < 0) {
208 fprintf(stderr, "couldn't read %s\n", filename);
209 exit(EXIT_FAILURE);
210 }
211 close(fd);
212
213 /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
214 memset(&comm_ev, 0, sizeof(comm_ev));
215 field = strchr(bf, '(');
216 if (field == NULL)
217 goto out_failure;
218 sep = strchr(++field, ')');
219 if (sep == NULL)
220 goto out_failure;
221 size = sep - field;
222 memcpy(comm_ev.comm, field, size++);
223
224 comm_ev.pid = pid;
225 comm_ev.header.type = PERF_EVENT_COMM;
226 size = ALIGN(size, sizeof(__u64));
227 comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
228
229 if (!full) {
230 comm_ev.tid = pid;
231
232 ret = write(output, &comm_ev, comm_ev.header.size);
233 if (ret < 0) {
234 perror("failed to write");
235 exit(-1);
236 }
237 return;
238 }
239
240 snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
241
242 tasks = opendir(filename);
243 while (!readdir_r(tasks, &dirent, &next) && next) {
244 char *end;
245 pid = strtol(dirent.d_name, &end, 10);
246 if (*end)
247 continue;
248
249 comm_ev.tid = pid;
250
251 ret = write(output, &comm_ev, comm_ev.header.size);
252 if (ret < 0) {
253 perror("failed to write");
254 exit(-1);
255 }
256 }
257 closedir(tasks);
258 return;
259
260out_failure:
261 fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
262 filename);
263 exit(EXIT_FAILURE);
264}
265
266static void pid_synthesize_mmap_samples(pid_t pid)
267{
268 char filename[PATH_MAX];
269 FILE *fp;
270
271 snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
272
273 fp = fopen(filename, "r");
274 if (fp == NULL) {
275 fprintf(stderr, "couldn't open %s\n", filename);
276 exit(EXIT_FAILURE);
277 }
278 while (1) {
279 char bf[BUFSIZ], *pbf = bf;
280 struct mmap_event mmap_ev = {
281 .header.type = PERF_EVENT_MMAP,
282 };
283 int n;
284 size_t size;
285 if (fgets(bf, sizeof(bf), fp) == NULL)
286 break;
287
288 /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */
289 n = hex2u64(pbf, &mmap_ev.start);
290 if (n < 0)
291 continue;
292 pbf += n + 1;
293 n = hex2u64(pbf, &mmap_ev.len);
294 if (n < 0)
295 continue;
296 pbf += n + 3;
297 if (*pbf == 'x') { /* vm_exec */
298 char *execname = strrchr(bf, ' ');
299
300 if (execname == NULL || execname[1] != '/')
301 continue;
302
303 execname += 1;
304 size = strlen(execname);
305 execname[size - 1] = '\0'; /* Remove \n */
306 memcpy(mmap_ev.filename, execname, size);
307 size = ALIGN(size, sizeof(__u64));
308 mmap_ev.len -= mmap_ev.start;
309 mmap_ev.header.size = (sizeof(mmap_ev) -
310 (sizeof(mmap_ev.filename) - size));
311 mmap_ev.pid = pid;
312 mmap_ev.tid = pid;
313
314 if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
315 perror("failed to write");
316 exit(-1);
317 }
318 }
319 }
320
321 fclose(fp);
322}
323
/*
 * Walk /proc and synthesize comm and mmap events for every entry whose
 * name is a plain number. Terminates the program when /proc cannot be
 * opened.
 */
static void synthesize_samples(void)
{
	DIR *proc;
	struct dirent dirent, *next;

	proc = opendir("/proc");
	if (proc == NULL) {
		fprintf(stderr, "couldn't open %s\n", "/proc");
		exit(EXIT_FAILURE);
	}

	while (!readdir_r(proc, &dirent, &next) && next) {
		char *end;
		pid_t pid;

		pid = strtol(dirent.d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid);
	}

	closedir(proc);
}
345
346static int group_fd;
347
348static void create_counter(int counter, int cpu, pid_t pid)
349{
350 struct perf_counter_attr *attr = attrs + counter;
351 int track = 1;
352
353 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
354 if (freq) {
355 attr->sample_type |= PERF_SAMPLE_PERIOD;
356 attr->freq = 1;
357 attr->sample_freq = freq;
358 }
359 attr->mmap = track;
360 attr->comm = track;
361 attr->inherit = (cpu < 0) && inherit;
362 attr->disabled = 1;
363
364 track = 0; /* only the first counter needs these */
365
366try_again:
367 fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
368
369 if (fd[nr_cpu][counter] < 0) {
370 int err = errno;
371
372 if (err == EPERM)
373 die("Permission error - are you root?\n");
374
375 /*
376 * If it's cycles then fall back to hrtimer
377 * based cpu-clock-tick sw counter, which
378 * is always available even if no PMU support:
379 */
380 if (attr->type == PERF_TYPE_HARDWARE
381 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
382
383 if (verbose)
384 warning(" ... trying to fall back to cpu-clock-ticks\n");
385 attr->type = PERF_TYPE_SOFTWARE;
386 attr->config = PERF_COUNT_SW_CPU_CLOCK;
387 goto try_again;
388 }
389 printf("\n");
390 error("perfcounter syscall returned with %d (%s)\n",
391 fd[nr_cpu][counter], strerror(err));
392 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
393 exit(-1);
394 }
395
396 assert(fd[nr_cpu][counter] >= 0);
397 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
398
399 /*
400 * First counter acts as the group leader:
401 */
402 if (group && group_fd == -1)
403 group_fd = fd[nr_cpu][counter];
404
405 event_array[nr_poll].fd = fd[nr_cpu][counter];
406 event_array[nr_poll].events = POLLIN;
407 nr_poll++;
408
409 mmap_array[nr_cpu][counter].counter = counter;
410 mmap_array[nr_cpu][counter].prev = 0;
411 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
412 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
413 PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
414 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
415 error("failed to mmap with %d (%s)\n", errno, strerror(errno));
416 exit(-1);
417 }
418
419 ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
420}
421
422static void open_counters(int cpu, pid_t pid)
423{
424 int counter;
425
426 if (pid > 0) {
427 pid_synthesize_comm_event(pid, 0);
428 pid_synthesize_mmap_samples(pid);
429 }
430
431 group_fd = -1;
432 for (counter = 0; counter < nr_counters; counter++)
433 create_counter(counter, cpu, pid);
434
435 nr_cpu++;
436}
437
438static int __cmd_record(int argc, const char **argv)
439{
440 int i, counter;
441 struct stat st;
442 pid_t pid;
443 int flags;
444 int ret;
445
446 page_size = sysconf(_SC_PAGE_SIZE);
447 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
448 assert(nr_cpus <= MAX_NR_CPUS);
449 assert(nr_cpus >= 0);
450
451 if (!stat(output_name, &st) && !force && !append_file) {
452 fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
453 output_name);
454 exit(-1);
455 }
456
457 flags = O_CREAT|O_RDWR;
458 if (append_file)
459 flags |= O_APPEND;
460 else
461 flags |= O_TRUNC;
462
463 output = open(output_name, flags, S_IRUSR|S_IWUSR);
464 if (output < 0) {
465 perror("failed to create output file");
466 exit(-1);
467 }
468
469 if (!system_wide) {
470 open_counters(-1, target_pid != -1 ? target_pid : getpid());
471 } else for (i = 0; i < nr_cpus; i++)
472 open_counters(i, target_pid);
473
474 atexit(sig_atexit);
475 signal(SIGCHLD, sig_handler);
476 signal(SIGINT, sig_handler);
477
478 if (target_pid == -1 && argc) {
479 pid = fork();
480 if (pid < 0)
481 perror("failed to fork");
482
483 if (!pid) {
484 if (execvp(argv[0], (char **)argv)) {
485 perror(argv[0]);
486 exit(-1);
487 }
488 }
489 }
490
491 if (realtime_prio) {
492 struct sched_param param;
493
494 param.sched_priority = realtime_prio;
495 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
496 printf("Could not set realtime priority.\n");
497 exit(-1);
498 }
499 }
500
501 if (system_wide)
502 synthesize_samples();
503
504 while (!done) {
505 int hits = samples;
506
507 for (i = 0; i < nr_cpu; i++) {
508 for (counter = 0; counter < nr_counters; counter++)
509 mmap_read(&mmap_array[i][counter]);
510 }
511
512 if (hits == samples)
513 ret = poll(event_array, nr_poll, 100);
514 }
515
516 /*
517 * Approximate RIP event size: 24 bytes.
518 */
519 fprintf(stderr,
520 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
521 (double)bytes_written / 1024.0 / 1024.0,
522 output_name,
523 bytes_written / 24);
524
525 return 0;
526}
527
528static const char * const record_usage[] = {
529 "perf record [<options>] [<command>]",
530 "perf record [<options>] -- <command> [<options>]",
531 NULL
532};
533
534static const struct option options[] = {
535 OPT_CALLBACK('e', "event", NULL, "event",
536 "event selector. use 'perf list' to list available events",
537 parse_events),
538 OPT_INTEGER('p', "pid", &target_pid,
539 "record events on existing pid"),
540 OPT_INTEGER('r', "realtime", &realtime_prio,
541 "collect data with this RT SCHED_FIFO priority"),
542 OPT_BOOLEAN('a', "all-cpus", &system_wide,
543 "system-wide collection from all CPUs"),
544 OPT_BOOLEAN('A', "append", &append_file,
545 "append to the output file to do incremental profiling"),
546 OPT_BOOLEAN('f', "force", &force,
547 "overwrite existing data file"),
548 OPT_LONG('c', "count", &default_interval,
549 "event period to sample"),
550 OPT_STRING('o', "output", &output_name, "file",
551 "output file name"),
552 OPT_BOOLEAN('i', "inherit", &inherit,
553 "child tasks inherit counters"),
554 OPT_INTEGER('F', "freq", &freq,
555 "profile at this frequency"),
556 OPT_INTEGER('m', "mmap-pages", &mmap_pages,
557 "number of mmap data pages"),
558 OPT_BOOLEAN('v', "verbose", &verbose,
559 "be more verbose (show counter open errors, etc)"),
560 OPT_END()
561};
562
563int cmd_record(int argc, const char **argv, const char *prefix)
564{
565 int counter;
566
567 argc = parse_options(argc, argv, options, record_usage, 0);
568 if (!argc && target_pid == -1 && !system_wide)
569 usage_with_options(record_usage, options);
570
571 if (!nr_counters)
572 nr_counters = 1;
573
574 for (counter = 0; counter < nr_counters; counter++) {
575 if (attrs[counter].sample_period)
576 continue;
577
578 attrs[counter].sample_period = default_interval;
579 }
580
581 return __cmd_record(argc, argv);
582}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
new file mode 100644
index 000000000000..82fa93b4db99
--- /dev/null
+++ b/tools/perf/builtin-report.c
@@ -0,0 +1,1316 @@
1/*
2 * builtin-report.c
3 *
4 * Builtin report command: Analyze the perf.data input file,
5 * look up and read DSOs and symbol information and display
6 * a histogram of results, along various sorting keys.
7 */
8#include "builtin.h"
9
10#include "util/util.h"
11
12#include "util/color.h"
13#include "util/list.h"
14#include "util/cache.h"
15#include "util/rbtree.h"
16#include "util/symbol.h"
17#include "util/string.h"
18
19#include "perf.h"
20
21#include "util/parse-options.h"
22#include "util/parse-events.h"
23
/*
 * Bits for show_mask: which sample levels process_overflow_event() adds
 * to the histogram.
 */
24#define SHOW_KERNEL 1
25#define SHOW_USER 2
26#define SHOW_HV 4
27
/* Data file to read (-i) and optional vmlinux path (-k). */
28static char const *input_name = "perf.data";
29static char *vmlinux = NULL;
30
/* Sort key list (-s); output__fprintf() prints a hint while it is still the default. */
31static char default_sort_order[] = "comm,dso";
32static char *sort_order = default_sort_order;
33
/* File descriptor of the opened input file, set in __cmd_report(). */
34static int input;
35static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
36
/* -D: when set, dprintf() output is emitted and __cmd_report() skips the histogram. */
37static int dump_trace = 0;
38#define dprintf(x...) do { if (dump_trace) printf(x); } while (0)
39
40static int verbose;
/* -P: when set, __cmd_report() disables cwd-relative path shortening. */
41static int full_paths;
42
/* The input file is mapped page_size * mmap_window bytes at a time. */
43static unsigned long page_size;
44static unsigned long mmap_window = 32;
45
46struct ip_event {
47 struct perf_event_header header;
48 __u64 ip;
49 __u32 pid, tid;
50 __u64 period;
51};
52
53struct mmap_event {
54 struct perf_event_header header;
55 __u32 pid, tid;
56 __u64 start;
57 __u64 len;
58 __u64 pgoff;
59 char filename[PATH_MAX];
60};
61
62struct comm_event {
63 struct perf_event_header header;
64 __u32 pid, tid;
65 char comm[16];
66};
67
68struct fork_event {
69 struct perf_event_header header;
70 __u32 pid, ppid;
71};
72
73struct period_event {
74 struct perf_event_header header;
75 __u64 time;
76 __u64 id;
77 __u64 sample_period;
78};
79
80typedef union event_union {
81 struct perf_event_header header;
82 struct ip_event ip;
83 struct mmap_event mmap;
84 struct comm_event comm;
85 struct fork_event fork;
86 struct period_event period;
87} event_t;
88
89static LIST_HEAD(dsos);
90static struct dso *kernel_dso;
91static struct dso *vdso;
92
93static void dsos__add(struct dso *dso)
94{
95 list_add_tail(&dso->node, &dsos);
96}
97
98static struct dso *dsos__find(const char *name)
99{
100 struct dso *pos;
101
102 list_for_each_entry(pos, &dsos, node)
103 if (strcmp(pos->name, name) == 0)
104 return pos;
105 return NULL;
106}
107
108static struct dso *dsos__findnew(const char *name)
109{
110 struct dso *dso = dsos__find(name);
111 int nr;
112
113 if (dso)
114 return dso;
115
116 dso = dso__new(name, 0);
117 if (!dso)
118 goto out_delete_dso;
119
120 nr = dso__load(dso, NULL, verbose);
121 if (nr < 0) {
122 if (verbose)
123 fprintf(stderr, "Failed to open: %s\n", name);
124 goto out_delete_dso;
125 }
126 if (!nr && verbose) {
127 fprintf(stderr,
128 "No symbols found in: %s, maybe install a debug package?\n",
129 name);
130 }
131
132 dsos__add(dso);
133
134 return dso;
135
136out_delete_dso:
137 dso__delete(dso);
138 return NULL;
139}
140
141static void dsos__fprintf(FILE *fp)
142{
143 struct dso *pos;
144
145 list_for_each_entry(pos, &dsos, node)
146 dso__fprintf(pos, fp);
147}
148
149static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
150{
151 return dso__find_symbol(kernel_dso, ip);
152}
153
154static int load_kernel(void)
155{
156 int err;
157
158 kernel_dso = dso__new("[kernel]", 0);
159 if (!kernel_dso)
160 return -1;
161
162 err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
163 if (err) {
164 dso__delete(kernel_dso);
165 kernel_dso = NULL;
166 } else
167 dsos__add(kernel_dso);
168
169 vdso = dso__new("[vdso]", 0);
170 if (!vdso)
171 return -1;
172
173 vdso->find_symbol = vdso__find_symbol;
174
175 dsos__add(vdso);
176
177 return err;
178}
179
180static char __cwd[PATH_MAX];
181static char *cwd = __cwd;
182static int cwdlen;
183
184static int strcommon(const char *pathname)
185{
186 int n = 0;
187
188 while (pathname[n] == cwd[n] && n < cwdlen)
189 ++n;
190
191 return n;
192}
193
194struct map {
195 struct list_head node;
196 __u64 start;
197 __u64 end;
198 __u64 pgoff;
199 __u64 (*map_ip)(struct map *, __u64);
200 struct dso *dso;
201};
202
203static __u64 map__map_ip(struct map *map, __u64 ip)
204{
205 return ip - map->start + map->pgoff;
206}
207
208static __u64 vdso__map_ip(struct map *map, __u64 ip)
209{
210 return ip;
211}
212
/* Returns 1 when @filename is exactly "//anon", 0 otherwise. */
static inline int is_anon_memory(const char *filename)
{
	return !strcmp(filename, "//anon");
}
217
218static struct map *map__new(struct mmap_event *event)
219{
220 struct map *self = malloc(sizeof(*self));
221
222 if (self != NULL) {
223 const char *filename = event->filename;
224 char newfilename[PATH_MAX];
225 int anon;
226
227 if (cwd) {
228 int n = strcommon(filename);
229
230 if (n == cwdlen) {
231 snprintf(newfilename, sizeof(newfilename),
232 ".%s", filename + n);
233 filename = newfilename;
234 }
235 }
236
237 anon = is_anon_memory(filename);
238
239 if (anon) {
240 snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
241 filename = newfilename;
242 }
243
244 self->start = event->start;
245 self->end = event->start + event->len;
246 self->pgoff = event->pgoff;
247
248 self->dso = dsos__findnew(filename);
249 if (self->dso == NULL)
250 goto out_delete;
251
252 if (self->dso == vdso || anon)
253 self->map_ip = vdso__map_ip;
254 else
255 self->map_ip = map__map_ip;
256 }
257 return self;
258out_delete:
259 free(self);
260 return NULL;
261}
262
263static struct map *map__clone(struct map *self)
264{
265 struct map *map = malloc(sizeof(*self));
266
267 if (!map)
268 return NULL;
269
270 memcpy(map, self, sizeof(*self));
271
272 return map;
273}
274
275static int map__overlap(struct map *l, struct map *r)
276{
277 if (l->start > r->start) {
278 struct map *t = l;
279 l = r;
280 r = t;
281 }
282
283 if (l->end > r->start)
284 return 1;
285
286 return 0;
287}
288
289static size_t map__fprintf(struct map *self, FILE *fp)
290{
291 return fprintf(fp, " %Lx-%Lx %Lx %s\n",
292 self->start, self->end, self->pgoff, self->dso->name);
293}
294
295
296struct thread {
297 struct rb_node rb_node;
298 struct list_head maps;
299 pid_t pid;
300 char *comm;
301};
302
303static struct thread *thread__new(pid_t pid)
304{
305 struct thread *self = malloc(sizeof(*self));
306
307 if (self != NULL) {
308 self->pid = pid;
309 self->comm = malloc(32);
310 if (self->comm)
311 snprintf(self->comm, 32, ":%d", self->pid);
312 INIT_LIST_HEAD(&self->maps);
313 }
314
315 return self;
316}
317
318static int thread__set_comm(struct thread *self, const char *comm)
319{
320 if (self->comm)
321 free(self->comm);
322 self->comm = strdup(comm);
323 return self->comm ? 0 : -ENOMEM;
324}
325
326static size_t thread__fprintf(struct thread *self, FILE *fp)
327{
328 struct map *pos;
329 size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
330
331 list_for_each_entry(pos, &self->maps, node)
332 ret += map__fprintf(pos, fp);
333
334 return ret;
335}
336
337
338static struct rb_root threads;
339static struct thread *last_match;
340
341static struct thread *threads__findnew(pid_t pid)
342{
343 struct rb_node **p = &threads.rb_node;
344 struct rb_node *parent = NULL;
345 struct thread *th;
346
347 /*
348 * Font-end cache - PID lookups come in blocks,
349 * so most of the time we dont have to look up
350 * the full rbtree:
351 */
352 if (last_match && last_match->pid == pid)
353 return last_match;
354
355 while (*p != NULL) {
356 parent = *p;
357 th = rb_entry(parent, struct thread, rb_node);
358
359 if (th->pid == pid) {
360 last_match = th;
361 return th;
362 }
363
364 if (pid < th->pid)
365 p = &(*p)->rb_left;
366 else
367 p = &(*p)->rb_right;
368 }
369
370 th = thread__new(pid);
371 if (th != NULL) {
372 rb_link_node(&th->rb_node, parent, p);
373 rb_insert_color(&th->rb_node, &threads);
374 last_match = th;
375 }
376
377 return th;
378}
379
380static void thread__insert_map(struct thread *self, struct map *map)
381{
382 struct map *pos, *tmp;
383
384 list_for_each_entry_safe(pos, tmp, &self->maps, node) {
385 if (map__overlap(pos, map)) {
386 list_del_init(&pos->node);
387 /* XXX leaks dsos */
388 free(pos);
389 }
390 }
391
392 list_add_tail(&map->node, &self->maps);
393}
394
395static int thread__fork(struct thread *self, struct thread *parent)
396{
397 struct map *map;
398
399 if (self->comm)
400 free(self->comm);
401 self->comm = strdup(parent->comm);
402 if (!self->comm)
403 return -ENOMEM;
404
405 list_for_each_entry(map, &parent->maps, node) {
406 struct map *new = map__clone(map);
407 if (!new)
408 return -ENOMEM;
409 thread__insert_map(self, new);
410 }
411
412 return 0;
413}
414
415static struct map *thread__find_map(struct thread *self, __u64 ip)
416{
417 struct map *pos;
418
419 if (self == NULL)
420 return NULL;
421
422 list_for_each_entry(pos, &self->maps, node)
423 if (ip >= pos->start && ip <= pos->end)
424 return pos;
425
426 return NULL;
427}
428
429static size_t threads__fprintf(FILE *fp)
430{
431 size_t ret = 0;
432 struct rb_node *nd;
433
434 for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
435 struct thread *pos = rb_entry(nd, struct thread, rb_node);
436
437 ret += thread__fprintf(pos, fp);
438 }
439
440 return ret;
441}
442
443/*
444 * histogram, sorted on item, collects counts
445 */
446
447static struct rb_root hist;
448
449struct hist_entry {
450 struct rb_node rb_node;
451
452 struct thread *thread;
453 struct map *map;
454 struct dso *dso;
455 struct symbol *sym;
456 __u64 ip;
457 char level;
458
459 __u64 count;
460};
461
462/*
463 * configurable sorting bits
464 */
465
466struct sort_entry {
467 struct list_head list;
468
469 char *header;
470
471 int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
472 int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
473 size_t (*print)(FILE *fp, struct hist_entry *);
474};
475
476/* --sort pid */
477
478static int64_t
479sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
480{
481 return right->thread->pid - left->thread->pid;
482}
483
484static size_t
485sort__thread_print(FILE *fp, struct hist_entry *self)
486{
487 return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
488}
489
/*
 * Sort key "pid" (see sort_dimensions[]).  It has no collapse hook, so
 * hist_entry__collapse() falls back to .cmp for this key.
 */
490static struct sort_entry sort_thread = {
491 .header = " Command: Pid",
492 .cmp = sort__thread_cmp,
493 .print = sort__thread_print,
494};
495
496/* --sort comm */
497
498static int64_t
499sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
500{
501 return right->thread->pid - left->thread->pid;
502}
503
504static int64_t
505sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
506{
507 char *comm_l = left->thread->comm;
508 char *comm_r = right->thread->comm;
509
510 if (!comm_l || !comm_r) {
511 if (!comm_l && !comm_r)
512 return 0;
513 else if (!comm_l)
514 return -1;
515 else
516 return 1;
517 }
518
519 return strcmp(comm_l, comm_r);
520}
521
522static size_t
523sort__comm_print(FILE *fp, struct hist_entry *self)
524{
525 return fprintf(fp, "%16s", self->thread->comm);
526}
527
/*
 * Sort key "comm".  It is the only key in this file with a collapse
 * hook; selecting it makes sort_dimension__add() set
 * sort__need_collapse.
 */
528static struct sort_entry sort_comm = {
529 .header = " Command",
530 .cmp = sort__comm_cmp,
531 .collapse = sort__comm_collapse,
532 .print = sort__comm_print,
533};
534
535/* --sort dso */
536
537static int64_t
538sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
539{
540 struct dso *dso_l = left->dso;
541 struct dso *dso_r = right->dso;
542
543 if (!dso_l || !dso_r) {
544 if (!dso_l && !dso_r)
545 return 0;
546 else if (!dso_l)
547 return -1;
548 else
549 return 1;
550 }
551
552 return strcmp(dso_l->name, dso_r->name);
553}
554
555static size_t
556sort__dso_print(FILE *fp, struct hist_entry *self)
557{
558 if (self->dso)
559 return fprintf(fp, "%-25s", self->dso->name);
560
561 return fprintf(fp, "%016llx ", (__u64)self->ip);
562}
563
/*
 * Sort key "dso" (see sort_dimensions[]).  No collapse hook: .cmp is
 * used for the collapse pass as well.
 */
564static struct sort_entry sort_dso = {
565 .header = "Shared Object ",
566 .cmp = sort__dso_cmp,
567 .print = sort__dso_print,
568};
569
570/* --sort symbol */
571
572static int64_t
573sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
574{
575 __u64 ip_l, ip_r;
576
577 if (left->sym == right->sym)
578 return 0;
579
580 ip_l = left->sym ? left->sym->start : left->ip;
581 ip_r = right->sym ? right->sym->start : right->ip;
582
583 return (int64_t)(ip_r - ip_l);
584}
585
586static size_t
587sort__sym_print(FILE *fp, struct hist_entry *self)
588{
589 size_t ret = 0;
590
591 if (verbose)
592 ret += fprintf(fp, "%#018llx ", (__u64)self->ip);
593
594 if (self->sym) {
595 ret += fprintf(fp, "[%c] %s",
596 self->dso == kernel_dso ? 'k' : '.', self->sym->name);
597 } else {
598 ret += fprintf(fp, "%#016llx", (__u64)self->ip);
599 }
600
601 return ret;
602}
603
/*
 * Sort key "symbol" (see sort_dimensions[]).  No collapse hook: .cmp is
 * used for the collapse pass as well.
 */
604static struct sort_entry sort_sym = {
605 .header = "Symbol",
606 .cmp = sort__sym_cmp,
607 .print = sort__sym_print,
608};
609
/*
 * Set by sort_dimension__add() as soon as a selected key has a collapse
 * hook; collapse__resort() and output__resort() test it.
 */
610static int sort__need_collapse = 0;
611
/*
 * Binds a --sort key name to its sort_entry.  'taken' is set once the key
 * has been selected so that it cannot be added twice.
 */
612struct sort_dimension {
613 char *name;
614 struct sort_entry *entry;
615 int taken;
616};
617
/*
 * All known sort keys.  sort_dimension__add() scans them in this order
 * and picks the first free one the token is a prefix of.
 */
618static struct sort_dimension sort_dimensions[] = {
619 { .name = "pid", .entry = &sort_thread, },
620 { .name = "comm", .entry = &sort_comm, },
621 { .name = "dso", .entry = &sort_dso, },
622 { .name = "symbol", .entry = &sort_sym, },
623};
624
/* The selected sort keys, in the order they were given. */
625static LIST_HEAD(hist_entry__sort_list);
626
627static int sort_dimension__add(char *tok)
628{
629 int i;
630
631 for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
632 struct sort_dimension *sd = &sort_dimensions[i];
633
634 if (sd->taken)
635 continue;
636
637 if (strncasecmp(tok, sd->name, strlen(tok)))
638 continue;
639
640 if (sd->entry->collapse)
641 sort__need_collapse = 1;
642
643 list_add_tail(&sd->entry->list, &hist_entry__sort_list);
644 sd->taken = 1;
645
646 return 0;
647 }
648
649 return -ESRCH;
650}
651
652static int64_t
653hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
654{
655 struct sort_entry *se;
656 int64_t cmp = 0;
657
658 list_for_each_entry(se, &hist_entry__sort_list, list) {
659 cmp = se->cmp(left, right);
660 if (cmp)
661 break;
662 }
663
664 return cmp;
665}
666
667static int64_t
668hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
669{
670 struct sort_entry *se;
671 int64_t cmp = 0;
672
673 list_for_each_entry(se, &hist_entry__sort_list, list) {
674 int64_t (*f)(struct hist_entry *, struct hist_entry *);
675
676 f = se->collapse ?: se->cmp;
677
678 cmp = f(left, right);
679 if (cmp)
680 break;
681 }
682
683 return cmp;
684}
685
686static size_t
687hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
688{
689 struct sort_entry *se;
690 size_t ret;
691
692 if (total_samples) {
693 double percent = self->count * 100.0 / total_samples;
694 char *color = PERF_COLOR_NORMAL;
695
696 /*
697 * We color high-overhead entries in red, mid-overhead
698 * entries in green - and keep the low overhead places
699 * normal:
700 */
701 if (percent >= 5.0) {
702 color = PERF_COLOR_RED;
703 } else {
704 if (percent >= 0.5)
705 color = PERF_COLOR_GREEN;
706 }
707
708 ret = color_fprintf(fp, color, " %6.2f%%",
709 (self->count * 100.0) / total_samples);
710 } else
711 ret = fprintf(fp, "%12Ld ", self->count);
712
713 list_for_each_entry(se, &hist_entry__sort_list, list) {
714 fprintf(fp, " ");
715 ret += se->print(fp, self);
716 }
717
718 ret += fprintf(fp, "\n");
719
720 return ret;
721}
722
723/*
724 * collect histogram counts
725 */
726
727static int
728hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
729 struct symbol *sym, __u64 ip, char level, __u64 count)
730{
731 struct rb_node **p = &hist.rb_node;
732 struct rb_node *parent = NULL;
733 struct hist_entry *he;
734 struct hist_entry entry = {
735 .thread = thread,
736 .map = map,
737 .dso = dso,
738 .sym = sym,
739 .ip = ip,
740 .level = level,
741 .count = count,
742 };
743 int cmp;
744
745 while (*p != NULL) {
746 parent = *p;
747 he = rb_entry(parent, struct hist_entry, rb_node);
748
749 cmp = hist_entry__cmp(&entry, he);
750
751 if (!cmp) {
752 he->count += count;
753 return 0;
754 }
755
756 if (cmp < 0)
757 p = &(*p)->rb_left;
758 else
759 p = &(*p)->rb_right;
760 }
761
762 he = malloc(sizeof(*he));
763 if (!he)
764 return -ENOMEM;
765 *he = entry;
766 rb_link_node(&he->rb_node, parent, p);
767 rb_insert_color(&he->rb_node, &hist);
768
769 return 0;
770}
771
/*
 * Release a histogram entry.  Only the entry itself is freed; the
 * thread, map, dso and symbol it points to are left alone.
 */
772static void hist_entry__free(struct hist_entry *he)
773{
774 free(he);
775}
776
777/*
778 * collapse the histogram
779 */
780
781static struct rb_root collapse_hists;
782
783static void collapse__insert_entry(struct hist_entry *he)
784{
785 struct rb_node **p = &collapse_hists.rb_node;
786 struct rb_node *parent = NULL;
787 struct hist_entry *iter;
788 int64_t cmp;
789
790 while (*p != NULL) {
791 parent = *p;
792 iter = rb_entry(parent, struct hist_entry, rb_node);
793
794 cmp = hist_entry__collapse(iter, he);
795
796 if (!cmp) {
797 iter->count += he->count;
798 hist_entry__free(he);
799 return;
800 }
801
802 if (cmp < 0)
803 p = &(*p)->rb_left;
804 else
805 p = &(*p)->rb_right;
806 }
807
808 rb_link_node(&he->rb_node, parent, p);
809 rb_insert_color(&he->rb_node, &collapse_hists);
810}
811
812static void collapse__resort(void)
813{
814 struct rb_node *next;
815 struct hist_entry *n;
816
817 if (!sort__need_collapse)
818 return;
819
820 next = rb_first(&hist);
821 while (next) {
822 n = rb_entry(next, struct hist_entry, rb_node);
823 next = rb_next(&n->rb_node);
824
825 rb_erase(&n->rb_node, &hist);
826 collapse__insert_entry(n);
827 }
828}
829
830/*
831 * reverse the map, sort on count.
832 */
833
834static struct rb_root output_hists;
835
836static void output__insert_entry(struct hist_entry *he)
837{
838 struct rb_node **p = &output_hists.rb_node;
839 struct rb_node *parent = NULL;
840 struct hist_entry *iter;
841
842 while (*p != NULL) {
843 parent = *p;
844 iter = rb_entry(parent, struct hist_entry, rb_node);
845
846 if (he->count > iter->count)
847 p = &(*p)->rb_left;
848 else
849 p = &(*p)->rb_right;
850 }
851
852 rb_link_node(&he->rb_node, parent, p);
853 rb_insert_color(&he->rb_node, &output_hists);
854}
855
856static void output__resort(void)
857{
858 struct rb_node *next;
859 struct hist_entry *n;
860 struct rb_root *tree = &hist;
861
862 if (sort__need_collapse)
863 tree = &collapse_hists;
864
865 next = rb_first(tree);
866
867 while (next) {
868 n = rb_entry(next, struct hist_entry, rb_node);
869 next = rb_next(&n->rb_node);
870
871 rb_erase(&n->rb_node, tree);
872 output__insert_entry(n);
873 }
874}
875
876static size_t output__fprintf(FILE *fp, __u64 total_samples)
877{
878 struct hist_entry *pos;
879 struct sort_entry *se;
880 struct rb_node *nd;
881 size_t ret = 0;
882
883 fprintf(fp, "\n");
884 fprintf(fp, "#\n");
885 fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
886 fprintf(fp, "#\n");
887
888 fprintf(fp, "# Overhead");
889 list_for_each_entry(se, &hist_entry__sort_list, list)
890 fprintf(fp, " %s", se->header);
891 fprintf(fp, "\n");
892
893 fprintf(fp, "# ........");
894 list_for_each_entry(se, &hist_entry__sort_list, list) {
895 int i;
896
897 fprintf(fp, " ");
898 for (i = 0; i < strlen(se->header); i++)
899 fprintf(fp, ".");
900 }
901 fprintf(fp, "\n");
902
903 fprintf(fp, "#\n");
904
905 for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
906 pos = rb_entry(nd, struct hist_entry, rb_node);
907 ret += hist_entry__fprintf(fp, pos, total_samples);
908 }
909
910 if (!strcmp(sort_order, default_sort_order)) {
911 fprintf(fp, "#\n");
912 fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
913 fprintf(fp, "#\n");
914 }
915 fprintf(fp, "\n");
916
917 return ret;
918}
919
/*
 * Pre-register pid 0 under the name "[idle]".  Exits the program when
 * the thread cannot be created or named.
 */
static void register_idle_thread(void)
{
	struct thread *idle = threads__findnew(0);

	if (idle != NULL && thread__set_comm(idle, "[idle]") == 0)
		return;

	fprintf(stderr, "problem inserting idle task.\n");
	exit(-1);
}
930
/*
 * Running totals, dumped at the end of __cmd_report().  'total' is the
 * sum of the sample periods (it is passed to output__fprintf() as the
 * sample total); the others count processed records per type, and
 * total_unknown counts records that were skipped.
 */
931static unsigned long total = 0,
932 total_mmap = 0,
933 total_comm = 0,
934 total_fork = 0,
935 total_unknown = 0;
936
937static int
938process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
939{
940 char level;
941 int show = 0;
942 struct dso *dso = NULL;
943 struct thread *thread = threads__findnew(event->ip.pid);
944 __u64 ip = event->ip.ip;
945 __u64 period = 1;
946 struct map *map = NULL;
947
948 if (event->header.type & PERF_SAMPLE_PERIOD)
949 period = event->ip.period;
950
951 dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
952 (void *)(offset + head),
953 (void *)(long)(event->header.size),
954 event->header.misc,
955 event->ip.pid,
956 (void *)(long)ip,
957 (long long)period);
958
959 dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
960
961 if (thread == NULL) {
962 fprintf(stderr, "problem processing %d event, skipping it.\n",
963 event->header.type);
964 return -1;
965 }
966
967 if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
968 show = SHOW_KERNEL;
969 level = 'k';
970
971 dso = kernel_dso;
972
973 dprintf(" ...... dso: %s\n", dso->name);
974
975 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
976
977 show = SHOW_USER;
978 level = '.';
979
980 map = thread__find_map(thread, ip);
981 if (map != NULL) {
982 ip = map->map_ip(map, ip);
983 dso = map->dso;
984 } else {
985 /*
986 * If this is outside of all known maps,
987 * and is a negative address, try to look it
988 * up in the kernel dso, as it might be a
989 * vsyscall (which executes in user-mode):
990 */
991 if ((long long)ip < 0)
992 dso = kernel_dso;
993 }
994 dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
995
996 } else {
997 show = SHOW_HV;
998 level = 'H';
999 dprintf(" ...... dso: [hypervisor]\n");
1000 }
1001
1002 if (show & show_mask) {
1003 struct symbol *sym = NULL;
1004
1005 if (dso)
1006 sym = dso->find_symbol(dso, ip);
1007
1008 if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
1009 fprintf(stderr,
1010 "problem incrementing symbol count, skipping event\n");
1011 return -1;
1012 }
1013 }
1014 total += period;
1015
1016 return 0;
1017}
1018
1019static int
1020process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
1021{
1022 struct thread *thread = threads__findnew(event->mmap.pid);
1023 struct map *map = map__new(&event->mmap);
1024
1025 dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
1026 (void *)(offset + head),
1027 (void *)(long)(event->header.size),
1028 event->mmap.pid,
1029 (void *)(long)event->mmap.start,
1030 (void *)(long)event->mmap.len,
1031 (void *)(long)event->mmap.pgoff,
1032 event->mmap.filename);
1033
1034 if (thread == NULL || map == NULL) {
1035 dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
1036 return 0;
1037 }
1038
1039 thread__insert_map(thread, map);
1040 total_mmap++;
1041
1042 return 0;
1043}
1044
1045static int
1046process_comm_event(event_t *event, unsigned long offset, unsigned long head)
1047{
1048 struct thread *thread = threads__findnew(event->comm.pid);
1049
1050 dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
1051 (void *)(offset + head),
1052 (void *)(long)(event->header.size),
1053 event->comm.comm, event->comm.pid);
1054
1055 if (thread == NULL ||
1056 thread__set_comm(thread, event->comm.comm)) {
1057 dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
1058 return -1;
1059 }
1060 total_comm++;
1061
1062 return 0;
1063}
1064
1065static int
1066process_fork_event(event_t *event, unsigned long offset, unsigned long head)
1067{
1068 struct thread *thread = threads__findnew(event->fork.pid);
1069 struct thread *parent = threads__findnew(event->fork.ppid);
1070
1071 dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
1072 (void *)(offset + head),
1073 (void *)(long)(event->header.size),
1074 event->fork.pid, event->fork.ppid);
1075
1076 if (!thread || !parent || thread__fork(thread, parent)) {
1077 dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
1078 return -1;
1079 }
1080 total_fork++;
1081
1082 return 0;
1083}
1084
1085static int
1086process_period_event(event_t *event, unsigned long offset, unsigned long head)
1087{
1088 dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
1089 (void *)(offset + head),
1090 (void *)(long)(event->header.size),
1091 event->period.time,
1092 event->period.id,
1093 event->period.sample_period);
1094
1095 return 0;
1096}
1097
1098static int
1099process_event(event_t *event, unsigned long offset, unsigned long head)
1100{
1101 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
1102 return process_overflow_event(event, offset, head);
1103
1104 switch (event->header.type) {
1105 case PERF_EVENT_MMAP:
1106 return process_mmap_event(event, offset, head);
1107
1108 case PERF_EVENT_COMM:
1109 return process_comm_event(event, offset, head);
1110
1111 case PERF_EVENT_FORK:
1112 return process_fork_event(event, offset, head);
1113
1114 case PERF_EVENT_PERIOD:
1115 return process_period_event(event, offset, head);
1116 /*
1117 * We dont process them right now but they are fine:
1118 */
1119
1120 case PERF_EVENT_THROTTLE:
1121 case PERF_EVENT_UNTHROTTLE:
1122 return 0;
1123
1124 default:
1125 return -1;
1126 }
1127
1128 return 0;
1129}
1130
1131static int __cmd_report(void)
1132{
1133 int ret, rc = EXIT_FAILURE;
1134 unsigned long offset = 0;
1135 unsigned long head = 0;
1136 struct stat stat;
1137 event_t *event;
1138 uint32_t size;
1139 char *buf;
1140
1141 register_idle_thread();
1142
1143 input = open(input_name, O_RDONLY);
1144 if (input < 0) {
1145 fprintf(stderr, " failed to open file: %s", input_name);
1146 if (!strcmp(input_name, "perf.data"))
1147 fprintf(stderr, " (try 'perf record' first)");
1148 fprintf(stderr, "\n");
1149 exit(-1);
1150 }
1151
1152 ret = fstat(input, &stat);
1153 if (ret < 0) {
1154 perror("failed to stat file");
1155 exit(-1);
1156 }
1157
1158 if (!stat.st_size) {
1159 fprintf(stderr, "zero-sized file, nothing to do!\n");
1160 exit(0);
1161 }
1162
1163 if (load_kernel() < 0) {
1164 perror("failed to load kernel symbols");
1165 return EXIT_FAILURE;
1166 }
1167
1168 if (!full_paths) {
1169 if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
1170 perror("failed to get the current directory");
1171 return EXIT_FAILURE;
1172 }
1173 cwdlen = strlen(cwd);
1174 } else {
1175 cwd = NULL;
1176 cwdlen = 0;
1177 }
1178remap:
1179 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
1180 MAP_SHARED, input, offset);
1181 if (buf == MAP_FAILED) {
1182 perror("failed to mmap file");
1183 exit(-1);
1184 }
1185
1186more:
1187 event = (event_t *)(buf + head);
1188
1189 size = event->header.size;
1190 if (!size)
1191 size = 8;
1192
1193 if (head + event->header.size >= page_size * mmap_window) {
1194 unsigned long shift = page_size * (head / page_size);
1195 int ret;
1196
1197 ret = munmap(buf, page_size * mmap_window);
1198 assert(ret == 0);
1199
1200 offset += shift;
1201 head -= shift;
1202 goto remap;
1203 }
1204
1205 size = event->header.size;
1206
1207 dprintf("%p [%p]: event: %d\n",
1208 (void *)(offset + head),
1209 (void *)(long)event->header.size,
1210 event->header.type);
1211
1212 if (!size || process_event(event, offset, head) < 0) {
1213
1214 dprintf("%p [%p]: skipping unknown header type: %d\n",
1215 (void *)(offset + head),
1216 (void *)(long)(event->header.size),
1217 event->header.type);
1218
1219 total_unknown++;
1220
1221 /*
1222 * assume we lost track of the stream, check alignment, and
1223 * increment a single u64 in the hope to catch on again 'soon'.
1224 */
1225
1226 if (unlikely(head & 7))
1227 head &= ~7ULL;
1228
1229 size = 8;
1230 }
1231
1232 head += size;
1233
1234 if (offset + head < stat.st_size)
1235 goto more;
1236
1237 rc = EXIT_SUCCESS;
1238 close(input);
1239
1240 dprintf(" IP events: %10ld\n", total);
1241 dprintf(" mmap events: %10ld\n", total_mmap);
1242 dprintf(" comm events: %10ld\n", total_comm);
1243 dprintf(" fork events: %10ld\n", total_fork);
1244 dprintf(" unknown events: %10ld\n", total_unknown);
1245
1246 if (dump_trace)
1247 return 0;
1248
1249 if (verbose >= 3)
1250 threads__fprintf(stdout);
1251
1252 if (verbose >= 2)
1253 dsos__fprintf(stdout);
1254
1255 collapse__resort();
1256 output__resort();
1257 output__fprintf(stdout, total);
1258
1259 return rc;
1260}
1261
/*
 * Usage synopsis for 'perf report' (NULL-terminated).  The command takes
 * options only: cmd_report() prints this usage text and bails out when
 * any non-option argument is left over, so no <command> is advertised.
 */
static const char * const report_usage[] = {
	"perf report [<options>]",
	NULL
};
1266
1267static const struct option options[] = {
1268 OPT_STRING('i', "input", &input_name, "file",
1269 "input file name"),
1270 OPT_BOOLEAN('v', "verbose", &verbose,
1271 "be more verbose (show symbol address, etc)"),
1272 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1273 "dump raw trace in ASCII"),
1274 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
1275 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1276 "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
1277 OPT_BOOLEAN('P', "full-paths", &full_paths,
1278 "Don't shorten the pathnames taking into account the cwd"),
1279 OPT_END()
1280};
1281
1282static void setup_sorting(void)
1283{
1284 char *tmp, *tok, *str = strdup(sort_order);
1285
1286 for (tok = strtok_r(str, ", ", &tmp);
1287 tok; tok = strtok_r(NULL, ", ", &tmp)) {
1288 if (sort_dimension__add(tok) < 0) {
1289 error("Unknown --sort key: `%s'", tok);
1290 usage_with_options(report_usage, options);
1291 }
1292 }
1293
1294 free(str);
1295}
1296
1297int cmd_report(int argc, const char **argv, const char *prefix)
1298{
1299 symbol__init();
1300
1301 page_size = getpagesize();
1302
1303 argc = parse_options(argc, argv, options, report_usage, 0);
1304
1305 setup_sorting();
1306
1307 /*
1308 * Any (unrecognized) arguments left?
1309 */
1310 if (argc)
1311 usage_with_options(report_usage, options);
1312
1313 setup_pager();
1314
1315 return __cmd_report();
1316}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
new file mode 100644
index 000000000000..c43e4a97dc42
--- /dev/null
+++ b/tools/perf/builtin-stat.c
@@ -0,0 +1,367 @@
1/*
2 * builtin-stat.c
3 *
4 * Builtin stat command: Give a precise performance counters summary
5 * overview about any workload, CPU or specific PID.
6 *
7 * Sample output:
8
9 $ perf stat ~/hackbench 10
10 Time: 0.104
11
12 Performance counter stats for '/home/mingo/hackbench':
13
14 1255.538611 task clock ticks # 10.143 CPU utilization factor
15 54011 context switches # 0.043 M/sec
16 385 CPU migrations # 0.000 M/sec
17 17755 pagefaults # 0.014 M/sec
18 3808323185 CPU cycles # 3033.219 M/sec
19 1575111190 instructions # 1254.530 M/sec
20 17367895 cache references # 13.833 M/sec
21 7674421 cache misses # 6.112 M/sec
22
23 Wall-clock time elapsed: 123.786620 msecs
24
25 *
26 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
27 *
28 * Improvements and fixes by:
29 *
30 * Arjan van de Ven <arjan@linux.intel.com>
31 * Yanmin Zhang <yanmin.zhang@intel.com>
32 * Wu Fengguang <fengguang.wu@intel.com>
33 * Mike Galbraith <efault@gmx.de>
34 * Paul Mackerras <paulus@samba.org>
35 *
36 * Released under the GPL v2. (and only v2, not any later version)
37 */
38
39#include "perf.h"
40#include "builtin.h"
41#include "util/util.h"
42#include "util/parse-options.h"
43#include "util/parse-events.h"
44
45#include <sys/prctl.h>
46
/*
 * Counter attributes that cmd_stat() copies into attrs[] before the
 * command line is parsed: four software counters followed by four
 * hardware counters.  The remaining MAX_COUNTERS slots are
 * zero-initialised.
 */
static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {

	{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
	{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
	{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
	{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },

	{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
	{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
	{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
	{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },

};
60
61static int system_wide = 0;
62static int inherit = 1;
63static int verbose = 0;
64
65static int fd[MAX_NR_CPUS][MAX_COUNTERS];
66
67static int target_pid = -1;
68static int nr_cpus = 0;
69static unsigned int page_size;
70
71static int scale = 1;
72
73static const unsigned int default_count[] = {
74 1000000,
75 1000000,
76 10000,
77 10000,
78 1000000,
79 10000,
80};
81
82static __u64 event_res[MAX_COUNTERS][3];
83static __u64 event_scaled[MAX_COUNTERS];
84
85static __u64 runtime_nsecs;
86static __u64 walltime_nsecs;
87static __u64 runtime_cycles;
88
89static void create_perf_stat_counter(int counter)
90{
91 struct perf_counter_attr *attr = attrs + counter;
92
93 if (scale)
94 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
95 PERF_FORMAT_TOTAL_TIME_RUNNING;
96
97 if (system_wide) {
98 int cpu;
99 for (cpu = 0; cpu < nr_cpus; cpu ++) {
100 fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
101 if (fd[cpu][counter] < 0 && verbose) {
102 printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
103 }
104 }
105 } else {
106 attr->inherit = inherit;
107 attr->disabled = 1;
108
109 fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
110 if (fd[0][counter] < 0 && verbose) {
111 printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno));
112 }
113 }
114}
115
116/*
117 * Does the counter have nsecs as a unit?
118 */
119static inline int nsec_counter(int counter)
120{
121 if (attrs[counter].type != PERF_TYPE_SOFTWARE)
122 return 0;
123
124 if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
125 return 1;
126
127 if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
128 return 1;
129
130 return 0;
131}
132
133/*
134 * Read out the results of a single counter:
135 */
136static void read_counter(int counter)
137{
138 __u64 *count, single_count[3];
139 ssize_t res;
140 int cpu, nv;
141 int scaled;
142
143 count = event_res[counter];
144
145 count[0] = count[1] = count[2] = 0;
146
147 nv = scale ? 3 : 1;
148 for (cpu = 0; cpu < nr_cpus; cpu ++) {
149 if (fd[cpu][counter] < 0)
150 continue;
151
152 res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
153 assert(res == nv * sizeof(__u64));
154
155 count[0] += single_count[0];
156 if (scale) {
157 count[1] += single_count[1];
158 count[2] += single_count[2];
159 }
160 }
161
162 scaled = 0;
163 if (scale) {
164 if (count[2] == 0) {
165 event_scaled[counter] = -1;
166 count[0] = 0;
167 return;
168 }
169
170 if (count[2] < count[1]) {
171 event_scaled[counter] = 1;
172 count[0] = (unsigned long long)
173 ((double)count[0] * count[1] / count[2] + 0.5);
174 }
175 }
176 /*
177 * Save the full runtime - to allow normalization during printout:
178 */
179 if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
180 attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
181 runtime_nsecs = count[0];
182 if (attrs[counter].type == PERF_TYPE_HARDWARE &&
183 attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
184 runtime_cycles = count[0];
185}
186
187/*
188 * Print out the results of a single counter:
189 */
190static void print_counter(int counter)
191{
192 __u64 *count;
193 int scaled;
194
195 count = event_res[counter];
196 scaled = event_scaled[counter];
197
198 if (scaled == -1) {
199 fprintf(stderr, " %14s %-20s\n",
200 "<not counted>", event_name(counter));
201 return;
202 }
203
204 if (nsec_counter(counter)) {
205 double msecs = (double)count[0] / 1000000;
206
207 fprintf(stderr, " %14.6f %-20s",
208 msecs, event_name(counter));
209 if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
210 attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
211
212 if (walltime_nsecs)
213 fprintf(stderr, " # %11.3f CPU utilization factor",
214 (double)count[0] / (double)walltime_nsecs);
215 }
216 } else {
217 fprintf(stderr, " %14Ld %-20s",
218 count[0], event_name(counter));
219 if (runtime_nsecs)
220 fprintf(stderr, " # %11.3f M/sec",
221 (double)count[0]/runtime_nsecs*1000.0);
222 if (runtime_cycles &&
223 attrs[counter].type == PERF_TYPE_HARDWARE &&
224 attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
225
226 fprintf(stderr, " # %1.3f per cycle",
227 (double)count[0] / (double)runtime_cycles);
228 }
229 }
230 if (scaled)
231 fprintf(stderr, " (scaled from %.2f%%)",
232 (double) count[2] / count[1] * 100);
233 fprintf(stderr, "\n");
234}
235
236static int do_perf_stat(int argc, const char **argv)
237{
238 unsigned long long t0, t1;
239 int counter;
240 int status;
241 int pid;
242 int i;
243
244 if (!system_wide)
245 nr_cpus = 1;
246
247 for (counter = 0; counter < nr_counters; counter++)
248 create_perf_stat_counter(counter);
249
250 /*
251 * Enable counters and exec the command:
252 */
253 t0 = rdclock();
254 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
255
256 if ((pid = fork()) < 0)
257 perror("failed to fork");
258
259 if (!pid) {
260 if (execvp(argv[0], (char **)argv)) {
261 perror(argv[0]);
262 exit(-1);
263 }
264 }
265
266 while (wait(&status) >= 0)
267 ;
268
269 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
270 t1 = rdclock();
271
272 walltime_nsecs = t1 - t0;
273
274 fflush(stdout);
275
276 fprintf(stderr, "\n");
277 fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
278
279 for (i = 1; i < argc; i++)
280 fprintf(stderr, " %s", argv[i]);
281
282 fprintf(stderr, "\':\n");
283 fprintf(stderr, "\n");
284
285 for (counter = 0; counter < nr_counters; counter++)
286 read_counter(counter);
287
288 for (counter = 0; counter < nr_counters; counter++)
289 print_counter(counter);
290
291
292 fprintf(stderr, "\n");
293 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
294 (double)(t1-t0)/1e6);
295 fprintf(stderr, "\n");
296
297 return 0;
298}
299
/*
 * Number of the last signal caught by skip_signal(), or -1 when none was
 * caught.  It is written from a signal handler, hence sig_atomic_t.
 */
static volatile sig_atomic_t signr = -1;

/* Signal handler: only remember which signal arrived. */
static void skip_signal(int signo)
{
	signr = signo;
}
306
307static void sig_atexit(void)
308{
309 if (signr == -1)
310 return;
311
312 signal(signr, SIG_DFL);
313 kill(getpid(), signr);
314}
315
316static const char * const stat_usage[] = {
317 "perf stat [<options>] <command>",
318 NULL
319};
320
/*
 * Command-line switches of 'perf stat'.  Apart from -e, which is handled
 * by the parse_events callback, each entry stores its value in one of
 * the file-scope settings (inherit, target_pid, system_wide, scale,
 * verbose).
 * NOTE(review): inherit and scale already default to 1 - confirm whether
 * OPT_BOOLEAN can ever turn them off.
 */
static const struct option options[] = {
	OPT_CALLBACK('e', "event", NULL, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
	OPT_INTEGER('p', "pid", &target_pid,
		    "stat events on existing pid"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('S', "scale", &scale,
		    "scale/normalize counters"),
	OPT_BOOLEAN('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_END()
};
337
338int cmd_stat(int argc, const char **argv, const char *prefix)
339{
340 page_size = sysconf(_SC_PAGE_SIZE);
341
342 memcpy(attrs, default_attrs, sizeof(attrs));
343
344 argc = parse_options(argc, argv, options, stat_usage, 0);
345 if (!argc)
346 usage_with_options(stat_usage, options);
347
348 if (!nr_counters)
349 nr_counters = 8;
350
351 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
352 assert(nr_cpus <= MAX_NR_CPUS);
353 assert(nr_cpus >= 0);
354
355 /*
356 * We dont want to block the signals - that would cause
357 * child tasks to inherit that and Ctrl-C would not work.
358 * What we want is for Ctrl-C to work in the exec()-ed
359 * task, but being ignored by perf stat itself:
360 */
361 atexit(sig_atexit);
362 signal(SIGINT, skip_signal);
363 signal(SIGALRM, skip_signal);
364 signal(SIGABRT, skip_signal);
365
366 return do_perf_stat(argc, argv);
367}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
new file mode 100644
index 000000000000..fe338d3c5d7e
--- /dev/null
+++ b/tools/perf/builtin-top.c
@@ -0,0 +1,736 @@
1/*
2 * builtin-top.c
3 *
4 * Builtin top command: Display a continuously updated profile of
5 * any workload, CPU or specific PID.
6 *
7 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
8 *
9 * Improvements and fixes by:
10 *
11 * Arjan van de Ven <arjan@linux.intel.com>
12 * Yanmin Zhang <yanmin.zhang@intel.com>
13 * Wu Fengguang <fengguang.wu@intel.com>
14 * Mike Galbraith <efault@gmx.de>
15 * Paul Mackerras <paulus@samba.org>
16 *
17 * Released under the GPL v2. (and only v2, not any later version)
18 */
19#include "builtin.h"
20
21#include "perf.h"
22
23#include "util/symbol.h"
24#include "util/color.h"
25#include "util/util.h"
26#include "util/rbtree.h"
27#include "util/parse-options.h"
28#include "util/parse-events.h"
29
30#include <assert.h>
31#include <fcntl.h>
32
33#include <stdio.h>
34
35#include <errno.h>
36#include <time.h>
37#include <sched.h>
38#include <pthread.h>
39
40#include <sys/syscall.h>
41#include <sys/ioctl.h>
42#include <sys/poll.h>
43#include <sys/prctl.h>
44#include <sys/wait.h>
45#include <sys/uio.h>
46#include <sys/mman.h>
47
48#include <linux/unistd.h>
49#include <linux/types.h>
50
51static int fd[MAX_NR_CPUS][MAX_COUNTERS];
52
53static int system_wide = 0;
54
55static int default_interval = 100000;
56
57static __u64 count_filter = 5;
58static int print_entries = 15;
59
60static int target_pid = -1;
61static int profile_cpu = -1;
62static int nr_cpus = 0;
63static unsigned int realtime_prio = 0;
64static int group = 0;
65static unsigned int page_size;
66static unsigned int mmap_pages = 16;
67static int freq = 0;
68static int verbose = 0;
69
70static char *sym_filter;
71static unsigned long filter_start;
72static unsigned long filter_end;
73
74static int delay_secs = 2;
75static int zero;
76static int dump_symtab;
77
78/*
79 * Symbols
80 */
81
82static __u64 min_ip;
83static __u64 max_ip = -1ll;
84
/*
 * Per-symbol profiling state used by 'perf top'.
 */
struct sym_entry {
	struct rb_node rb_node;			/* link in the weight-sorted tree built by print_sym_table() */
	struct list_head node;			/* link in the active_symbols list */
	unsigned long count[MAX_COUNTERS];	/* hits per counter, bumped by record_ip() */
	unsigned long snap_count;		/* count[0] as sampled at the last refresh */
	double weight;				/* ordering weight computed by sym_weight() */
	int skip;				/* set by symbol_filter() for the idle functions */
};
93
94struct sym_entry *sym_filter_entry;
95
96struct dso *kernel_dso;
97
98/*
99 * Symbols will be added here in record_ip and will get out
100 * after decayed.
101 */
102static LIST_HEAD(active_symbols);
103static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
104
105/*
106 * Ordering weight: count-1 * count-2 * ... / count-n
107 */
108static double sym_weight(const struct sym_entry *sym)
109{
110 double weight = sym->snap_count;
111 int counter;
112
113 for (counter = 1; counter < nr_counters-1; counter++)
114 weight *= sym->count[counter];
115
116 weight /= (sym->count[counter] + 1);
117
118 return weight;
119}
120
121static long samples;
122static long userspace_samples;
123static const char CONSOLE_CLEAR[] = "";
124
125static void __list_insert_active_sym(struct sym_entry *syme)
126{
127 list_add(&syme->node, &active_symbols);
128}
129
130static void list_remove_active_sym(struct sym_entry *syme)
131{
132 pthread_mutex_lock(&active_symbols_lock);
133 list_del_init(&syme->node);
134 pthread_mutex_unlock(&active_symbols_lock);
135}
136
137static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
138{
139 struct rb_node **p = &tree->rb_node;
140 struct rb_node *parent = NULL;
141 struct sym_entry *iter;
142
143 while (*p != NULL) {
144 parent = *p;
145 iter = rb_entry(parent, struct sym_entry, rb_node);
146
147 if (se->weight > iter->weight)
148 p = &(*p)->rb_left;
149 else
150 p = &(*p)->rb_right;
151 }
152
153 rb_link_node(&se->rb_node, parent, p);
154 rb_insert_color(&se->rb_node, tree);
155}
156
157static void print_sym_table(void)
158{
159 int printed = 0, j;
160 int counter;
161 float samples_per_sec = samples/delay_secs;
162 float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
163 float sum_ksamples = 0.0;
164 struct sym_entry *syme, *n;
165 struct rb_root tmp = RB_ROOT;
166 struct rb_node *nd;
167
168 samples = userspace_samples = 0;
169
170 /* Sort the active symbols */
171 pthread_mutex_lock(&active_symbols_lock);
172 syme = list_entry(active_symbols.next, struct sym_entry, node);
173 pthread_mutex_unlock(&active_symbols_lock);
174
175 list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
176 syme->snap_count = syme->count[0];
177 if (syme->snap_count != 0) {
178 syme->weight = sym_weight(syme);
179 rb_insert_active_sym(&tmp, syme);
180 sum_ksamples += syme->snap_count;
181
182 for (j = 0; j < nr_counters; j++)
183 syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
184 } else
185 list_remove_active_sym(syme);
186 }
187
188 puts(CONSOLE_CLEAR);
189
190 printf(
191"------------------------------------------------------------------------------\n");
192 printf( " PerfTop:%8.0f irqs/sec kernel:%4.1f%% [",
193 samples_per_sec,
194 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
195
196 if (nr_counters == 1) {
197 printf("%Ld", attrs[0].sample_period);
198 if (freq)
199 printf("Hz ");
200 else
201 printf(" ");
202 }
203
204 for (counter = 0; counter < nr_counters; counter++) {
205 if (counter)
206 printf("/");
207
208 printf("%s", event_name(counter));
209 }
210
211 printf( "], ");
212
213 if (target_pid != -1)
214 printf(" (target_pid: %d", target_pid);
215 else
216 printf(" (all");
217
218 if (profile_cpu != -1)
219 printf(", cpu: %d)\n", profile_cpu);
220 else {
221 if (target_pid != -1)
222 printf(")\n");
223 else
224 printf(", %d CPUs)\n", nr_cpus);
225 }
226
227 printf("------------------------------------------------------------------------------\n\n");
228
229 if (nr_counters == 1)
230 printf(" samples pcnt");
231 else
232 printf(" weight samples pcnt");
233
234 printf(" RIP kernel function\n"
235 " ______ _______ _____ ________________ _______________\n\n"
236 );
237
238 for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
239 struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
240 struct symbol *sym = (struct symbol *)(syme + 1);
241 char *color = PERF_COLOR_NORMAL;
242 double pcnt;
243
244 if (++printed > print_entries || syme->snap_count < count_filter)
245 continue;
246
247 pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
248 sum_ksamples));
249
250 /*
251 * We color high-overhead entries in red, mid-overhead
252 * entries in green - and keep the low overhead places
253 * normal:
254 */
255 if (pcnt >= 5.0) {
256 color = PERF_COLOR_RED;
257 } else {
258 if (pcnt >= 0.5)
259 color = PERF_COLOR_GREEN;
260 }
261
262 if (nr_counters == 1)
263 printf("%20.2f - ", syme->weight);
264 else
265 printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
266
267 color_fprintf(stdout, color, "%4.1f%%", pcnt);
268 printf(" - %016llx : %s\n", sym->start, sym->name);
269 }
270}
271
272static void *display_thread(void *arg)
273{
274 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
275 int delay_msecs = delay_secs * 1000;
276
277 printf("PerfTop refresh period: %d seconds\n", delay_secs);
278
279 do {
280 print_sym_table();
281 } while (!poll(&stdin_poll, 1, delay_msecs) == 1);
282
283 printf("key pressed - exiting.\n");
284 exit(0);
285
286 return NULL;
287}
288
289static int symbol_filter(struct dso *self, struct symbol *sym)
290{
291 static int filter_match;
292 struct sym_entry *syme;
293 const char *name = sym->name;
294
295 if (!strcmp(name, "_text") ||
296 !strcmp(name, "_etext") ||
297 !strcmp(name, "_sinittext") ||
298 !strncmp("init_module", name, 11) ||
299 !strncmp("cleanup_module", name, 14) ||
300 strstr(name, "_text_start") ||
301 strstr(name, "_text_end"))
302 return 1;
303
304 syme = dso__sym_priv(self, sym);
305 /* Tag samples to be skipped. */
306 if (!strcmp("default_idle", name) ||
307 !strcmp("cpu_idle", name) ||
308 !strcmp("enter_idle", name) ||
309 !strcmp("exit_idle", name) ||
310 !strcmp("mwait_idle", name))
311 syme->skip = 1;
312
313 if (filter_match == 1) {
314 filter_end = sym->start;
315 filter_match = -1;
316 if (filter_end - filter_start > 10000) {
317 fprintf(stderr,
318 "hm, too large filter symbol <%s> - skipping.\n",
319 sym_filter);
320 fprintf(stderr, "symbol filter start: %016lx\n",
321 filter_start);
322 fprintf(stderr, " end: %016lx\n",
323 filter_end);
324 filter_end = filter_start = 0;
325 sym_filter = NULL;
326 sleep(1);
327 }
328 }
329
330 if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
331 filter_match = 1;
332 filter_start = sym->start;
333 }
334
335
336 return 0;
337}
338
339static int parse_symbols(void)
340{
341 struct rb_node *node;
342 struct symbol *sym;
343
344 kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
345 if (kernel_dso == NULL)
346 return -1;
347
348 if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
349 goto out_delete_dso;
350
351 node = rb_first(&kernel_dso->syms);
352 sym = rb_entry(node, struct symbol, rb_node);
353 min_ip = sym->start;
354
355 node = rb_last(&kernel_dso->syms);
356 sym = rb_entry(node, struct symbol, rb_node);
357 max_ip = sym->end;
358
359 if (dump_symtab)
360 dso__fprintf(kernel_dso, stderr);
361
362 return 0;
363
364out_delete_dso:
365 dso__delete(kernel_dso);
366 kernel_dso = NULL;
367 return -1;
368}
369
370#define TRACE_COUNT 3
371
372/*
373 * Binary search in the histogram table and record the hit:
374 */
375static void record_ip(__u64 ip, int counter)
376{
377 struct symbol *sym = dso__find_symbol(kernel_dso, ip);
378
379 if (sym != NULL) {
380 struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
381
382 if (!syme->skip) {
383 syme->count[counter]++;
384 pthread_mutex_lock(&active_symbols_lock);
385 if (list_empty(&syme->node) || !syme->node.next)
386 __list_insert_active_sym(syme);
387 pthread_mutex_unlock(&active_symbols_lock);
388 return;
389 }
390 }
391
392 samples--;
393}
394
395static void process_event(__u64 ip, int counter)
396{
397 samples++;
398
399 if (ip < min_ip || ip > max_ip) {
400 userspace_samples++;
401 return;
402 }
403
404 record_ip(ip, counter);
405}
406
/*
 * Reader state for one mmap()ed counter buffer.
 */
struct mmap_data {
	int counter;		/* counter index passed on to process_event() */
	void *base;		/* address returned by mmap(); the data starts one page in */
	unsigned int mask;	/* data area size minus one, used to wrap offsets */
	unsigned int prev;	/* position up to which the data has been consumed */
};
413
414static unsigned int mmap_read_head(struct mmap_data *md)
415{
416 struct perf_counter_mmap_page *pc = md->base;
417 int head;
418
419 head = pc->data_head;
420 rmb();
421
422 return head;
423}
424
425struct timeval last_read, this_read;
426
/*
 * Consume all events between md->prev and the current head of one
 * counter's mmap buffer and feed the overflow samples to
 * process_event().  md->prev is advanced to the position reached.
 */
static void mmap_read_counter(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	/* The event data starts one page after the mapping base. */
	unsigned char *data = md->base + page_size;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
	 * the writer will bite our tail and mess up the samples under us.
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff > md->mask / 2 || diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				" Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	for (; old != head;) {
		/* Local views of the event layouts read from the buffer. */
		struct ip_event {
			struct perf_event_header header;
			__u64 ip;
			__u32 pid, target_pid;
		};
		struct mmap_event {
			struct perf_event_header header;
			__u32 pid, target_pid;
			__u64 start;
			__u64 len;
			__u64 pgoff;
			char filename[PATH_MAX];
		};

		typedef union event_union {
			struct perf_event_header header;
			struct ip_event ip;
			struct mmap_event mmap;
		} event_t;

		event_t *event = (event_t *)&data[old & md->mask];

		event_t event_copy;

		size_t size = event->header.size;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((old & md->mask) + size != ((old + size) & md->mask)) {
			unsigned int offset = old;
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = &event_copy;

			/*
			 * Copy the event piecewise into event_copy, each
			 * piece ending at the wrap-around point at the latest.
			 */
			do {
				cpy = min(md->mask + 1 - (offset & md->mask), len);
				memcpy(dst, &data[offset & md->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = &event_copy;
		}

		/*
		 * Advance by the size taken from the header.
		 * NOTE(review): a size of 0 would never advance - confirm the
		 * kernel cannot emit one.
		 */
		old += size;

		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
			if (event->header.type & PERF_SAMPLE_IP)
				process_event(event->ip.ip, md->counter);
		}
	}

	md->prev = old;
}
520
521static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
522static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
523
524static void mmap_read(void)
525{
526 int i, counter;
527
528 for (i = 0; i < nr_cpus; i++) {
529 for (counter = 0; counter < nr_counters; counter++)
530 mmap_read_counter(&mmap_array[i][counter]);
531 }
532}
533
534int nr_poll;
535int group_fd;
536
537static void start_counter(int i, int counter)
538{
539 struct perf_counter_attr *attr;
540 unsigned int cpu;
541
542 cpu = profile_cpu;
543 if (target_pid == -1 && profile_cpu == -1)
544 cpu = i;
545
546 attr = attrs + counter;
547
548 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
549 attr->freq = freq;
550
551try_again:
552 fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
553
554 if (fd[i][counter] < 0) {
555 int err = errno;
556
557 if (err == EPERM)
558 die("No permission - are you root?\n");
559 /*
560 * If it's cycles then fall back to hrtimer
561 * based cpu-clock-tick sw counter, which
562 * is always available even if no PMU support:
563 */
564 if (attr->type == PERF_TYPE_HARDWARE
565 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
566
567 if (verbose)
568 warning(" ... trying to fall back to cpu-clock-ticks\n");
569
570 attr->type = PERF_TYPE_SOFTWARE;
571 attr->config = PERF_COUNT_SW_CPU_CLOCK;
572 goto try_again;
573 }
574 printf("\n");
575 error("perfcounter syscall returned with %d (%s)\n",
576 fd[i][counter], strerror(err));
577 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
578 exit(-1);
579 }
580 assert(fd[i][counter] >= 0);
581 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
582
583 /*
584 * First counter acts as the group leader:
585 */
586 if (group && group_fd == -1)
587 group_fd = fd[i][counter];
588
589 event_array[nr_poll].fd = fd[i][counter];
590 event_array[nr_poll].events = POLLIN;
591 nr_poll++;
592
593 mmap_array[i][counter].counter = counter;
594 mmap_array[i][counter].prev = 0;
595 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
596 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
597 PROT_READ, MAP_SHARED, fd[i][counter], 0);
598 if (mmap_array[i][counter].base == MAP_FAILED)
599 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
600}
601
602static int __cmd_top(void)
603{
604 pthread_t thread;
605 int i, counter;
606 int ret;
607
608 for (i = 0; i < nr_cpus; i++) {
609 group_fd = -1;
610 for (counter = 0; counter < nr_counters; counter++)
611 start_counter(i, counter);
612 }
613
614 /* Wait for a minimal set of events before starting the snapshot */
615 poll(event_array, nr_poll, 100);
616
617 mmap_read();
618
619 if (pthread_create(&thread, NULL, display_thread, NULL)) {
620 printf("Could not create display thread.\n");
621 exit(-1);
622 }
623
624 if (realtime_prio) {
625 struct sched_param param;
626
627 param.sched_priority = realtime_prio;
628 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
629 printf("Could not set realtime priority.\n");
630 exit(-1);
631 }
632 }
633
634 while (1) {
635 int hits = samples;
636
637 mmap_read();
638
639 if (hits == samples)
640 ret = poll(event_array, nr_poll, 100);
641 }
642
643 return 0;
644}
645
646static const char * const top_usage[] = {
647 "perf top [<options>]",
648 NULL
649};
650
651static const struct option options[] = {
652 OPT_CALLBACK('e', "event", NULL, "event",
653 "event selector. use 'perf list' to list available events",
654 parse_events),
655 OPT_INTEGER('c', "count", &default_interval,
656 "event period to sample"),
657 OPT_INTEGER('p', "pid", &target_pid,
658 "profile events on existing pid"),
659 OPT_BOOLEAN('a', "all-cpus", &system_wide,
660 "system-wide collection from all CPUs"),
661 OPT_INTEGER('C', "CPU", &profile_cpu,
662 "CPU to profile on"),
663 OPT_INTEGER('m', "mmap-pages", &mmap_pages,
664 "number of mmap data pages"),
665 OPT_INTEGER('r', "realtime", &realtime_prio,
666 "collect data with this RT SCHED_FIFO priority"),
667 OPT_INTEGER('d', "delay", &delay_secs,
668 "number of seconds to delay between refreshes"),
669 OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
670 "dump the symbol table used for profiling"),
671 OPT_INTEGER('f', "count-filter", &count_filter,
672 "only display functions with more events than this"),
673 OPT_BOOLEAN('g', "group", &group,
674 "put the counters into a counter group"),
675 OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
676 "only display symbols matchig this pattern"),
677 OPT_BOOLEAN('z', "zero", &group,
678 "zero history across updates"),
679 OPT_INTEGER('F', "freq", &freq,
680 "profile at this frequency"),
681 OPT_INTEGER('E', "entries", &print_entries,
682 "display this many functions"),
683 OPT_BOOLEAN('v', "verbose", &verbose,
684 "be more verbose (show counter open errors, etc)"),
685 OPT_END()
686};
687
688int cmd_top(int argc, const char **argv, const char *prefix)
689{
690 int counter;
691
692 page_size = sysconf(_SC_PAGE_SIZE);
693
694 argc = parse_options(argc, argv, options, top_usage, 0);
695 if (argc)
696 usage_with_options(top_usage, options);
697
698 if (freq) {
699 default_interval = freq;
700 freq = 1;
701 }
702
703 /* CPU and PID are mutually exclusive */
704 if (target_pid != -1 && profile_cpu != -1) {
705 printf("WARNING: PID switch overriding CPU\n");
706 sleep(1);
707 profile_cpu = -1;
708 }
709
710 if (!nr_counters)
711 nr_counters = 1;
712
713 if (delay_secs < 1)
714 delay_secs = 1;
715
716 parse_symbols();
717
718 /*
719 * Fill in the ones not specifically initialized via -c:
720 */
721 for (counter = 0; counter < nr_counters; counter++) {
722 if (attrs[counter].sample_period)
723 continue;
724
725 attrs[counter].sample_period = default_interval;
726 }
727
728 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
729 assert(nr_cpus <= MAX_NR_CPUS);
730 assert(nr_cpus >= 0);
731
732 if (target_pid != -1 || profile_cpu != -1)
733 nr_cpus = 1;
734
735 return __cmd_top();
736}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
new file mode 100644
index 000000000000..51d168230ee7
--- /dev/null
+++ b/tools/perf/builtin.h
@@ -0,0 +1,26 @@
1#ifndef BUILTIN_H
2#define BUILTIN_H
3
4#include "util/util.h"
5#include "util/strbuf.h"
6
7extern const char perf_version_string[];
8extern const char perf_usage_string[];
9extern const char perf_more_info_string[];
10
11extern void list_common_cmds_help(void);
12extern const char *help_unknown_cmd(const char *cmd);
13extern void prune_packed_objects(int);
14extern int read_line_with_nul(char *buf, int size, FILE *file);
15extern int check_pager_config(const char *cmd);
16
17extern int cmd_annotate(int argc, const char **argv, const char *prefix);
18extern int cmd_help(int argc, const char **argv, const char *prefix);
19extern int cmd_record(int argc, const char **argv, const char *prefix);
20extern int cmd_report(int argc, const char **argv, const char *prefix);
21extern int cmd_stat(int argc, const char **argv, const char *prefix);
22extern int cmd_top(int argc, const char **argv, const char *prefix);
23extern int cmd_version(int argc, const char **argv, const char *prefix);
24extern int cmd_list(int argc, const char **argv, const char *prefix);
25
26#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
new file mode 100644
index 000000000000..eebce30afbc0
--- /dev/null
+++ b/tools/perf/command-list.txt
@@ -0,0 +1,10 @@
1#
2# List of known perf commands.
3# command name category [deprecated] [common]
4#
5perf-annotate mainporcelain common
6perf-list mainporcelain common
7perf-record mainporcelain common
8perf-report mainporcelain common
9perf-stat mainporcelain common
10perf-top mainporcelain common
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
new file mode 100644
index 000000000000..860e116d979c
--- /dev/null
+++ b/tools/perf/design.txt
@@ -0,0 +1,442 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count the number of certain types of hw events: such
7as instructions executed, cachemisses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events have passed - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and it provides event capabilities on top of those. It
15provides "virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
17
18Performance counters are accessed via special file descriptors.
19There's one file descriptor per virtual counter used.
20
21The special file descriptor is opened via the perf_counter_open()
22system call:
23
24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
27
28The syscall returns the new fd. The fd can be used via the normal
29VFS system calls: read() can be used to read the counter, fcntl()
30can be used to set the blocking mode, etc.
31
32Multiple counters can be kept open at a time, and the counters
33can be poll()ed.
34
35When creating a new counter fd, 'perf_counter_hw_event' is:
36
37struct perf_counter_hw_event {
38 /*
39 * The MSB of the config word signifies if the rest contains cpu
40 * specific (raw) counter configuration data, if unset, the next
41 * 7 bits are an event type and the rest of the bits are the event
42 * identifier.
43 */
44 __u64 config;
45
46 __u64 irq_period;
47 __u32 record_type;
48 __u32 read_format;
49
50 __u64 disabled : 1, /* off by default */
51 inherit : 1, /* children inherit it */
52 pinned : 1, /* must always be on PMU */
53 exclusive : 1, /* only group on PMU */
54 exclude_user : 1, /* don't count user */
55 exclude_kernel : 1, /* ditto kernel */
56 exclude_hv : 1, /* ditto hypervisor */
57 exclude_idle : 1, /* don't count when idle */
58 mmap : 1, /* include mmap data */
59 munmap : 1, /* include munmap data */
60 comm : 1, /* include comm data */
61
62 __reserved_1 : 52;
63
64 __u32 extra_config_len;
65 __u32 wakeup_events; /* wakeup every n events */
66
67 __u64 __reserved_2;
68 __u64 __reserved_3;
69};
70
71The 'config' field specifies what the counter should count. It
72is divided into 3 bit-fields:
73
74raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
75type: 7 bits (next most significant) 0x7f00_0000_0000_0000
76event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
77
If 'raw_type' is 1, then the counter will count a hardware event
specified by the remaining 63 bits of 'config'. The encoding is
machine-specific.
81
82If 'raw_type' is 0, then the 'type' field says what kind of counter
83this is, with the following encoding:
84
85enum perf_event_types {
86 PERF_TYPE_HARDWARE = 0,
87 PERF_TYPE_SOFTWARE = 1,
88 PERF_TYPE_TRACEPOINT = 2,
89};
90
91A counter of PERF_TYPE_HARDWARE will count the hardware event
92specified by 'event_id':
93
94/*
95 * Generalized performance counter event types, used by the hw_event.event_id
96 * parameter of the sys_perf_counter_open() syscall:
97 */
98enum hw_event_ids {
99 /*
100 * Common hardware events, generalized by the kernel:
101 */
102 PERF_COUNT_HW_CPU_CYCLES = 0,
103 PERF_COUNT_HW_INSTRUCTIONS = 1,
104 PERF_COUNT_HW_CACHE_REFERENCES = 2,
105 PERF_COUNT_HW_CACHE_MISSES = 3,
106 PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4,
107 PERF_COUNT_HW_BRANCH_MISSES = 5,
108 PERF_COUNT_HW_BUS_CYCLES = 6,
109};
110
111These are standardized types of events that work relatively uniformly
112on all CPUs that implement Performance Counters support under Linux,
113although there may be variations (e.g., different CPUs might count
114cache references and misses at different levels of the cache hierarchy).
115If a CPU is not able to count the selected event, then the system call
116will return -EINVAL.
117
118More hw_event_types are supported as well, but they are CPU-specific
119and accessed as raw events. For example, to count "External bus
120cycles while bus lock signal asserted" events on Intel Core CPUs, pass
121in a 0x4064 event_id value and set hw_event.raw_type to 1.
122
123A counter of type PERF_TYPE_SOFTWARE will count one of the available
124software events, selected by 'event_id':
125
126/*
127 * Special "software" counters provided by the kernel, even if the hardware
128 * does not support performance counters. These counters measure various
129 * physical and sw events of the kernel (and allow the profiling of them as
130 * well):
131 */
132enum sw_event_ids {
133 PERF_COUNT_SW_CPU_CLOCK = 0,
134 PERF_COUNT_SW_TASK_CLOCK = 1,
135 PERF_COUNT_SW_PAGE_FAULTS = 2,
136 PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
137 PERF_COUNT_SW_CPU_MIGRATIONS = 4,
138 PERF_COUNT_SW_PAGE_FAULTS_MIN = 5,
139 PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6,
140};
141
142Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
143tracer is available, and event_id values can be obtained from
144/debug/tracing/events/*/*/id
145
146
147Counters come in two flavours: counting counters and sampling
148counters. A "counting" counter is one that is used for counting the
149number of events that occur, and is characterised by having
150irq_period = 0.
151
152
A read() on a counter returns the current value of the counter and possible
additional values as specified by 'read_format'; each value is a u64 (8 bytes)
in size.
156
157/*
158 * Bits that can be set in hw_event.read_format to request that
159 * reads on the counter should return the indicated quantities,
160 * in increasing order of bit value, after the counter value.
161 */
162enum perf_counter_read_format {
163 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
164 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
165};
166
167Using these additional values one can establish the overcommit ratio for a
168particular counter allowing one to take the round-robin scheduling effect
169into account.
170
171
172A "sampling" counter is one that is set up to generate an interrupt
173every N events, where N is given by 'irq_period'. A sampling counter
174has irq_period > 0. The record_type controls what data is recorded on each
175interrupt:
176
177/*
178 * Bits that can be set in hw_event.record_type to request information
179 * in the overflow packets.
180 */
181enum perf_counter_record_format {
182 PERF_RECORD_IP = 1U << 0,
183 PERF_RECORD_TID = 1U << 1,
184 PERF_RECORD_TIME = 1U << 2,
185 PERF_RECORD_ADDR = 1U << 3,
186 PERF_RECORD_GROUP = 1U << 4,
187 PERF_RECORD_CALLCHAIN = 1U << 5,
188};
189
190Such (and other) events will be recorded in a ring-buffer, which is
191available to user-space using mmap() (see below).
192
193The 'disabled' bit specifies whether the counter starts out disabled
194or enabled. If it is initially disabled, it can be enabled by ioctl
195or prctl (see below).
196
197The 'inherit' bit, if set, specifies that this counter should count
198events on descendant tasks as well as the task specified. This only
199applies to new descendents, not to any existing descendents at the
200time the counter is created (nor to any new descendents of existing
201descendents).
202
203The 'pinned' bit, if set, specifies that the counter should always be
204on the CPU if at all possible. It only applies to hardware counters
205and only to group leaders. If a pinned counter cannot be put onto the
206CPU (e.g. because there are not enough hardware counters or because of
207a conflict with some other event), then the counter goes into an
208'error' state, where reads return end-of-file (i.e. read() returns 0)
209until the counter is subsequently enabled or disabled.
210
211The 'exclusive' bit, if set, specifies that when this counter's group
212is on the CPU, it should be the only group using the CPU's counters.
213In future, this will allow sophisticated monitoring programs to supply
214extra configuration information via 'extra_config_len' to exploit
215advanced features of the CPU's Performance Monitor Unit (PMU) that are
216not otherwise accessible and that might disrupt other hardware
217counters.
218
219The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
220way to request that counting of events be restricted to times when the
221CPU is in user, kernel and/or hypervisor mode.
222
The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
operations. These can be used to relate userspace IP addresses to actual
code, even after the mapping (or even the whole process) is gone.
These events are recorded in the ring-buffer (see below).
227
228The 'comm' bit allows tracking of process comm data on process creation.
229This too is recorded in the ring-buffer (see below).
230
231The 'pid' parameter to the perf_counter_open() system call allows the
232counter to be specific to a task:
233
234 pid == 0: if the pid parameter is zero, the counter is attached to the
235 current task.
236
237 pid > 0: the counter is attached to a specific task (if the current task
238 has sufficient privilege to do so)
239
240 pid < 0: all tasks are counted (per cpu counters)
241
242The 'cpu' parameter allows a counter to be made specific to a CPU:
243
244 cpu >= 0: the counter is restricted to a specific CPU
245 cpu == -1: the counter counts on all CPUs
246
247(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
248
A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
events of that task and 'follows' that task to whatever CPU the task
gets scheduled to. Per task counters can be created by any user, for
their own tasks.
253
254A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
255all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
256
257The 'flags' parameter is currently unused and must be zero.
258
259The 'group_fd' parameter allows counter "groups" to be set up. A
260counter group has one counter which is the group "leader". The leader
261is created first, with group_fd = -1 in the perf_counter_open call
262that creates it. The rest of the group members are created
263subsequently, with group_fd giving the fd of the group leader.
264(A single counter on its own is created with group_fd = -1 and is
265considered to be a group with only 1 member.)
266
267A counter group is scheduled onto the CPU as a unit, that is, it will
268only be put onto the CPU if all of the counters in the group can be
269put onto the CPU. This means that the values of the member counters
270can be meaningfully compared, added, divided (to get ratios), etc.,
271with each other, since they have counted events for the same set of
272executed instructions.
273
274
As stated above, asynchronous events, such as counter overflow or PROT_EXEC
mmap tracking, are logged into a ring-buffer. This ring-buffer is created and
accessed through mmap().
278
279The mmap size should be 1+2^n pages, where the first page is a meta-data page
280(struct perf_counter_mmap_page) that contains various bits of information such
281as where the ring-buffer head is.
282
283/*
284 * Structure of the page that can be mapped via mmap
285 */
286struct perf_counter_mmap_page {
287 __u32 version; /* version number of this structure */
288 __u32 compat_version; /* lowest version this is compat with */
289
290 /*
291 * Bits needed to read the hw counters in user-space.
292 *
293 * u32 seq;
294 * s64 count;
295 *
296 * do {
297 * seq = pc->lock;
298 *
299 * barrier()
300 * if (pc->index) {
301 * count = pmc_read(pc->index - 1);
302 * count += pc->offset;
303 * } else
304 * goto regular_read;
305 *
306 * barrier();
307 * } while (pc->lock != seq);
308 *
309 * NOTE: for obvious reason this only works on self-monitoring
310 * processes.
311 */
312 __u32 lock; /* seqlock for synchronization */
313 __u32 index; /* hardware counter identifier */
314 __s64 offset; /* add to hardware counter value */
315
316 /*
317 * Control data for the mmap() data buffer.
318 *
319 * User-space reading this value should issue an rmb(), on SMP capable
320 * platforms, after reading this value -- see perf_counter_wakeup().
321 */
322 __u32 data_head; /* head in the data section */
323};
324
325NOTE: the hw-counter userspace bits are arch specific and are currently only
326 implemented on powerpc.
327
328The following 2^n pages are the ring-buffer which contains events of the form:
329
330#define PERF_EVENT_MISC_KERNEL (1 << 0)
331#define PERF_EVENT_MISC_USER (1 << 1)
332#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
333
334struct perf_event_header {
335 __u32 type;
336 __u16 misc;
337 __u16 size;
338};
339
340enum perf_event_type {
341
342 /*
343 * The MMAP events record the PROT_EXEC mappings so that we can
344 * correlate userspace IPs to code. They have the following structure:
345 *
346 * struct {
347 * struct perf_event_header header;
348 *
349 * u32 pid, tid;
350 * u64 addr;
351 * u64 len;
352 * u64 pgoff;
353 * char filename[];
354 * };
355 */
356 PERF_EVENT_MMAP = 1,
357 PERF_EVENT_MUNMAP = 2,
358
359 /*
360 * struct {
361 * struct perf_event_header header;
362 *
363 * u32 pid, tid;
364 * char comm[];
365 * };
366 */
367 PERF_EVENT_COMM = 3,
368
369 /*
370 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
371 * will be PERF_RECORD_*
372 *
373 * struct {
374 * struct perf_event_header header;
375 *
376 * { u64 ip; } && PERF_RECORD_IP
377 * { u32 pid, tid; } && PERF_RECORD_TID
378 * { u64 time; } && PERF_RECORD_TIME
379 * { u64 addr; } && PERF_RECORD_ADDR
380 *
381 * { u64 nr;
382 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
383 *
384 * { u16 nr,
385 * hv,
386 * kernel,
387 * user;
388 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
389 * };
390 */
391};
392
393NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
394 on x86.
395
396Notification of new events is possible through poll()/select()/epoll() and
397fcntl() managing signals.
398
399Normally a notification is generated for every page filled, however one can
400additionally set perf_counter_hw_event.wakeup_events to generate one every
401so many counter overflow events.
402
403Future work will include a splice() interface to the ring-buffer.
404
405
406Counters can be enabled and disabled in two ways: via ioctl and via
407prctl. When a counter is disabled, it doesn't count or generate
408events but does continue to exist and maintain its count value.
409
410An individual counter or counter group can be enabled with
411
412 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
413
414or disabled with
415
416 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
417
Enabling or disabling the leader of a group enables or disables the
whole group; that is, while the group leader is disabled, none of the
counters in the group will count. Enabling or disabling a member of a
group other than the leader only affects that counter - disabling a
non-leader stops that counter from counting but doesn't affect any
other counter.
424
425Additionally, non-inherited overflow counters can use
426
427 ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
428
429to enable a counter for 'nr' events, after which it gets disabled again.
430
431A process can enable or disable all the counter groups that are
432attached to it, using prctl:
433
434 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
435
436 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
437
438This applies to all counters on the current process, whether created
439by this process or by another, and doesn't affect any counters that
440this process has created on other processes. It only enables or
441disables the group leaders, not any other members in the groups.
442
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
new file mode 100644
index 000000000000..4eb725933703
--- /dev/null
+++ b/tools/perf/perf.c
@@ -0,0 +1,428 @@
1/*
2 * perf.c
3 *
4 * Performance analysis utility.
5 *
6 * This is the main hub from which the sub-commands (perf stat,
7 * perf top, perf record, perf report, etc.) are started.
8 */
9#include "builtin.h"
10
11#include "util/exec_cmd.h"
12#include "util/cache.h"
13#include "util/quote.h"
14#include "util/run-command.h"
15
16const char perf_usage_string[] =
17 "perf [--version] [--help] COMMAND [ARGS]";
18
19const char perf_more_info_string[] =
20 "See 'perf help COMMAND' for more information on a specific command.";
21
22static int use_pager = -1;
/* Lookup state handed to pager_command_config() through perf_config(). */
struct pager_config {
	const char *cmd;	/* sub-command whose "pager.<cmd>" key is wanted */
	int val;		/* result slot, written when the key is seen */
};

/*
 * perf_config() callback: when 'var' is exactly "pager.<cmd>" for the
 * command named in 'data', store the boolean reading of 'value' in the
 * lookup state.  Always returns 0.
 */
static int pager_command_config(const char *var, const char *value, void *data)
{
	struct pager_config *lookup = data;

	if (prefixcmp(var, "pager."))
		return 0;
	if (strcmp(var + 6, lookup->cmd))
		return 0;

	lookup->val = perf_config_bool(var, value);
	return 0;
}
35
36/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
37int check_pager_config(const char *cmd)
38{
39 struct pager_config c;
40 c.cmd = cmd;
41 c.val = -1;
42 perf_config(pager_command_config, &c);
43 return c.val;
44}
45
46static void commit_pager_choice(void) {
47 switch (use_pager) {
48 case 0:
49 setenv("PERF_PAGER", "cat", 1);
50 break;
51 case 1:
52 /* setup_pager(); */
53 break;
54 default:
55 break;
56 }
57}
58
59static int handle_options(const char*** argv, int* argc, int* envchanged)
60{
61 int handled = 0;
62
63 while (*argc > 0) {
64 const char *cmd = (*argv)[0];
65 if (cmd[0] != '-')
66 break;
67
68 /*
69 * For legacy reasons, the "version" and "help"
70 * commands can be written with "--" prepended
71 * to make them look like flags.
72 */
73 if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
74 break;
75
76 /*
77 * Check remaining flags.
78 */
79 if (!prefixcmp(cmd, "--exec-path")) {
80 cmd += 11;
81 if (*cmd == '=')
82 perf_set_argv_exec_path(cmd + 1);
83 else {
84 puts(perf_exec_path());
85 exit(0);
86 }
87 } else if (!strcmp(cmd, "--html-path")) {
88 puts(system_path(PERF_HTML_PATH));
89 exit(0);
90 } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
91 use_pager = 1;
92 } else if (!strcmp(cmd, "--no-pager")) {
93 use_pager = 0;
94 if (envchanged)
95 *envchanged = 1;
96 } else if (!strcmp(cmd, "--perf-dir")) {
97 if (*argc < 2) {
98 fprintf(stderr, "No directory given for --perf-dir.\n" );
99 usage(perf_usage_string);
100 }
101 setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
102 if (envchanged)
103 *envchanged = 1;
104 (*argv)++;
105 (*argc)--;
106 handled++;
107 } else if (!prefixcmp(cmd, "--perf-dir=")) {
108 setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
109 if (envchanged)
110 *envchanged = 1;
111 } else if (!strcmp(cmd, "--work-tree")) {
112 if (*argc < 2) {
113 fprintf(stderr, "No directory given for --work-tree.\n" );
114 usage(perf_usage_string);
115 }
116 setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
117 if (envchanged)
118 *envchanged = 1;
119 (*argv)++;
120 (*argc)--;
121 } else if (!prefixcmp(cmd, "--work-tree=")) {
122 setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
123 if (envchanged)
124 *envchanged = 1;
125 } else {
126 fprintf(stderr, "Unknown option: %s\n", cmd);
127 usage(perf_usage_string);
128 }
129
130 (*argv)++;
131 (*argc)--;
132 handled++;
133 }
134 return handled;
135}
136
137static int handle_alias(int *argcp, const char ***argv)
138{
139 int envchanged = 0, ret = 0, saved_errno = errno;
140 int count, option_count;
141 const char** new_argv;
142 const char *alias_command;
143 char *alias_string;
144
145 alias_command = (*argv)[0];
146 alias_string = alias_lookup(alias_command);
147 if (alias_string) {
148 if (alias_string[0] == '!') {
149 if (*argcp > 1) {
150 struct strbuf buf;
151
152 strbuf_init(&buf, PATH_MAX);
153 strbuf_addstr(&buf, alias_string);
154 sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
155 free(alias_string);
156 alias_string = buf.buf;
157 }
158 ret = system(alias_string + 1);
159 if (ret >= 0 && WIFEXITED(ret) &&
160 WEXITSTATUS(ret) != 127)
161 exit(WEXITSTATUS(ret));
162 die("Failed to run '%s' when expanding alias '%s'",
163 alias_string + 1, alias_command);
164 }
165 count = split_cmdline(alias_string, &new_argv);
166 if (count < 0)
167 die("Bad alias.%s string", alias_command);
168 option_count = handle_options(&new_argv, &count, &envchanged);
169 if (envchanged)
170 die("alias '%s' changes environment variables\n"
171 "You can use '!perf' in the alias to do this.",
172 alias_command);
173 memmove(new_argv - option_count, new_argv,
174 count * sizeof(char *));
175 new_argv -= option_count;
176
177 if (count < 1)
178 die("empty alias for %s", alias_command);
179
180 if (!strcmp(alias_command, new_argv[0]))
181 die("recursive alias: %s", alias_command);
182
183 new_argv = realloc(new_argv, sizeof(char*) *
184 (count + *argcp + 1));
185 /* insert after command name */
186 memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
187 new_argv[count+*argcp] = NULL;
188
189 *argv = new_argv;
190 *argcp += count - 1;
191
192 ret = 1;
193 }
194
195 errno = saved_errno;
196
197 return ret;
198}
199
200const char perf_version_string[] = PERF_VERSION;
201
/* Bits for cmd_struct.option, tested by run_builtin(). */
#define RUN_SETUP	(1 << 0)	/* consult the "pager.<cmd>" configuration */
#define USE_PAGER	(1 << 1)	/* use a pager unless something decided otherwise */
/*
 * The command needs a working tree.  Anything that sets this also
 * needs RUN_SETUP, for reading from the configuration file.
 */
#define NEED_WORK_TREE	(1 << 2)

/* One entry of the built-in command table. */
struct cmd_struct {
	const char *cmd;				/* name typed after "perf" */
	int (*fn)(int, const char **, const char *);	/* (argc, argv, prefix) entry point */
	int option;					/* OR of the flag bits above */
};
215
216static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
217{
218 int status;
219 struct stat st;
220 const char *prefix;
221
222 prefix = NULL;
223 if (p->option & RUN_SETUP)
224 prefix = NULL; /* setup_perf_directory(); */
225
226 if (use_pager == -1 && p->option & RUN_SETUP)
227 use_pager = check_pager_config(p->cmd);
228 if (use_pager == -1 && p->option & USE_PAGER)
229 use_pager = 1;
230 commit_pager_choice();
231
232 if (p->option & NEED_WORK_TREE)
233 /* setup_work_tree() */;
234
235 status = p->fn(argc, argv, prefix);
236 if (status)
237 return status & 0xff;
238
239 /* Somebody closed stdout? */
240 if (fstat(fileno(stdout), &st))
241 return 0;
242 /* Ignore write errors for pipes and sockets.. */
243 if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
244 return 0;
245
246 /* Check for ENOSPC and EIO errors.. */
247 if (fflush(stdout))
248 die("write failure on standard output: %s", strerror(errno));
249 if (ferror(stdout))
250 die("unknown write failure on standard output");
251 if (fclose(stdout))
252 die("close failed on standard output: %s", strerror(errno));
253 return 0;
254}
255
256static void handle_internal_command(int argc, const char **argv)
257{
258 const char *cmd = argv[0];
259 static struct cmd_struct commands[] = {
260 { "help", cmd_help, 0 },
261 { "list", cmd_list, 0 },
262 { "record", cmd_record, 0 },
263 { "report", cmd_report, 0 },
264 { "stat", cmd_stat, 0 },
265 { "top", cmd_top, 0 },
266 { "annotate", cmd_annotate, 0 },
267 { "version", cmd_version, 0 },
268 };
269 int i;
270 static const char ext[] = STRIP_EXTENSION;
271
272 if (sizeof(ext) > 1) {
273 i = strlen(argv[0]) - strlen(ext);
274 if (i > 0 && !strcmp(argv[0] + i, ext)) {
275 char *argv0 = strdup(argv[0]);
276 argv[0] = cmd = argv0;
277 argv0[i] = '\0';
278 }
279 }
280
281 /* Turn "perf cmd --help" into "perf help cmd" */
282 if (argc > 1 && !strcmp(argv[1], "--help")) {
283 argv[1] = argv[0];
284 argv[0] = cmd = "help";
285 }
286
287 for (i = 0; i < ARRAY_SIZE(commands); i++) {
288 struct cmd_struct *p = commands+i;
289 if (strcmp(p->cmd, cmd))
290 continue;
291 exit(run_builtin(p, argc, argv));
292 }
293}
294
295static void execv_dashed_external(const char **argv)
296{
297 struct strbuf cmd = STRBUF_INIT;
298 const char *tmp;
299 int status;
300
301 strbuf_addf(&cmd, "perf-%s", argv[0]);
302
303 /*
304 * argv[0] must be the perf command, but the argv array
305 * belongs to the caller, and may be reused in
306 * subsequent loop iterations. Save argv[0] and
307 * restore it on error.
308 */
309 tmp = argv[0];
310 argv[0] = cmd.buf;
311
312 /*
313 * if we fail because the command is not found, it is
314 * OK to return. Otherwise, we just pass along the status code.
315 */
316 status = run_command_v_opt(argv, 0);
317 if (status != -ERR_RUN_COMMAND_EXEC) {
318 if (IS_RUN_COMMAND_ERR(status))
319 die("unable to run '%s'", argv[0]);
320 exit(-status);
321 }
322 errno = ENOENT; /* as if we called execvp */
323
324 argv[0] = tmp;
325
326 strbuf_release(&cmd);
327}
328
/*
 * Try to run the command in *argv: first as a built-in, then as an
 * external "perf-<cmd>" program, and finally - once only - by
 * expanding it as an alias and trying both again.
 *
 * Returns only when nothing could be run; the result is 1 if an alias
 * expansion took place and 0 otherwise.
 */
static int run_argv(int *argcp, const char ***argv)
{
	int expanded;

	for (expanded = 0; ; expanded = 1) {
		/* See if it's an internal command */
		handle_internal_command(*argcp, *argv);

		/* .. then try the external ones */
		execv_dashed_external(*argv);

		/* an alias is expanded at most once */
		if (expanded)
			break;
		if (!handle_alias(argcp, argv))
			break;
	}

	return expanded;
}
351
352
353int main(int argc, const char **argv)
354{
355 const char *cmd;
356
357 cmd = perf_extract_argv0_path(argv[0]);
358 if (!cmd)
359 cmd = "perf-help";
360
361 /*
362 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
363 *
364 * - cannot take flags in between the "perf" and the "xxxx".
365 * - cannot execute it externally (since it would just do
366 * the same thing over again)
367 *
368 * So we just directly call the internal command handler, and
369 * die if that one cannot handle it.
370 */
371 if (!prefixcmp(cmd, "perf-")) {
372 cmd += 5;
373 argv[0] = cmd;
374 handle_internal_command(argc, argv);
375 die("cannot handle %s internally", cmd);
376 }
377
378 /* Look for flags.. */
379 argv++;
380 argc--;
381 handle_options(&argv, &argc, NULL);
382 commit_pager_choice();
383 if (argc > 0) {
384 if (!prefixcmp(argv[0], "--"))
385 argv[0] += 2;
386 } else {
387 /* The user didn't specify a command; give them help */
388 printf("\n usage: %s\n\n", perf_usage_string);
389 list_common_cmds_help();
390 printf("\n %s\n\n", perf_more_info_string);
391 exit(1);
392 }
393 cmd = argv[0];
394
395 /*
396 * We use PATH to find perf commands, but we prepend some higher
397 * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
398 * environment, and the $(perfexecdir) from the Makefile at build
399 * time.
400 */
401 setup_path();
402
403 while (1) {
404 static int done_help = 0;
405 static int was_alias = 0;
406
407 was_alias = run_argv(&argc, &argv);
408 if (errno != ENOENT)
409 break;
410
411 if (was_alias) {
412 fprintf(stderr, "Expansion of alias '%s' failed; "
413 "'%s' is not a perf-command\n",
414 cmd, argv[0]);
415 exit(1);
416 }
417 if (!done_help) {
418 cmd = argv[0] = help_unknown_cmd(cmd);
419 done_help = 1;
420 } else
421 break;
422 }
423
424 fprintf(stderr, "Failed to run command '%s': %s\n",
425 cmd, strerror(errno));
426
427 return 1;
428}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
new file mode 100644
index 000000000000..af0a5046d743
--- /dev/null
+++ b/tools/perf/perf.h
@@ -0,0 +1,67 @@
1#ifndef _PERF_PERF_H
2#define _PERF_PERF_H
3
4#if defined(__x86_64__) || defined(__i386__)
5#include "../../arch/x86/include/asm/unistd.h"
6#define rmb() asm volatile("lfence" ::: "memory")
7#define cpu_relax() asm volatile("rep; nop" ::: "memory");
8#endif
9
10#ifdef __powerpc__
11#include "../../arch/powerpc/include/asm/unistd.h"
12#define rmb() asm volatile ("sync" ::: "memory")
13#define cpu_relax() asm volatile ("" ::: "memory");
14#endif
15
16#include <time.h>
17#include <unistd.h>
18#include <sys/types.h>
19#include <sys/syscall.h>
20
21#include "../../include/linux/perf_counter.h"
22
23/*
24 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
25 * counters in the current task.
26 */
27#define PR_TASK_PERF_COUNTERS_DISABLE 31
28#define PR_TASK_PERF_COUNTERS_ENABLE 32
29
30#ifndef NSEC_PER_SEC
31# define NSEC_PER_SEC 1000000000ULL
32#endif
33
34static inline unsigned long long rdclock(void)
35{
36 struct timespec ts;
37
38 clock_gettime(CLOCK_MONOTONIC, &ts);
39 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
40}
41
42/*
43 * Pick up some kernel type conventions:
44 */
45#define __user
46#define asmlinkage
47
48#define unlikely(x) __builtin_expect(!!(x), 0)
/*
 * Type-checked minimum of two values; each argument is evaluated
 * exactly once.  The pointer comparison exists only to make the
 * compiler warn when x and y have different types.
 *
 * __typeof__ is used instead of typeof so the macro also compiles in
 * strict ISO modes (-std=c99 / -std=c11), where the plain typeof
 * keyword is not available.
 */
#define min(x, y) ({				\
	__typeof__(x) _min1 = (x);		\
	__typeof__(y) _min2 = (y);		\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
54
55static inline int
56sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
57 pid_t pid, int cpu, int group_fd,
58 unsigned long flags)
59{
60 return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
61 group_fd, flags);
62}
63
64#define MAX_COUNTERS 256
65#define MAX_NR_CPUS 256
66
67#endif
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
new file mode 100755
index 000000000000..c561d1538c03
--- /dev/null
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
#!/bin/sh
#
# Work out the perf version string and store it in $GVF as
# "PERF_VERSION = <version>".  The file is only rewritten when the
# version differs from what it already records.

# Output file, and the version used when nothing better is available.
GVF=PERF-VERSION-FILE
DEF_VER=v0.0.1.PERF

# A literal newline, used below to reject multi-line describe output.
LF='
'

# First see if there is a version file (included in release tarballs),
# then try git-describe, then default.
#
# In the git case, a describe result containing a newline is rejected,
# and a result of the form v<digit>... gets "-dirty" appended when
# git diff-index lists changed files.  Dashes are then turned into dots.
if test -f version
then
	VN=$(cat version) || VN="$DEF_VER"
elif test -d .git -o -f .git &&
	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
	case "$VN" in
	*$LF*) (exit 1) ;;
	v[0-9]*)
		git update-index -q --refresh
		test -z "$(git diff-index --name-only HEAD --)" ||
		VN="$VN-dirty" ;;
	esac
then
	VN=$(echo "$VN" | sed -e 's/-/./g');
else
	VN="$DEF_VER"
fi

# Drop the leading "v", if any.
VN=$(expr "$VN" : v*'\(.*\)')

# VC is the version currently recorded in $GVF, or "unset" when the
# file cannot be read.
if test -r $GVF
then
	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
else
	VC=unset
fi
# Rewrite the file (and announce the version on stderr) only on change.
test "$VN" = "$VC" || {
	echo >&2 "PERF_VERSION = $VN"
	echo "PERF_VERSION = $VN" >$GVF
}
41
42
diff --git a/tools/perf/util/abspath.c b/tools/perf/util/abspath.c
new file mode 100644
index 000000000000..61d33b81fc97
--- /dev/null
+++ b/tools/perf/util/abspath.c
@@ -0,0 +1,117 @@
1#include "cache.h"
2
3/*
4 * Do not use this for inspecting *tracked* content. When path is a
5 * symlink to a directory, we do not want to say it is a directory when
6 * dealing with tracked content in the working tree.
7 */
8static int is_directory(const char *path)
9{
10 struct stat st;
11 return (!stat(path, &st) && S_ISDIR(st.st_mode));
12}
13
14/* We allow "recursive" symbolic links. Only within reason, though. */
15#define MAXDEPTH 5
16
17const char *make_absolute_path(const char *path)
18{
19 static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
20 char cwd[1024] = "";
21 int buf_index = 1, len;
22
23 int depth = MAXDEPTH;
24 char *last_elem = NULL;
25 struct stat st;
26
27 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
28 die ("Too long path: %.*s", 60, path);
29
30 while (depth--) {
31 if (!is_directory(buf)) {
32 char *last_slash = strrchr(buf, '/');
33 if (last_slash) {
34 *last_slash = '\0';
35 last_elem = xstrdup(last_slash + 1);
36 } else {
37 last_elem = xstrdup(buf);
38 *buf = '\0';
39 }
40 }
41
42 if (*buf) {
43 if (!*cwd && !getcwd(cwd, sizeof(cwd)))
44 die ("Could not get current working directory");
45
46 if (chdir(buf))
47 die ("Could not switch to '%s'", buf);
48 }
49 if (!getcwd(buf, PATH_MAX))
50 die ("Could not get current working directory");
51
52 if (last_elem) {
53 int len = strlen(buf);
54 if (len + strlen(last_elem) + 2 > PATH_MAX)
55 die ("Too long path name: '%s/%s'",
56 buf, last_elem);
57 buf[len] = '/';
58 strcpy(buf + len + 1, last_elem);
59 free(last_elem);
60 last_elem = NULL;
61 }
62
63 if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
64 len = readlink(buf, next_buf, PATH_MAX);
65 if (len < 0)
66 die ("Invalid symlink: %s", buf);
67 if (PATH_MAX <= len)
68 die("symbolic link too long: %s", buf);
69 next_buf[len] = '\0';
70 buf = next_buf;
71 buf_index = 1 - buf_index;
72 next_buf = bufs[buf_index];
73 } else
74 break;
75 }
76
77 if (*cwd && chdir(cwd))
78 die ("Could not change back to '%s'", cwd);
79
80 return buf;
81}
82
83static const char *get_pwd_cwd(void)
84{
85 static char cwd[PATH_MAX + 1];
86 char *pwd;
87 struct stat cwd_stat, pwd_stat;
88 if (getcwd(cwd, PATH_MAX) == NULL)
89 return NULL;
90 pwd = getenv("PWD");
91 if (pwd && strcmp(pwd, cwd)) {
92 stat(cwd, &cwd_stat);
93 if (!stat(pwd, &pwd_stat) &&
94 pwd_stat.st_dev == cwd_stat.st_dev &&
95 pwd_stat.st_ino == cwd_stat.st_ino) {
96 strlcpy(cwd, pwd, PATH_MAX);
97 }
98 }
99 return cwd;
100}
101
102const char *make_nonrelative_path(const char *path)
103{
104 static char buf[PATH_MAX + 1];
105
106 if (is_absolute_path(path)) {
107 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
108 die("Too long path: %.*s", 60, path);
109 } else {
110 const char *cwd = get_pwd_cwd();
111 if (!cwd)
112 die("Cannot determine the current working directory");
113 if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
114 die("Too long path: %.*s", 60, path);
115 }
116 return buf;
117}
diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c
new file mode 100644
index 000000000000..9b3dd2b428df
--- /dev/null
+++ b/tools/perf/util/alias.c
@@ -0,0 +1,77 @@
1#include "cache.h"
2
3static const char *alias_key;
4static char *alias_val;
5
6static int alias_lookup_cb(const char *k, const char *v, void *cb)
7{
8 if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
9 if (!v)
10 return config_error_nonbool(k);
11 alias_val = strdup(v);
12 return 0;
13 }
14 return 0;
15}
16
17char *alias_lookup(const char *alias)
18{
19 alias_key = alias;
20 alias_val = NULL;
21 perf_config(alias_lookup_cb, NULL);
22 return alias_val;
23}
24
/*
 * Split `cmdline` in place into whitespace-separated words and store
 * pointers to them in a freshly allocated array returned through *argv.
 *
 * Single and double quotes group characters into one word and are
 * removed; a backslash (outside single quotes) makes the next character
 * literal.  The words are written back into `cmdline` itself, so the
 * returned pointers stay valid only as long as `cmdline` does.
 *
 * Returns the number of words on success.  On failure (trailing
 * backslash, unclosed quote, or allocation failure) *argv is set to NULL
 * and the result of error() is returned.
 */
int split_cmdline(char *cmdline, const char ***argv)
{
	int src, dst, count = 0, size = 16;
	char quoted = 0;

	*argv = malloc(sizeof(char *) * size);
	if (!*argv)
		return error("out of memory");

	/* split alias_string */
	(*argv)[count++] = cmdline;
	for (src = dst = 0; cmdline[src];) {
		char c = cmdline[src];

		if (!quoted && isspace((unsigned char)c)) {
			cmdline[dst++] = 0;
			while (cmdline[++src]
					&& isspace((unsigned char)cmdline[src]))
				; /* skip */
			if (count >= size) {
				const char **grown;

				size += 16;
				/* keep the old array reachable if realloc fails */
				grown = realloc(*argv, sizeof(char *) * size);
				if (!grown) {
					free(*argv);
					*argv = NULL;
					return error("out of memory");
				}
				*argv = grown;
			}
			(*argv)[count++] = cmdline + dst;
		} else if (!quoted && (c == '\'' || c == '"')) {
			quoted = c;
			src++;
		} else if (c == quoted) {
			quoted = 0;
			src++;
		} else {
			if (c == '\\' && quoted != '\'') {
				src++;
				c = cmdline[src];
				if (!c) {
					free(*argv);
					*argv = NULL;
					return error("cmdline ends with \\");
				}
			}
			cmdline[dst++] = c;
			src++;
		}
	}

	cmdline[dst] = 0;

	if (quoted) {
		free(*argv);
		*argv = NULL;
		return error("unclosed quote");
	}

	return count;
}
77
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
new file mode 100644
index 000000000000..393d6146d13b
--- /dev/null
+++ b/tools/perf/util/cache.h
@@ -0,0 +1,119 @@
1#ifndef CACHE_H
2#define CACHE_H
3
4#include "util.h"
5#include "strbuf.h"
6
7#define PERF_DIR_ENVIRONMENT "PERF_DIR"
8#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
9#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
10#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
11#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
12#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
13#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
14#define CONFIG_ENVIRONMENT "PERF_CONFIG"
15#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
16#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
17#define PERFATTRIBUTES_FILE ".perfattributes"
18#define INFOATTRIBUTES_FILE "info/attributes"
19#define ATTRIBUTE_MACRO_PREFIX "[attr]"
20
21typedef int (*config_fn_t)(const char *, const char *, void *);
22extern int perf_default_config(const char *, const char *, void *);
23extern int perf_config_from_file(config_fn_t fn, const char *, void *);
24extern int perf_config(config_fn_t fn, void *);
25extern int perf_parse_ulong(const char *, unsigned long *);
26extern int perf_config_int(const char *, const char *);
27extern unsigned long perf_config_ulong(const char *, const char *);
28extern int perf_config_bool_or_int(const char *, const char *, int *);
29extern int perf_config_bool(const char *, const char *);
30extern int perf_config_string(const char **, const char *, const char *);
31extern int perf_config_set(const char *, const char *);
32extern int perf_config_set_multivar(const char *, const char *, const char *, int);
33extern int perf_config_rename_section(const char *, const char *);
34extern const char *perf_etc_perfconfig(void);
35extern int check_repository_format_version(const char *var, const char *value, void *cb);
36extern int perf_config_system(void);
37extern int perf_config_global(void);
38extern int config_error_nonbool(const char *);
39extern const char *config_exclusive_filename;
40
41#define MAX_PERFNAME (1000)
42extern char perf_default_email[MAX_PERFNAME];
43extern char perf_default_name[MAX_PERFNAME];
44extern int user_ident_explicitly_given;
45
46extern const char *perf_log_output_encoding;
47extern const char *perf_mailmap_file;
48
49/* IO helper functions */
50extern void maybe_flush_or_die(FILE *, const char *);
51extern int copy_fd(int ifd, int ofd);
52extern int copy_file(const char *dst, const char *src, int mode);
53extern ssize_t read_in_full(int fd, void *buf, size_t count);
54extern ssize_t write_in_full(int fd, const void *buf, size_t count);
55extern void write_or_die(int fd, const void *buf, size_t count);
56extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
57extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
58extern void fsync_or_die(int fd, const char *);
59
60/* pager.c */
61extern void setup_pager(void);
62extern const char *pager_program;
63extern int pager_in_use(void);
64extern int pager_use_color;
65
66extern const char *editor_program;
67extern const char *excludes_file;
68
69char *alias_lookup(const char *alias);
70int split_cmdline(char *cmdline, const char ***argv);
71
/*
 * alloc_nr(x): next capacity to use for a buffer that currently has room
 * for x entries, i.e. (x + 16) * 3 / 2.
 */
72#define alloc_nr(x) (((x)+16)*3/2)
73
74/*
75 * Realloc the buffer pointed at by variable 'x' so that it can hold
76 * at least 'nr' entries; the number of entries currently allocated
77 * is 'alloc', using the standard growing factor alloc_nr() macro.
78 *
79 * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
 *
 * Both 'x' and 'alloc' are assigned to, so they must be lvalues.  The
 * new capacity is the larger of 'nr' and alloc_nr(alloc); the buffer is
 * resized with xrealloc().
80 */
81#define ALLOC_GROW(x, nr, alloc) \
82 do { \
83 if ((nr) > alloc) { \
84 if (alloc_nr(alloc) < (nr)) \
85 alloc = (nr); \
86 else \
87 alloc = alloc_nr(alloc); \
88 x = xrealloc((x), alloc * sizeof(*(x))); \
89 } \
90 } while(0)
91
92
/* Return 1 if `path` begins with '/', 0 otherwise (including for ""). */
static inline int is_absolute_path(const char *path)
{
	return *path == '/';
}
97
98const char *make_absolute_path(const char *path);
99const char *make_nonrelative_path(const char *path);
100const char *make_relative_path(const char *abs, const char *base);
101int normalize_path_copy(char *dst, const char *src);
102int longest_ancestor_length(const char *path, const char *prefix_list);
103char *strip_path_suffix(const char *path, const char *suffix);
104
105extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
106extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
107/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
108extern int perf_mkstemp(char *path, size_t len, const char *template);
109
110extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
111 __attribute__((format (printf, 3, 4)));
112extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
113 __attribute__((format (printf, 3, 4)));
114extern char *perf_pathdup(const char *fmt, ...)
115 __attribute__((format (printf, 1, 2)));
116
117extern size_t strlcpy(char *dest, const char *src, size_t size);
118
119#endif /* CACHE_H */
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
new file mode 100644
index 000000000000..9a8c20ccc53e
--- /dev/null
+++ b/tools/perf/util/color.c
@@ -0,0 +1,241 @@
1#include "cache.h"
2#include "color.h"
3
4int perf_use_color_default = -1;
5
/*
 * Parse the first `len` bytes of `name` as a color.
 *
 * Accepts one of the names below (case-insensitively) or a decimal
 * number in the range -1..255.  Returns -1 for "normal", 0..7 for the
 * named colors, the number itself for numeric input, and -2 when the
 * word is not a color (including an empty word).
 */
static int parse_color(const char *name, int len)
{
	static const char * const color_names[] = {
		"normal", "black", "red", "green", "yellow",
		"blue", "magenta", "cyan", "white"
	};
	char *end;
	long num;
	int i;

	/* an empty word must not be mistaken for the number 0 */
	if (len <= 0)
		return -2;

	for (i = 0; i < ARRAY_SIZE(color_names); i++) {
		const char *str = color_names[i];

		/* compare lengths first so str[len] is never out of bounds */
		if (strlen(str) == (size_t)len && !strncasecmp(name, str, len))
			return i - 1;
	}

	/*
	 * Range-check the full long value; narrowing to int first would
	 * let huge numbers wrap into the accepted range.
	 */
	num = strtol(name, &end, 10);
	if (end - name == len && num >= -1 && num <= 255)
		return num;
	return -2;
}
24
/*
 * Parse the first `len` bytes of `name` as a text attribute name
 * (case-insensitive).  Returns the attribute's numeric code, or -1 if
 * the word is not one of the known attributes.
 */
static int parse_attr(const char *name, int len)
{
	static const struct {
		const char *name;
		int value;
	} attrs[] = {
		{ "bold", 1 },
		{ "dim", 2 },
		{ "ul", 4 },
		{ "blink", 5 },
		{ "reverse", 7 },
	};
	int idx;

	for (idx = 0; idx < ARRAY_SIZE(attrs); idx++) {
		const char *candidate = attrs[idx].name;

		if (strncasecmp(name, candidate, len) == 0 &&
		    candidate[len] == '\0')
			return attrs[idx].value;
	}
	return -1;
}
39
/*
 * Convenience wrapper around color_parse_mem() for a NUL-terminated
 * `value`; see that function for the meaning of `var` and `dst`.
 */
void color_parse(const char *value, const char *var, char *dst)
{
	int value_len = strlen(value);

	color_parse_mem(value, value_len, var, dst);
}
44
45void color_parse_mem(const char *value, int value_len, const char *var,
46 char *dst)
47{
48 const char *ptr = value;
49 int len = value_len;
50 int attr = -1;
51 int fg = -2;
52 int bg = -2;
53
54 if (!strncasecmp(value, "reset", len)) {
55 strcpy(dst, PERF_COLOR_RESET);
56 return;
57 }
58
59 /* [fg [bg]] [attr] */
60 while (len > 0) {
61 const char *word = ptr;
62 int val, wordlen = 0;
63
64 while (len > 0 && !isspace(word[wordlen])) {
65 wordlen++;
66 len--;
67 }
68
69 ptr = word + wordlen;
70 while (len > 0 && isspace(*ptr)) {
71 ptr++;
72 len--;
73 }
74
75 val = parse_color(word, wordlen);
76 if (val >= -1) {
77 if (fg == -2) {
78 fg = val;
79 continue;
80 }
81 if (bg == -2) {
82 bg = val;
83 continue;
84 }
85 goto bad;
86 }
87 val = parse_attr(word, wordlen);
88 if (val < 0 || attr != -1)
89 goto bad;
90 attr = val;
91 }
92
93 if (attr >= 0 || fg >= 0 || bg >= 0) {
94 int sep = 0;
95
96 *dst++ = '\033';
97 *dst++ = '[';
98 if (attr >= 0) {
99 *dst++ = '0' + attr;
100 sep++;
101 }
102 if (fg >= 0) {
103 if (sep++)
104 *dst++ = ';';
105 if (fg < 8) {
106 *dst++ = '3';
107 *dst++ = '0' + fg;
108 } else {
109 dst += sprintf(dst, "38;5;%d", fg);
110 }
111 }
112 if (bg >= 0) {
113 if (sep++)
114 *dst++ = ';';
115 if (bg < 8) {
116 *dst++ = '4';
117 *dst++ = '0' + bg;
118 } else {
119 dst += sprintf(dst, "48;5;%d", bg);
120 }
121 }
122 *dst++ = 'm';
123 }
124 *dst = 0;
125 return;
126bad:
127 die("bad color value '%.*s' for variable '%s'", value_len, value, var);
128}
129
130int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
131{
132 if (value) {
133 if (!strcasecmp(value, "never"))
134 return 0;
135 if (!strcasecmp(value, "always"))
136 return 1;
137 if (!strcasecmp(value, "auto"))
138 goto auto_color;
139 }
140
141 /* Missing or explicit false to turn off colorization */
142 if (!perf_config_bool(var, value))
143 return 0;
144
145 /* any normal truth value defaults to 'auto' */
146 auto_color:
147 if (stdout_is_tty < 0)
148 stdout_is_tty = isatty(1);
149 if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
150 char *term = getenv("TERM");
151 if (term && strcmp(term, "dumb"))
152 return 1;
153 }
154 return 0;
155}
156
157int perf_color_default_config(const char *var, const char *value, void *cb)
158{
159 if (!strcmp(var, "color.ui")) {
160 perf_use_color_default = perf_config_colorbool(var, value, -1);
161 return 0;
162 }
163
164 return perf_default_config(var, value, cb);
165}
166
167static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
168 va_list args, const char *trail)
169{
170 int r = 0;
171
172 /*
173 * Auto-detect:
174 */
175 if (perf_use_color_default < 0) {
176 if (isatty(1) || pager_in_use())
177 perf_use_color_default = 1;
178 else
179 perf_use_color_default = 0;
180 }
181
182 if (perf_use_color_default && *color)
183 r += fprintf(fp, "%s", color);
184 r += vfprintf(fp, fmt, args);
185 if (perf_use_color_default && *color)
186 r += fprintf(fp, "%s", PERF_COLOR_RESET);
187 if (trail)
188 r += fprintf(fp, "%s", trail);
189 return r;
190}
191
192
193
/*
 * printf-style output to `fp` in the given color (no trailing text).
 * Returns whatever color_vfprintf() returns.
 */
int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
{
	int written;
	va_list ap;

	va_start(ap, fmt);
	written = color_vfprintf(fp, color, fmt, ap, NULL);
	va_end(ap);

	return written;
}
204
/*
 * Like color_fprintf(), but a newline is written after the color has
 * been reset.  Returns whatever color_vfprintf() returns.
 */
int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
{
	int written;
	va_list ap;

	va_start(ap, fmt);
	written = color_vfprintf(fp, color, fmt, ap, "\n");
	va_end(ap);

	return written;
}
214
215/*
216 * This function splits the buffer by newlines and colors the lines individually.
217 *
218 * Returns 0 on success.
219 */
220int color_fwrite_lines(FILE *fp, const char *color,
221 size_t count, const char *buf)
222{
223 if (!*color)
224 return fwrite(buf, count, 1, fp) != 1;
225 while (count) {
226 char *p = memchr(buf, '\n', count);
227 if (p != buf && (fputs(color, fp) < 0 ||
228 fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
229 fputs(PERF_COLOR_RESET, fp) < 0))
230 return -1;
231 if (!p)
232 return 0;
233 if (fputc('\n', fp) < 0)
234 return -1;
235 count -= p + 1 - buf;
236 buf = p + 1;
237 }
238 return 0;
239}
240
241
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
new file mode 100644
index 000000000000..5abfd379582b
--- /dev/null
+++ b/tools/perf/util/color.h
@@ -0,0 +1,36 @@
1#ifndef COLOR_H
2#define COLOR_H
3
4/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
5#define COLOR_MAXLEN 24
6
7#define PERF_COLOR_NORMAL ""
8#define PERF_COLOR_RESET "\033[m"
9#define PERF_COLOR_BOLD "\033[1m"
10#define PERF_COLOR_RED "\033[31m"
11#define PERF_COLOR_GREEN "\033[32m"
12#define PERF_COLOR_YELLOW "\033[33m"
13#define PERF_COLOR_BLUE "\033[34m"
14#define PERF_COLOR_MAGENTA "\033[35m"
15#define PERF_COLOR_CYAN "\033[36m"
16#define PERF_COLOR_BG_RED "\033[41m"
17
18/*
19 * This variable stores the value of color.ui
20 */
21extern int perf_use_color_default;
22
23
24/*
25 * Use this instead of perf_default_config if you need the value of color.ui.
26 */
27int perf_color_default_config(const char *var, const char *value, void *cb);
28
29int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
30void color_parse(const char *value, const char *var, char *dst);
31void color_parse_mem(const char *value, int len, const char *var, char *dst);
32int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
33int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
34int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
35
36#endif /* COLOR_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
new file mode 100644
index 000000000000..3dd13faa6a27
--- /dev/null
+++ b/tools/perf/util/config.c
@@ -0,0 +1,873 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 * Copyright (C) Johannes Schindelin, 2005
6 *
7 */
8#include "util.h"
9#include "cache.h"
10#include "exec_cmd.h"
11
12#define MAXNAME (256)
13
14static FILE *config_file;
15static const char *config_file_name;
16static int config_linenr;
17static int config_file_eof;
18
19const char *config_exclusive_filename = NULL;
20
21static int get_next_char(void)
22{
23 int c;
24 FILE *f;
25
26 c = '\n';
27 if ((f = config_file) != NULL) {
28 c = fgetc(f);
29 if (c == '\r') {
30 /* DOS like systems */
31 c = fgetc(f);
32 if (c != '\n') {
33 ungetc(c, f);
34 c = '\r';
35 }
36 }
37 if (c == '\n')
38 config_linenr++;
39 if (c == EOF) {
40 config_file_eof = 1;
41 c = '\n';
42 }
43 }
44 return c;
45}
46
/*
 * Read the value part of a "key = value" line, one character at a time
 * through get_next_char(), up to the terminating newline.
 *
 * Returns a pointer to a static buffer holding the NUL-terminated value
 * (overwritten by the next call), or NULL when the value does not fit
 * in the buffer, a double quote is still open at the end of the line,
 * or an unknown backslash escape is found.
 */
47static char *parse_value(void)
48{
49 static char value[1024];
50 int quote = 0, comment = 0, len = 0, space = 0;
51
52 for (;;) {
53 int c = get_next_char();
	/* buffer (nearly) full: give up, even if this char ends the line */
54 if (len >= sizeof(value) - 1)
55 return NULL;
56 if (c == '\n') {
57 if (quote)
58 return NULL;
59 value[len] = 0;
60 return value;
61 }
	/* inside a ';' or '#' comment: discard everything up to the newline */
62 if (comment)
63 continue;
	/*
	 * Unquoted whitespace is only remembered here; it is emitted as a
	 * single space once another character follows (see below), so
	 * runs collapse and trailing whitespace is dropped.
	 */
64 if (isspace(c) && !quote) {
65 space = 1;
66 continue;
67 }
68 if (!quote) {
69 if (c == ';' || c == '#') {
70 comment = 1;
71 continue;
72 }
73 }
	/* flush pending whitespace, but never at the start of the value */
74 if (space) {
75 if (len)
76 value[len++] = ' ';
77 space = 0;
78 }
79 if (c == '\\') {
80 c = get_next_char();
81 switch (c) {
	/* backslash-newline: the value continues on the next line */
82 case '\n':
83 continue;
84 case 't':
85 c = '\t';
86 break;
87 case 'b':
88 c = '\b';
89 break;
90 case 'n':
91 c = '\n';
92 break;
93 /* Some characters escape as themselves */
94 case '\\': case '"':
95 break;
96 /* Reject unknown escape sequences */
97 default:
98 return NULL;
99 }
100 value[len++] = c;
101 continue;
102 }
	/* a double quote toggles quoting and is not stored itself */
103 if (c == '"') {
104 quote = 1-quote;
105 continue;
106 }
107 value[len++] = c;
108 }
109}
110
/* Return 1 if `c` may appear in a config key (alphanumeric or '-'), else 0. */
static inline int iskeychar(int c)
{
	return c == '-' || isalnum(c);
}
115
116static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
117{
118 int c;
119 char *value;
120
121 /* Get the full name */
122 for (;;) {
123 c = get_next_char();
124 if (config_file_eof)
125 break;
126 if (!iskeychar(c))
127 break;
128 name[len++] = tolower(c);
129 if (len >= MAXNAME)
130 return -1;
131 }
132 name[len] = 0;
133 while (c == ' ' || c == '\t')
134 c = get_next_char();
135
136 value = NULL;
137 if (c != '\n') {
138 if (c != '=')
139 return -1;
140 value = parse_value();
141 if (!value)
142 return -1;
143 }
144 return fn(name, value, data);
145}
146
147static int get_extended_base_var(char *name, int baselen, int c)
148{
149 do {
150 if (c == '\n')
151 return -1;
152 c = get_next_char();
153 } while (isspace(c));
154
155 /* We require the format to be '[base "extension"]' */
156 if (c != '"')
157 return -1;
158 name[baselen++] = '.';
159
160 for (;;) {
161 int c = get_next_char();
162 if (c == '\n')
163 return -1;
164 if (c == '"')
165 break;
166 if (c == '\\') {
167 c = get_next_char();
168 if (c == '\n')
169 return -1;
170 }
171 name[baselen++] = c;
172 if (baselen > MAXNAME / 2)
173 return -1;
174 }
175
176 /* Final ']' */
177 if (get_next_char() != ']')
178 return -1;
179 return baselen;
180}
181
182static int get_base_var(char *name)
183{
184 int baselen = 0;
185
186 for (;;) {
187 int c = get_next_char();
188 if (config_file_eof)
189 return -1;
190 if (c == ']')
191 return baselen;
192 if (isspace(c))
193 return get_extended_base_var(name, baselen, c);
194 if (!iskeychar(c) && c != '.')
195 return -1;
196 if (baselen > MAXNAME / 2)
197 return -1;
198 name[baselen++] = tolower(c);
199 }
200}
201
/*
 * Parse the currently open config file (the file-scope config_file) and
 * call `fn` with `data` for every variable found.
 *
 * Returns 0 when end of file is reached.  Any syntax error, or a
 * negative return from get_value() (which includes a negative return
 * from `fn`), leaves the loop and ends in die() with the current line
 * number and file name.
 */
202static int perf_parse_file(config_fn_t fn, void *data)
203{
204 int comment = 0;
205 int baselen = 0;
	/* "section." prefix followed by the variable name being read */
206 static char var[MAXNAME];
207
208 /* U+FEFF Byte Order Mark in UTF8 */
209 static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
	/* next BOM byte expected; set to NULL once the file has no BOM */
210 const unsigned char *bomptr = utf8_bom;
211
212 for (;;) {
213 int c = get_next_char();
214 if (bomptr && *bomptr) {
215 /* We are at the file beginning; skip UTF8-encoded BOM
216 * if present. Sane editors won't put this in on their
217 * own, but e.g. Windows Notepad will do it happily. */
218 if ((unsigned char) c == *bomptr) {
219 bomptr++;
220 continue;
221 } else {
222 /* Do not tolerate partial BOM. */
223 if (bomptr != utf8_bom)
224 break;
225 /* No BOM at file beginning. Cool. */
226 bomptr = NULL;
227 }
228 }
229 if (c == '\n') {
	/* get_next_char() reports end of file as '\n' plus this flag */
230 if (config_file_eof)
231 return 0;
232 comment = 0;
233 continue;
234 }
235 if (comment || isspace(c))
236 continue;
237 if (c == '#' || c == ';') {
238 comment = 1;
239 continue;
240 }
241 if (c == '[') {
	/* new section: var becomes "<section>." for the keys that follow */
242 baselen = get_base_var(var);
243 if (baselen <= 0)
244 break;
245 var[baselen++] = '.';
246 var[baselen] = 0;
247 continue;
248 }
	/* a variable name has to start with a letter */
249 if (!isalpha(c))
250 break;
251 var[baselen] = tolower(c);
252 if (get_value(fn, data, var, baselen+1) < 0)
253 break;
254 }
255 die("bad config file line %d in %s", config_linenr, config_file_name);
256}
257
/*
 * Apply an optional size suffix to *val.
 *
 * `end` is the text following a parsed number: "" leaves *val alone,
 * "k", "m" and "g" (any case) multiply it by 1024, 1024^2 and 1024^3.
 * Returns 1 on success, or 0 (with *val untouched) for any other
 * suffix.
 */
static int parse_unit_factor(const char *end, unsigned long *val)
{
	unsigned long scale;

	if (*end == '\0')
		return 1;

	if (!strcasecmp(end, "k"))
		scale = 1024;
	else if (!strcasecmp(end, "m"))
		scale = 1024 * 1024;
	else if (!strcasecmp(end, "g"))
		scale = 1024 * 1024 * 1024;
	else
		return 0;

	*val *= scale;
	return 1;
}
276
/*
 * Parse `value` as a signed number (base auto-detected by strtol) with
 * an optional k/m/g suffix and store the scaled result in *ret.
 * Returns 1 on success, 0 if `value` is NULL, empty or has a bad suffix.
 */
static int perf_parse_long(const char *value, long *ret)
{
	unsigned long factor = 1;
	char *end;
	long val;

	if (!value || !*value)
		return 0;

	val = strtol(value, &end, 0);
	if (!parse_unit_factor(end, &factor))
		return 0;

	*ret = val * factor;
	return 1;
}
290
/*
 * Parse `value` as an unsigned number (base auto-detected by strtoul)
 * with an optional k/m/g suffix and store the scaled result in *ret.
 * Returns 1 on success, 0 if `value` is NULL, empty or has a bad suffix.
 */
int perf_parse_ulong(const char *value, unsigned long *ret)
{
	unsigned long val;
	char *end;

	if (!value || !*value)
		return 0;

	val = strtoul(value, &end, 0);
	if (!parse_unit_factor(end, &val))
		return 0;

	*ret = val;
	return 1;
}
303
304static void die_bad_config(const char *name)
305{
306 if (config_file_name)
307 die("bad config value for '%s' in %s", name, config_file_name);
308 die("bad config value for '%s'", name);
309}
310
/*
 * Parse the config value of `name` as an integer with an optional k/m/g
 * suffix; calls die_bad_config() when it cannot be parsed.
 */
int perf_config_int(const char *name, const char *value)
{
	long parsed = 0;

	if (perf_parse_long(value, &parsed))
		return parsed;

	die_bad_config(name);
	return parsed;
}
318
/*
 * Parse the config value of `name` as an unsigned long with an optional
 * k/m/g suffix; calls die_bad_config() when it cannot be parsed.
 */
unsigned long perf_config_ulong(const char *name, const char *value)
{
	unsigned long parsed;
	int ok = perf_parse_ulong(value, &parsed);

	if (!ok)
		die_bad_config(name);
	return parsed;
}
326
/*
 * Interpret a config value as either a boolean or an integer.
 *
 * A missing value (NULL) is true, an empty string is false, and
 * "true"/"yes"/"on" and "false"/"no"/"off" (any case) map to 1 and 0;
 * in all of these cases *is_bool is set to 1.  Anything else is parsed
 * with perf_config_int() and *is_bool is set to 0.
 */
int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
{
	static const char * const truthy[] = { "true", "yes", "on" };
	static const char * const falsy[] = { "false", "no", "off" };
	size_t n;

	*is_bool = 1;
	if (!value)
		return 1;
	if (!*value)
		return 0;

	for (n = 0; n < sizeof(truthy) / sizeof(truthy[0]); n++)
		if (!strcasecmp(value, truthy[n]))
			return 1;
	for (n = 0; n < sizeof(falsy) / sizeof(falsy[0]); n++)
		if (!strcasecmp(value, falsy[n]))
			return 0;

	*is_bool = 0;
	return perf_config_int(name, value);
}
341
/*
 * Interpret a config value as a boolean; integers count as true when
 * non-zero.  Returns 0 or 1.
 */
int perf_config_bool(const char *name, const char *value)
{
	int is_bool_unused;
	int raw = perf_config_bool_or_int(name, value, &is_bool_unused);

	return raw != 0;
}
347
/*
 * Store a strdup()ed copy of `value` in *dest and return 0.  A variable
 * given without a value is reported through config_error_nonbool().
 */
int perf_config_string(const char **dest, const char *var, const char *value)
{
	if (value == NULL)
		return config_error_nonbool(var);

	*dest = strdup(value);
	return 0;
}
355
/*
 * Handler for "core.*" variables.  No such variable is recognized yet,
 * so every key is accepted and ignored.
 */
static int perf_default_core_config(const char *var, const char *value)
{
	/* Add other config variables here and to Documentation/config.txt. */
	(void)var;
	(void)value;
	return 0;
}
361
/*
 * Default config callback: "core.*" variables go to
 * perf_default_core_config(), everything else is accepted and ignored.
 */
int perf_default_config(const char *var, const char *value, void *dummy)
{
	if (prefixcmp(var, "core.") == 0)
		return perf_default_core_config(var, value);

	/* Add other config variables here and to Documentation/config.txt. */
	return 0;
}
370
371int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
372{
373 int ret;
374 FILE *f = fopen(filename, "r");
375
376 ret = -1;
377 if (f) {
378 config_file = f;
379 config_file_name = filename;
380 config_linenr = 1;
381 config_file_eof = 0;
382 ret = perf_parse_file(fn, data);
383 fclose(f);
384 config_file_name = NULL;
385 }
386 return ret;
387}
388
389const char *perf_etc_perfconfig(void)
390{
391 static const char *system_wide;
392 if (!system_wide)
393 system_wide = system_path(ETC_PERFCONFIG);
394 return system_wide;
395}
396
/*
 * Read environment variable `k` as a boolean using perf_config_bool();
 * returns `def` when the variable is not set.
 */
static int perf_env_bool(const char *k, int def)
{
	const char *v = getenv(k);

	if (!v)
		return def;
	return perf_config_bool(k, v);
}
402
/*
 * Return non-zero unless $PERF_CONFIG_NOSYSTEM is set to a true value.
 */
int perf_config_system(void)
{
	int disabled = perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);

	return !disabled;
}
407
/*
 * Return non-zero unless $PERF_CONFIG_NOGLOBAL is set to a true value.
 */
int perf_config_global(void)
{
	int disabled = perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);

	return !disabled;
}
412
413int perf_config(config_fn_t fn, void *data)
414{
415 int ret = 0, found = 0;
416 char *repo_config = NULL;
417 const char *home = NULL;
418
419 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
420 if (config_exclusive_filename)
421 return perf_config_from_file(fn, config_exclusive_filename, data);
422 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
423 ret += perf_config_from_file(fn, perf_etc_perfconfig(),
424 data);
425 found += 1;
426 }
427
428 home = getenv("HOME");
429 if (perf_config_global() && home) {
430 char *user_config = strdup(mkpath("%s/.perfconfig", home));
431 if (!access(user_config, R_OK)) {
432 ret += perf_config_from_file(fn, user_config, data);
433 found += 1;
434 }
435 free(user_config);
436 }
437
438 repo_config = perf_pathdup("config");
439 if (!access(repo_config, R_OK)) {
440 ret += perf_config_from_file(fn, repo_config, data);
441 found += 1;
442 }
443 free(repo_config);
444 if (found == 0)
445 return -1;
446 return ret;
447}
448
449/*
450 * Find all the stuff for perf_config_set() below.
451 */
452
453#define MAX_MATCHES 512
454
/*
 * Search state shared between perf_config_set_multivar() and its
 * parser callback store_aux().
 */
455static struct {
	/* length of the section part of key (offset of its last '.') */
456 int baselen;
	/* full "section.name" key being looked for */
457 char* key;
	/* set when the value pattern started with '!' (inverted match) */
458 int do_not_match;
	/* compiled value pattern, or NULL to match any value */
459 regex_t* value_regex;
	/* copy of the caller's multi_replace flag */
460 int multi_replace;
	/* file offsets recorded with ftell() while parsing */
461 size_t offset[MAX_MATCHES];
	/* how far the search has got within the file */
462 enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
	/* number of matching key/value pairs found so far */
463 int seen;
464} store;
465
466static int matches(const char* key, const char* value)
467{
468 return !strcmp(key, store.key) &&
469 (store.value_regex == NULL ||
470 (store.do_not_match ^
471 !regexec(store.value_regex, value, 0, NULL, 0)));
472}
473
/*
 * Parser callback used by perf_config_set_multivar() to locate the
 * key/value pair(s) to replace.
 *
 * For each variable it updates the file-scope `store`: the search state
 * and, via ftell() on the config file, the file offsets just after the
 * variables of interest.  Returns 1 when more than MAX_MATCHES matches
 * are found, 0 otherwise.  The `cb` argument is not used.
 */
474static int store_aux(const char* key, const char* value, void *cb)
475{
476 const char *ep;
477 size_t section_len;
478
479 switch (store.state) {
	/* a match was already recorded; collect any further matches */
480 case KEY_SEEN:
481 if (matches(key, value)) {
482 if (store.seen == 1 && store.multi_replace == 0) {
483 warning("%s has multiple values", key);
484 } else if (store.seen >= MAX_MATCHES) {
485 error("too many matches for %s", key);
486 return 1;
487 }
488
489 store.offset[store.seen] = ftell(config_file);
490 store.seen++;
491 }
492 break;
493 case SECTION_SEEN:
494 /*
495 * What we are looking for is in store.key (both
496 * section and var), and its section part is baselen
497 * long. We found key (again, both section and var).
498 * We would want to know if this key is in the same
499 * section as what we are looking for. We already
500 * know we are in the same section as what should
501 * hold store.key.
502 */
	/* NOTE(review): assumes key always contains a '.' - confirm */
503 ep = strrchr(key, '.');
504 section_len = ep - key;
505
506 if ((section_len != store.baselen) ||
507 memcmp(key, store.key, section_len+1)) {
508 store.state = SECTION_END_SEEN;
509 break;
510 }
511
512 /*
513 * Do not increment matches: this is no match, but we
514 * just made sure we are in the desired section.
515 */
516 store.offset[store.seen] = ftell(config_file);
517 /* fallthru */
518 case SECTION_END_SEEN:
519 case START:
520 if (matches(key, value)) {
521 store.offset[store.seen] = ftell(config_file);
522 store.state = KEY_SEEN;
523 store.seen++;
524 } else {
	/* same section as the wanted key: remember where it ends so far */
525 if (strrchr(key, '.') - key == store.baselen &&
526 !strncmp(key, store.key, store.baselen)) {
527 store.state = SECTION_SEEN;
528 store.offset[store.seen] = ftell(config_file);
529 }
530 }
531 }
532 return 0;
533}
534
535static int store_write_section(int fd, const char* key)
536{
537 const char *dot;
538 int i, success;
539 struct strbuf sb = STRBUF_INIT;
540
541 dot = memchr(key, '.', store.baselen);
542 if (dot) {
543 strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
544 for (i = dot - key + 1; i < store.baselen; i++) {
545 if (key[i] == '"' || key[i] == '\\')
546 strbuf_addch(&sb, '\\');
547 strbuf_addch(&sb, key[i]);
548 }
549 strbuf_addstr(&sb, "\"]\n");
550 } else {
551 strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
552 }
553
554 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
555 strbuf_release(&sb);
556
557 return success;
558}
559
560static int store_write_pair(int fd, const char* key, const char* value)
561{
562 int i, success;
563 int length = strlen(key + store.baselen + 1);
564 const char *quote = "";
565 struct strbuf sb = STRBUF_INIT;
566
567 /*
568 * Check to see if the value needs to be surrounded with a dq pair.
569 * Note that problematic characters are always backslash-quoted; this
570 * check is about not losing leading or trailing SP and strings that
571 * follow beginning-of-comment characters (i.e. ';' and '#') by the
572 * configuration parser.
573 */
574 if (value[0] == ' ')
575 quote = "\"";
576 for (i = 0; value[i]; i++)
577 if (value[i] == ';' || value[i] == '#')
578 quote = "\"";
579 if (i && value[i - 1] == ' ')
580 quote = "\"";
581
582 strbuf_addf(&sb, "\t%.*s = %s",
583 length, key + store.baselen + 1, quote);
584
585 for (i = 0; value[i]; i++)
586 switch (value[i]) {
587 case '\n':
588 strbuf_addstr(&sb, "\\n");
589 break;
590 case '\t':
591 strbuf_addstr(&sb, "\\t");
592 break;
593 case '"':
594 case '\\':
595 strbuf_addch(&sb, '\\');
596 default:
597 strbuf_addch(&sb, value[i]);
598 break;
599 }
600 strbuf_addf(&sb, "%s\n", quote);
601
602 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
603 strbuf_release(&sb);
604
605 return success;
606}
607
/*
 * Scan backwards in `contents` (of `size` bytes) from offset_ - 2 to
 * find where the entry ending there starts.
 *
 * The scan stops at a newline or at offset 0.  A newline preceded by a
 * backslash is treated as a continuation and the scan carries on from
 * there.  If a ']' is found at a lower offset than the last '=' seen,
 * *found_bracket is set to 1 and the offset just after that ']' is
 * returned; otherwise the offset just after the stopping point is
 * returned and *found_bracket is left alone.
 */
static ssize_t find_beginning_of_line(const char* contents, size_t size,
	size_t offset_, int* found_bracket)
{
	size_t equal_offset = size, bracket_offset = size;
	ssize_t offset;

	for (;;) {
		offset = offset_ - 2;
		while (offset > 0 && contents[offset] != '\n') {
			if (contents[offset] == '=')
				equal_offset = offset;
			else if (contents[offset] == ']')
				bracket_offset = offset;
			offset--;
		}

		/* stop unless the newline we hit is escaped by a backslash */
		if (!(offset > 0 && contents[offset - 1] == '\\'))
			break;
		offset_ = offset;
	}

	if (bracket_offset < equal_offset) {
		*found_bracket = 1;
		return bracket_offset + 1;
	}
	return offset + 1;
}
633
/*
 * Set `key` to `value` in the config file: shorthand for
 * perf_config_set_multivar() with no value pattern and multi_replace
 * off.  Returns that function's result.
 */
int perf_config_set(const char* key, const char* value)
{
	const char *no_value_regex = NULL;

	return perf_config_set_multivar(key, value, no_value_regex, 0);
}
638
639/*
640 * If value==NULL, unset in (remove from) config,
641 * if value_regex!=NULL, disregard key/value pairs where value does not match.
642 * if multi_replace==0, nothing, or only one matching key/value is replaced,
643 * else all matching key/values (regardless how many) are removed,
644 * before the new pair is written.
645 *
646 * Returns 0 on success.
647 *
648 * This function does this:
649 *
650 * - it locks the config file by creating ".perf/config.lock"
651 *
652 * - it then parses the config using store_aux() as validator to find
653 * the position on the key/value pair to replace. If it is to be unset,
654 * it must be found exactly once.
655 *
656 * - the config file is mmap()ed and the part before the match (if any) is
657 * written to the lock file, then the changed part and the rest.
658 *
659 * - the config file is removed and the lock file rename()d to it.
660 *
661 */
662int perf_config_set_multivar(const char* key, const char* value,
663 const char* value_regex, int multi_replace)
664{
665 int i, dot;
666 int fd = -1, in_fd;
667 int ret = 0;
668 char* config_filename;
669 const char* last_dot = strrchr(key, '.');
670
671 if (config_exclusive_filename)
672 config_filename = strdup(config_exclusive_filename);
673 else
674 config_filename = perf_pathdup("config");
675
676 /*
677 * Since "key" actually contains the section name and the real
678 * key name separated by a dot, we have to know where the dot is.
679 */
680
681 if (last_dot == NULL) {
682 error("key does not contain a section: %s", key);
683 ret = 2;
684 goto out_free;
685 }
686 store.baselen = last_dot - key;
687
688 store.multi_replace = multi_replace;
689
690 /*
691 * Validate the key and while at it, lower case it for matching.
692 */
693 store.key = malloc(strlen(key) + 1);
694 dot = 0;
695 for (i = 0; key[i]; i++) {
696 unsigned char c = key[i];
697 if (c == '.')
698 dot = 1;
699 /* Leave the extended basename untouched.. */
700 if (!dot || i > store.baselen) {
701 if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
702 error("invalid key: %s", key);
703 free(store.key);
704 ret = 1;
705 goto out_free;
706 }
707 c = tolower(c);
708 } else if (c == '\n') {
709 error("invalid key (newline): %s", key);
710 free(store.key);
711 ret = 1;
712 goto out_free;
713 }
714 store.key[i] = c;
715 }
716 store.key[i] = 0;
717
718 /*
719 * If .perf/config does not exist yet, write a minimal version.
720 */
721 in_fd = open(config_filename, O_RDONLY);
722 if ( in_fd < 0 ) {
723 free(store.key);
724
725 if ( ENOENT != errno ) {
726 error("opening %s: %s", config_filename,
727 strerror(errno));
728 ret = 3; /* same as "invalid config file" */
729 goto out_free;
730 }
731 /* if nothing to unset, error out */
732 if (value == NULL) {
733 ret = 5;
734 goto out_free;
735 }
736
737 store.key = (char*)key;
738 if (!store_write_section(fd, key) ||
739 !store_write_pair(fd, key, value))
740 goto write_err_out;
741 } else {
742 struct stat st;
743 char* contents;
744 size_t contents_sz, copy_begin, copy_end;
745 int i, new_line = 0;
746
747 if (value_regex == NULL)
748 store.value_regex = NULL;
749 else {
750 if (value_regex[0] == '!') {
751 store.do_not_match = 1;
752 value_regex++;
753 } else
754 store.do_not_match = 0;
755
756 store.value_regex = (regex_t*)malloc(sizeof(regex_t));
757 if (regcomp(store.value_regex, value_regex,
758 REG_EXTENDED)) {
759 error("invalid pattern: %s", value_regex);
760 free(store.value_regex);
761 ret = 6;
762 goto out_free;
763 }
764 }
765
766 store.offset[0] = 0;
767 store.state = START;
768 store.seen = 0;
769
770 /*
771 * After this, store.offset will contain the *end* offset
772 * of the last match, or remain at 0 if no match was found.
773 * As a side effect, we make sure to transform only a valid
774 * existing config file.
775 */
776 if (perf_config_from_file(store_aux, config_filename, NULL)) {
777 error("invalid config file %s", config_filename);
778 free(store.key);
779 if (store.value_regex != NULL) {
780 regfree(store.value_regex);
781 free(store.value_regex);
782 }
783 ret = 3;
784 goto out_free;
785 }
786
787 free(store.key);
788 if (store.value_regex != NULL) {
789 regfree(store.value_regex);
790 free(store.value_regex);
791 }
792
793 /* if nothing to unset, or too many matches, error out */
794 if ((store.seen == 0 && value == NULL) ||
795 (store.seen > 1 && multi_replace == 0)) {
796 ret = 5;
797 goto out_free;
798 }
799
800 fstat(in_fd, &st);
801 contents_sz = xsize_t(st.st_size);
802 contents = mmap(NULL, contents_sz, PROT_READ,
803 MAP_PRIVATE, in_fd, 0);
804 close(in_fd);
805
806 if (store.seen == 0)
807 store.seen = 1;
808
809 for (i = 0, copy_begin = 0; i < store.seen; i++) {
810 if (store.offset[i] == 0) {
811 store.offset[i] = copy_end = contents_sz;
812 } else if (store.state != KEY_SEEN) {
813 copy_end = store.offset[i];
814 } else
815 copy_end = find_beginning_of_line(
816 contents, contents_sz,
817 store.offset[i]-2, &new_line);
818
819 if (copy_end > 0 && contents[copy_end-1] != '\n')
820 new_line = 1;
821
822 /* write the first part of the config */
823 if (copy_end > copy_begin) {
824 if (write_in_full(fd, contents + copy_begin,
825 copy_end - copy_begin) <
826 copy_end - copy_begin)
827 goto write_err_out;
828 if (new_line &&
829 write_in_full(fd, "\n", 1) != 1)
830 goto write_err_out;
831 }
832 copy_begin = store.offset[i];
833 }
834
835 /* write the pair (value == NULL means unset) */
836 if (value != NULL) {
837 if (store.state == START) {
838 if (!store_write_section(fd, key))
839 goto write_err_out;
840 }
841 if (!store_write_pair(fd, key, value))
842 goto write_err_out;
843 }
844
845 /* write the rest of the config */
846 if (copy_begin < contents_sz)
847 if (write_in_full(fd, contents + copy_begin,
848 contents_sz - copy_begin) <
849 contents_sz - copy_begin)
850 goto write_err_out;
851
852 munmap(contents, contents_sz);
853 }
854
855 ret = 0;
856
857out_free:
858 free(config_filename);
859 return ret;
860
861write_err_out:
862 goto out_free;
863
864}
865
/*
 * Report a variable that needs a value but was given none (a bare
 * "[my] var" line means boolean "true").  Returns the result of
 * error().
 */
int config_error_nonbool(const char *var)
{
	int rc = error("Missing value for '%s'", var);

	return rc;
}
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
new file mode 100644
index 000000000000..b90ec004f29c
--- /dev/null
+++ b/tools/perf/util/ctype.c
@@ -0,0 +1,26 @@
1/*
2 * Sane locale-independent, ASCII ctype.
3 *
4 * No surprises, and works with signed and unsigned chars.
5 */
6#include "cache.h"
7
/*
 * One-letter aliases for the character-class bits, so that the lookup
 * table below stays readable.  The GIT_* values are defined outside
 * this file.
 */
enum {
	S = GIT_SPACE,
	A = GIT_ALPHA,
	D = GIT_DIGIT,
	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | */
};

/*
 * Class bits for every possible unsigned char value, indexed by that
 * value.  Only the first 128 entries are spelled out; the remaining
 * 128 are implicitly zero, i.e. bytes above ASCII belong to no class.
 */
unsigned char sane_ctype[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/* 0.. 15 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 16.. 31 */
	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/* 32.. 47 */
	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/* 48.. 63 */
	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/* 64.. 79 */
	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/* 80.. 95 */
	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/* 96..111 */
	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
	/* Nothing in the 128.. range */
};
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c
new file mode 100644
index 000000000000..275b0ee345f5
--- /dev/null
+++ b/tools/perf/util/environment.c
@@ -0,0 +1,9 @@
1/*
2 * We put all the perf config variables in this same object
3 * file, so that programs can link against the config parser
4 * without having to link against all the rest of perf.
5 */
6#include "cache.h"
7
/* No initializer, so this starts out NULL; it is assigned outside this file. */
const char *pager_program;
/* Enabled (1) by default.  NOTE(review): presumably gates color in paged output - confirm against its users. */
int pager_use_color = 1;
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
new file mode 100644
index 000000000000..d39292263153
--- /dev/null
+++ b/tools/perf/util/exec_cmd.c
@@ -0,0 +1,165 @@
1#include "cache.h"
2#include "exec_cmd.h"
3#include "quote.h"
4#define MAX_ARGS 32
5
/* The POSIX process environment vector. */
extern char **environ;
/* Exec-path override installed by perf_set_argv_exec_path(); NULL until then. */
static const char *argv_exec_path;
/*
 * Directory part of argv[0], stored by perf_extract_argv0_path().
 * Stays NULL when argv[0] contained no directory separator.
 */
static const char *argv0_path;
9
10const char *system_path(const char *path)
11{
12#ifdef RUNTIME_PREFIX
13 static const char *prefix;
14#else
15 static const char *prefix = PREFIX;
16#endif
17 struct strbuf d = STRBUF_INIT;
18
19 if (is_absolute_path(path))
20 return path;
21
22#ifdef RUNTIME_PREFIX
23 assert(argv0_path);
24 assert(is_absolute_path(argv0_path));
25
26 if (!prefix &&
27 !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
28 !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
29 !(prefix = strip_path_suffix(argv0_path, "perf"))) {
30 prefix = PREFIX;
31 fprintf(stderr, "RUNTIME_PREFIX requested, "
32 "but prefix computation failed. "
33 "Using static fallback '%s'.\n", prefix);
34 }
35#endif
36
37 strbuf_addf(&d, "%s/%s", prefix, path);
38 path = strbuf_detach(&d, NULL);
39 return path;
40}
41
42const char *perf_extract_argv0_path(const char *argv0)
43{
44 const char *slash;
45
46 if (!argv0 || !*argv0)
47 return NULL;
48 slash = argv0 + strlen(argv0);
49
50 while (argv0 <= slash && !is_dir_sep(*slash))
51 slash--;
52
53 if (slash >= argv0) {
54 argv0_path = strndup(argv0, slash - argv0);
55 return slash + 1;
56 }
57
58 return argv0;
59}
60
61void perf_set_argv_exec_path(const char *exec_path)
62{
63 argv_exec_path = exec_path;
64 /*
65 * Propagate this setting to external programs.
66 */
67 setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
68}
69
70
71/* Returns the highest-priority, location to look for perf programs. */
72const char *perf_exec_path(void)
73{
74 const char *env;
75
76 if (argv_exec_path)
77 return argv_exec_path;
78
79 env = getenv(EXEC_PATH_ENVIRONMENT);
80 if (env && *env) {
81 return env;
82 }
83
84 return system_path(PERF_EXEC_PATH);
85}
86
87static void add_path(struct strbuf *out, const char *path)
88{
89 if (path && *path) {
90 if (is_absolute_path(path))
91 strbuf_addstr(out, path);
92 else
93 strbuf_addstr(out, make_nonrelative_path(path));
94
95 strbuf_addch(out, PATH_SEP);
96 }
97}
98
99void setup_path(void)
100{
101 const char *old_path = getenv("PATH");
102 struct strbuf new_path = STRBUF_INIT;
103
104 add_path(&new_path, perf_exec_path());
105 add_path(&new_path, argv0_path);
106
107 if (old_path)
108 strbuf_addstr(&new_path, old_path);
109 else
110 strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
111
112 setenv("PATH", new_path.buf, 1);
113
114 strbuf_release(&new_path);
115}
116
/*
 * Build the argument vector for running a perf subcommand: a new
 * array holding "perf" followed by every entry of the NULL-terminated
 * argv, itself NULL-terminated.  Only the array is allocated; the
 * strings are shared with the caller.
 *
 * Returns the new array, which the caller frees with free(), or NULL
 * when the allocation fails.
 */
const char **prepare_perf_cmd(const char **argv)
{
	int argc;
	const char **nargv;

	for (argc = 0; argv[argc]; argc++)
		; /* just counting */

	/* One extra slot for "perf" and one for the terminating NULL. */
	nargv = malloc(sizeof(*nargv) * (argc + 2));
	if (!nargv)
		return NULL;

	nargv[0] = "perf";
	for (argc = 0; argv[argc]; argc++)
		nargv[argc + 1] = argv[argc];
	nargv[argc + 1] = NULL;
	return nargv;
}

/*
 * Run "perf" (looked up via $PATH) with the given NULL-terminated
 * arguments.  Only returns on failure, and then always with -1:
 * either the argument vector could not be allocated or execvp()
 * itself failed.
 */
int execv_perf_cmd(const char **argv) {
	const char **nargv = prepare_perf_cmd(argv);

	if (!nargv)
		return -1;

	/* execvp() can only ever return if it fails */
	execvp("perf", (char **)nargv);

	free(nargv);
	return -1;
}
142
143
144int execl_perf_cmd(const char *cmd,...)
145{
146 int argc;
147 const char *argv[MAX_ARGS + 1];
148 const char *arg;
149 va_list param;
150
151 va_start(param, cmd);
152 argv[0] = cmd;
153 argc = 1;
154 while (argc < MAX_ARGS) {
155 arg = argv[argc++] = va_arg(param, char *);
156 if (!arg)
157 break;
158 }
159 va_end(param);
160 if (MAX_ARGS <= argc)
161 return error("too many args to run %s", cmd);
162
163 argv[argc] = NULL;
164 return execv_perf_cmd(argv);
165}
diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h
new file mode 100644
index 000000000000..effe25eb1545
--- /dev/null
+++ b/tools/perf/util/exec_cmd.h
@@ -0,0 +1,13 @@
1#ifndef PERF_EXEC_CMD_H
2#define PERF_EXEC_CMD_H
3
/* Record an explicit exec path and export it via the environment. */
extern void perf_set_argv_exec_path(const char *exec_path);
/* Remember the directory part of argv[0]; returns the program-name part. */
extern const char *perf_extract_argv0_path(const char *path);
/* Explicit exec path, else the environment override, else the built-in default. */
extern const char *perf_exec_path(void);
/* Prepend the exec path and the argv[0] directory to $PATH. */
extern void setup_path(void);
/* Allocate a copy of argv with "perf" inserted in front. */
extern const char **prepare_perf_cmd(const char **argv);
extern int execv_perf_cmd(const char **argv); /* NULL terminated */
/* Variadic variant; the argument list must end with NULL. */
extern int execl_perf_cmd(const char *cmd, ...);
/* Absolute paths are returned as is; others are placed under the prefix. */
extern const char *system_path(const char *path);
12
13#endif /* PERF_EXEC_CMD_H */
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
new file mode 100755
index 000000000000..f06f6fd148f8
--- /dev/null
+++ b/tools/perf/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
1#!/bin/sh
2
3echo "/* Automatically generated by $0 */
4struct cmdname_help
5{
6 char name[16];
7 char help[80];
8};
9
10static struct cmdname_help common_cmds[] = {"
11
12sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt |
13sort |
14while read cmd
15do
16 sed -n '
17 /^NAME/,/perf-'"$cmd"'/H
18 ${
19 x
20 s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/
21 p
22 }' "Documentation/perf-$cmd.txt"
23done
24echo "};"
diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c
new file mode 100644
index 000000000000..6653f7dd1d78
--- /dev/null
+++ b/tools/perf/util/help.c
@@ -0,0 +1,367 @@
1#include "cache.h"
2#include "../builtin.h"
3#include "exec_cmd.h"
4#include "levenshtein.h"
5#include "help.h"
6
/*
 * Width of the terminal in columns.  A positive $COLUMNS wins (most GUI
 * terminals set it, although some don't export it); otherwise the
 * window size of file descriptor 1 is queried where TIOCGWINSZ exists;
 * the last resort is 80.
 */
static int term_columns(void)
{
	const char *env = getenv("COLUMNS");

	if (env) {
		int from_env = atoi(env);

		if (from_env > 0)
			return from_env;
	}

#ifdef TIOCGWINSZ
	{
		struct winsize ws;

		if (ioctl(1, TIOCGWINSZ, &ws) == 0 && ws.ws_col != 0)
			return ws.ws_col;
	}
#endif

	return 80;
}
28
29void add_cmdname(struct cmdnames *cmds, const char *name, int len)
30{
31 struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
32
33 ent->len = len;
34 memcpy(ent->name, name, len);
35 ent->name[len] = 0;
36
37 ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
38 cmds->names[cmds->cnt++] = ent;
39}
40
41static void clean_cmdnames(struct cmdnames *cmds)
42{
43 int i;
44 for (i = 0; i < cmds->cnt; ++i)
45 free(cmds->names[i]);
46 free(cmds->names);
47 cmds->cnt = 0;
48 cmds->alloc = 0;
49}
50
51static int cmdname_compare(const void *a_, const void *b_)
52{
53 struct cmdname *a = *(struct cmdname **)a_;
54 struct cmdname *b = *(struct cmdname **)b_;
55 return strcmp(a->name, b->name);
56}
57
58static void uniq(struct cmdnames *cmds)
59{
60 int i, j;
61
62 if (!cmds->cnt)
63 return;
64
65 for (i = j = 1; i < cmds->cnt; i++)
66 if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
67 cmds->names[j++] = cmds->names[i];
68
69 cmds->cnt = j;
70}
71
72void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
73{
74 int ci, cj, ei;
75 int cmp;
76
77 ci = cj = ei = 0;
78 while (ci < cmds->cnt && ei < excludes->cnt) {
79 cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
80 if (cmp < 0)
81 cmds->names[cj++] = cmds->names[ci++];
82 else if (cmp == 0)
83 ci++, ei++;
84 else if (cmp > 0)
85 ei++;
86 }
87
88 while (ci < cmds->cnt)
89 cmds->names[cj++] = cmds->names[ci++];
90
91 cmds->cnt = cj;
92}
93
94static void pretty_print_string_list(struct cmdnames *cmds, int longest)
95{
96 int cols = 1, rows;
97 int space = longest + 1; /* min 1 SP between words */
98 int max_cols = term_columns() - 1; /* don't print *on* the edge */
99 int i, j;
100
101 if (space < max_cols)
102 cols = max_cols / space;
103 rows = (cmds->cnt + cols - 1) / cols;
104
105 for (i = 0; i < rows; i++) {
106 printf(" ");
107
108 for (j = 0; j < cols; j++) {
109 int n = j * rows + i;
110 int size = space;
111 if (n >= cmds->cnt)
112 break;
113 if (j == cols-1 || n + rows >= cmds->cnt)
114 size = 1;
115 printf("%-*s", size, cmds->names[n]->name);
116 }
117 putchar('\n');
118 }
119}
120
/*
 * Non-zero when name is (after following symlinks) a regular file
 * with the owner-execute bit set.  On MinGW the mode bit cannot be
 * trusted, so the first two bytes of the file decide instead.
 */
static int is_executable(const char *name)
{
	struct stat st;

	if (stat(name, &st) != 0) /* stat, not lstat */
		return 0;
	if (!S_ISREG(st.st_mode))
		return 0;

#ifdef __MINGW32__
	/* cannot trust the executable bit, peek into the file instead */
	char buf[3] = { 0 };
	int n;
	int fd = open(name, O_RDONLY);
	st.st_mode &= ~S_IXUSR;
	if (fd >= 0) {
		n = read(fd, buf, 2);
		/* DOS executables start with "MZ" */
		if (n == 2 && (!strcmp(buf, "#!") || !strcmp(buf, "MZ")))
			st.st_mode |= S_IXUSR;
		close(fd);
	}
#endif
	return st.st_mode & S_IXUSR;
}
146
147static void list_commands_in_dir(struct cmdnames *cmds,
148 const char *path,
149 const char *prefix)
150{
151 int prefix_len;
152 DIR *dir = opendir(path);
153 struct dirent *de;
154 struct strbuf buf = STRBUF_INIT;
155 int len;
156
157 if (!dir)
158 return;
159 if (!prefix)
160 prefix = "perf-";
161 prefix_len = strlen(prefix);
162
163 strbuf_addf(&buf, "%s/", path);
164 len = buf.len;
165
166 while ((de = readdir(dir)) != NULL) {
167 int entlen;
168
169 if (prefixcmp(de->d_name, prefix))
170 continue;
171
172 strbuf_setlen(&buf, len);
173 strbuf_addstr(&buf, de->d_name);
174 if (!is_executable(buf.buf))
175 continue;
176
177 entlen = strlen(de->d_name) - prefix_len;
178 if (has_extension(de->d_name, ".exe"))
179 entlen -= 4;
180
181 add_cmdname(cmds, de->d_name + prefix_len, entlen);
182 }
183 closedir(dir);
184 strbuf_release(&buf);
185}
186
187void load_command_list(const char *prefix,
188 struct cmdnames *main_cmds,
189 struct cmdnames *other_cmds)
190{
191 const char *env_path = getenv("PATH");
192 const char *exec_path = perf_exec_path();
193
194 if (exec_path) {
195 list_commands_in_dir(main_cmds, exec_path, prefix);
196 qsort(main_cmds->names, main_cmds->cnt,
197 sizeof(*main_cmds->names), cmdname_compare);
198 uniq(main_cmds);
199 }
200
201 if (env_path) {
202 char *paths, *path, *colon;
203 path = paths = strdup(env_path);
204 while (1) {
205 if ((colon = strchr(path, PATH_SEP)))
206 *colon = 0;
207 if (!exec_path || strcmp(path, exec_path))
208 list_commands_in_dir(other_cmds, path, prefix);
209
210 if (!colon)
211 break;
212 path = colon + 1;
213 }
214 free(paths);
215
216 qsort(other_cmds->names, other_cmds->cnt,
217 sizeof(*other_cmds->names), cmdname_compare);
218 uniq(other_cmds);
219 }
220 exclude_cmds(other_cmds, main_cmds);
221}
222
223void list_commands(const char *title, struct cmdnames *main_cmds,
224 struct cmdnames *other_cmds)
225{
226 int i, longest = 0;
227
228 for (i = 0; i < main_cmds->cnt; i++)
229 if (longest < main_cmds->names[i]->len)
230 longest = main_cmds->names[i]->len;
231 for (i = 0; i < other_cmds->cnt; i++)
232 if (longest < other_cmds->names[i]->len)
233 longest = other_cmds->names[i]->len;
234
235 if (main_cmds->cnt) {
236 const char *exec_path = perf_exec_path();
237 printf("available %s in '%s'\n", title, exec_path);
238 printf("----------------");
239 mput_char('-', strlen(title) + strlen(exec_path));
240 putchar('\n');
241 pretty_print_string_list(main_cmds, longest);
242 putchar('\n');
243 }
244
245 if (other_cmds->cnt) {
246 printf("%s available from elsewhere on your $PATH\n", title);
247 printf("---------------------------------------");
248 mput_char('-', strlen(title));
249 putchar('\n');
250 pretty_print_string_list(other_cmds, longest);
251 putchar('\n');
252 }
253}
254
255int is_in_cmdlist(struct cmdnames *c, const char *s)
256{
257 int i;
258 for (i = 0; i < c->cnt; i++)
259 if (!strcmp(s, c->names[i]->name))
260 return 1;
261 return 0;
262}
263
/*
 * Value of the "help.autocorrect" setting.  Non-zero makes
 * help_unknown_cmd() run the single best match; a positive value is
 * additionally a delay in tenths of a second before doing so.
 */
static int autocorrect;
/* Names of all "alias.*" config entries, collected by perf_unknown_cmd_config(). */
static struct cmdnames aliases;
266
267static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
268{
269 if (!strcmp(var, "help.autocorrect"))
270 autocorrect = perf_config_int(var,value);
271 /* Also use aliases for command lookup */
272 if (!prefixcmp(var, "alias."))
273 add_cmdname(&aliases, var + 6, strlen(var + 6));
274
275 return perf_default_config(var, value, cb);
276}
277
278static int levenshtein_compare(const void *p1, const void *p2)
279{
280 const struct cmdname *const *c1 = p1, *const *c2 = p2;
281 const char *s1 = (*c1)->name, *s2 = (*c2)->name;
282 int l1 = (*c1)->len;
283 int l2 = (*c2)->len;
284 return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
285}
286
287static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
288{
289 int i;
290 ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
291
292 for (i = 0; i < old->cnt; i++)
293 cmds->names[cmds->cnt++] = old->names[i];
294 free(old->names);
295 old->cnt = 0;
296 old->names = NULL;
297}
298
299const char *help_unknown_cmd(const char *cmd)
300{
301 int i, n = 0, best_similarity = 0;
302 struct cmdnames main_cmds, other_cmds;
303
304 memset(&main_cmds, 0, sizeof(main_cmds));
305 memset(&other_cmds, 0, sizeof(main_cmds));
306 memset(&aliases, 0, sizeof(aliases));
307
308 perf_config(perf_unknown_cmd_config, NULL);
309
310 load_command_list("perf-", &main_cmds, &other_cmds);
311
312 add_cmd_list(&main_cmds, &aliases);
313 add_cmd_list(&main_cmds, &other_cmds);
314 qsort(main_cmds.names, main_cmds.cnt,
315 sizeof(main_cmds.names), cmdname_compare);
316 uniq(&main_cmds);
317
318 if (main_cmds.cnt) {
319 /* This reuses cmdname->len for similarity index */
320 for (i = 0; i < main_cmds.cnt; ++i)
321 main_cmds.names[i]->len =
322 levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
323
324 qsort(main_cmds.names, main_cmds.cnt,
325 sizeof(*main_cmds.names), levenshtein_compare);
326
327 best_similarity = main_cmds.names[0]->len;
328 n = 1;
329 while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
330 ++n;
331 }
332
333 if (autocorrect && n == 1) {
334 const char *assumed = main_cmds.names[0]->name;
335
336 main_cmds.names[0] = NULL;
337 clean_cmdnames(&main_cmds);
338 fprintf(stderr, "WARNING: You called a Git program named '%s', "
339 "which does not exist.\n"
340 "Continuing under the assumption that you meant '%s'\n",
341 cmd, assumed);
342 if (autocorrect > 0) {
343 fprintf(stderr, "in %0.1f seconds automatically...\n",
344 (float)autocorrect/10.0);
345 poll(NULL, 0, autocorrect * 100);
346 }
347 return assumed;
348 }
349
350 fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
351
352 if (main_cmds.cnt && best_similarity < 6) {
353 fprintf(stderr, "\nDid you mean %s?\n",
354 n < 2 ? "this": "one of these");
355
356 for (i = 0; i < n; i++)
357 fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
358 }
359
360 exit(1);
361}
362
363int cmd_version(int argc, const char **argv, const char *prefix)
364{
365 printf("perf version %s\n", perf_version_string);
366 return 0;
367}
diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h
new file mode 100644
index 000000000000..56bc15406ffc
--- /dev/null
+++ b/tools/perf/util/help.h
@@ -0,0 +1,29 @@
1#ifndef HELP_H
2#define HELP_H
3
/* Growable array of individually allocated command-name entries. */
struct cmdnames {
	int alloc;	/* capacity bookkeeping, handed to ALLOC_GROW() in help.c */
	int cnt;	/* number of entries currently in names[] */
	struct cmdname {
		size_t len; /* also used for similarity index in help.c */
		char name[FLEX_ARRAY];	/* NUL-terminated name, stored inline */
	} **names;
};
12
/* Write the character c to stdout num times. */
static inline void mput_char(char c, unsigned int num)
{
	unsigned int left;

	for (left = num; left != 0; left--)
		putchar(c);
}
18
/*
 * Collect commands from the perf exec path into main_cmds and from the
 * rest of $PATH into other_cmds; both come back sorted and deduplicated.
 */
void load_command_list(const char *prefix,
		struct cmdnames *main_cmds,
		struct cmdnames *other_cmds);
/* Append a copy of the first len bytes of name to cmds. */
void add_cmdname(struct cmdnames *cmds, const char *name, int len);
/*
 * Here we require that excludes is a sorted list.  The implementation
 * walks both lists in step, so cmds has to be sorted the same way.
 */
void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
/* 1 if s is the name of an entry in c, else 0. */
int is_in_cmdlist(struct cmdnames *c, const char *s);
/* Print both lists in columns under headings built from title. */
void list_commands(const char *title, struct cmdnames *main_cmds,
		struct cmdnames *other_cmds);
28
29#endif /* HELP_H */
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c
new file mode 100644
index 000000000000..e521d1516df6
--- /dev/null
+++ b/tools/perf/util/levenshtein.c
@@ -0,0 +1,84 @@
1#include "cache.h"
2#include "levenshtein.h"
3
/*
 * This function implements the Damerau-Levenshtein algorithm to
 * calculate a distance between strings.
 *
 * Basically, it says how many letters need to be swapped, substituted,
 * deleted from, or added to string1, at least, to get string2.
 *
 * The idea is to build a distance matrix for the substrings of both
 * strings.  To avoid a large space complexity, only the last three rows
 * are kept in memory (if swaps had the same or higher cost as one deletion
 * plus one insertion, only two rows would be needed).
 *
 * At any stage, "i + 1" denotes the length of the current substring of
 * string1 that the distance is calculated for.
 *
 * row2 holds the current row, row1 the previous row (i.e. for the substring
 * of string1 of length "i"), and row0 the row before that.
 *
 * In other words, at the start of the big loop, row2[j + 1] contains the
 * Damerau-Levenshtein distance between the substring of string1 of length
 * "i" and the substring of string2 of length "j + 1".
 *
 * All the big loop does is determine the partial minimum-cost paths.
 *
 * It does so by calculating the costs of the path ending in characters
 * i (in string1) and j (in string2), respectively, given that the last
 * operation is a substitution, a swap, a deletion, or an insertion.
 *
 * This implementation allows the costs to be weighted:
 *
 * - w (as in "sWap")
 * - s (as in "Substitution")
 * - a (for insertion, AKA "Add")
 * - d (as in "Deletion")
 *
 * Note that this algorithm calculates a distance _iff_ d == a.
 *
 * The three rows live in one allocation.  If that allocation fails, the
 * function does not touch memory it does not own; it returns the cost of
 * deleting all of string1 and inserting all of string2, which is an upper
 * bound of the real result.
 */
int levenshtein(const char *string1, const char *string2,
		int w, int s, int a, int d)
{
	int len1 = strlen(string1), len2 = strlen(string2);
	size_t width = (size_t)len2 + 1;
	int *rows = malloc(3 * width * sizeof(*rows));
	int *row0, *row1, *row2;
	int i, j;

	if (!rows)
		return len1 * d + len2 * a;

	row0 = rows;
	row1 = rows + width;
	row2 = rows + 2 * width;

	/* Distance from the empty prefix of string1: insertions only. */
	for (j = 0; j <= len2; j++)
		row1[j] = j * a;
	for (i = 0; i < len1; i++) {
		int *dummy;

		row2[0] = (i + 1) * d;
		for (j = 0; j < len2; j++) {
			/* substitution */
			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
			/* swap */
			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
					string1[i] == string2[j - 1] &&
					row2[j + 1] > row0[j - 1] + w)
				row2[j + 1] = row0[j - 1] + w;
			/* deletion */
			if (row2[j + 1] > row1[j + 1] + d)
				row2[j + 1] = row1[j + 1] + d;
			/* insertion */
			if (row2[j + 1] > row2[j] + a)
				row2[j + 1] = row2[j] + a;
		}

		/* Rotate: the oldest row becomes the scratch row for the next pass. */
		dummy = row0;
		row0 = row1;
		row1 = row2;
		row2 = dummy;
	}

	i = row1[len2];
	free(rows);

	return i;
}
diff --git a/tools/perf/util/levenshtein.h b/tools/perf/util/levenshtein.h
new file mode 100644
index 000000000000..0173abeef52c
--- /dev/null
+++ b/tools/perf/util/levenshtein.h
@@ -0,0 +1,8 @@
1#ifndef LEVENSHTEIN_H
2#define LEVENSHTEIN_H
3
/*
 * Weighted edit distance from string1 to string2.  The four penalties
 * are the costs of swapping two adjacent characters, substituting one
 * character, inserting one and deleting one, in that order.
 */
int levenshtein(const char *string1, const char *string2,
		int swap_penalty, int substition_penalty,
		int insertion_penalty, int deletion_penalty);
7
8#endif
diff --git a/tools/perf/util/list.h b/tools/perf/util/list.h
new file mode 100644
index 000000000000..e2548e8072cf
--- /dev/null
+++ b/tools/perf/util/list.h
@@ -0,0 +1,603 @@
1#ifndef _LINUX_LIST_H
2#define _LINUX_LIST_H
3/*
4 Copyright (C) Cast of dozens, comes from the Linux kernel
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of version 2 of the GNU General Public License as
8 published by the Free Software Foundation.
9*/
10
11#include <stddef.h>
12
13/*
14 * These are non-NULL pointers that will result in page faults
15 * under normal circumstances, used to verify that nobody uses
16 * non-initialized list entries.
17 */
18#define LIST_POISON1 ((void *)0x00100100)
19#define LIST_POISON2 ((void *)0x00200200)
20
/**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:	the pointer to the member.
 * @type:	the type of the container struct this is embedded in.
 * @member:	the name of the member within the struct.
 *
 * The temporary __mptr is declared with the member's own type, so the
 * compiler can diagnose a @ptr of an incompatible pointer type.  The macro
 * relies on typeof and on a ({ ... }) statement expression.
 */
#define container_of(ptr, type, member) ({			\
	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
	(type *)( (char *)__mptr - offsetof(type,member) );})
31
32/*
33 * Simple doubly linked list implementation.
34 *
35 * Some of the internal functions ("__xxx") are useful when
36 * manipulating whole lists rather than single entries, as
37 * sometimes we already know the next/prev entries and we can
38 * generate better code by using them directly rather than
39 * using the generic single-entry routines.
40 */
41
/* Link node; embed one in every structure that is to be put on a list. */
struct list_head {
	struct list_head *next;
	struct list_head *prev;
};

/* Initializer for a list head that starts out empty (linked to itself). */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a list head called name, initialized to the empty list. */
#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

/* Run-time counterpart of LIST_HEAD_INIT(): make list an empty list. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list->next;
}
56
/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * Nothing here verifies that @prev and @next really are neighbours;
 * all four affected links are simply overwritten.
 */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}
72
73/**
74 * list_add - add a new entry
75 * @new: new entry to be added
76 * @head: list head to add it after
77 *
78 * Insert a new entry after the specified head.
79 * This is good for implementing stacks.
80 */
81static inline void list_add(struct list_head *new, struct list_head *head)
82{
83 __list_add(new, head, head->next);
84}
85
86/**
87 * list_add_tail - add a new entry
88 * @new: new entry to be added
89 * @head: list head to add it before
90 *
91 * Insert a new entry before the specified head.
92 * This is useful for implementing queues.
93 */
94static inline void list_add_tail(struct list_head *new, struct list_head *head)
95{
96 __list_add(new, head->prev, head);
97}
98
/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The removed entry itself is not passed in and is therefore left
 * exactly as it was.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
	next->prev = prev;
	prev->next = next;
}
111
/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty on entry does not return true after this, the entry is
 * in an undefined state.
 *
 * The entry's own links are overwritten with LIST_POISON1/LIST_POISON2
 * after it has been unlinked.
 */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}
124
/**
 * list_del_range - deletes range of entries from list.
 * @begin: first element in the range to delete from the list.
 * @end: last element in the range to delete from the list.
 * Note: list_empty on the range of entries does not return true after this,
 * the entries are in an undefined state.
 *
 * Only the neighbours of the range are relinked; the links of the removed
 * entries themselves are left untouched.
 */
static inline void list_del_range(struct list_head *begin,
				  struct list_head *end)
{
	begin->prev->next = end->next;
	end->next->prev = begin->prev;
}
138
/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 * Note: if 'old' was empty, it will be overwritten.
 *
 * @old keeps its previous link values; it is neither poisoned nor
 * reinitialized here (see list_replace_init() for the latter).
 */
static inline void list_replace(struct list_head *old,
				struct list_head *new)
{
	new->next = old->next;
	new->next->prev = new;
	new->prev = old->prev;
	new->prev->next = new;
}
153
154static inline void list_replace_init(struct list_head *old,
155 struct list_head *new)
156{
157 list_replace(old, new);
158 INIT_LIST_HEAD(old);
159}
160
161/**
162 * list_del_init - deletes entry from list and reinitialize it.
163 * @entry: the element to delete from the list.
164 */
165static inline void list_del_init(struct list_head *entry)
166{
167 __list_del(entry->prev, entry->next);
168 INIT_LIST_HEAD(entry);
169}
170
171/**
172 * list_move - delete from one list and add as another's head
173 * @list: the entry to move
174 * @head: the head that will precede our entry
175 */
176static inline void list_move(struct list_head *list, struct list_head *head)
177{
178 __list_del(list->prev, list->next);
179 list_add(list, head);
180}
181
182/**
183 * list_move_tail - delete from one list and add as another's tail
184 * @list: the entry to move
185 * @head: the head that will follow our entry
186 */
187static inline void list_move_tail(struct list_head *list,
188 struct list_head *head)
189{
190 __list_del(list->prev, list->next);
191 list_add_tail(list, head);
192}
193
194/**
195 * list_is_last - tests whether @list is the last entry in list @head
196 * @list: the entry to test
197 * @head: the head of the list
198 */
199static inline int list_is_last(const struct list_head *list,
200 const struct list_head *head)
201{
202 return list->next == head;
203}
204
205/**
206 * list_empty - tests whether a list is empty
207 * @head: the list to test.
208 */
209static inline int list_empty(const struct list_head *head)
210{
211 return head->next == head;
212}
213
/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
	/* next is read once; that single value is checked against head and prev */
	struct list_head *next = head->next;
	return (next == head) && (next == head->prev);
}
232
/*
 * Link all entries of @list in between @head and the entry that used to
 * follow @head.  @list itself is not modified and still points at the
 * moved entries afterwards.  There is no emptiness check here; the
 * list_splice() wrappers only call this for a non-empty @list.
 */
static inline void __list_splice(struct list_head *list,
				 struct list_head *head)
{
	struct list_head *first = list->next;
	struct list_head *last = list->prev;
	struct list_head *at = head->next;

	first->prev = head;
	head->next = first;

	last->next = at;
	at->prev = last;
}
246
/**
 * list_splice - join two lists
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Does nothing for an empty @list.  @list is not reinitialized.
 */
static inline void list_splice(struct list_head *list, struct list_head *head)
{
	if (list_empty(list))
		return;

	__list_splice(list, head);
}
257
/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The list at @list is reinitialised, so it is empty and reusable
 * afterwards.  Does nothing for an already empty @list.
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (list_empty(list))
		return;

	__list_splice(list, head);
	INIT_LIST_HEAD(list);
}
273
/**
 * list_entry - get the struct for this entry
 * @ptr:	the &struct list_head pointer.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 *
 * This is container_of() under a list-specific name.
 */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)
282
/**
 * list_first_entry - get the first element from a list
 * @ptr:	the list head to take the element from.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 *
 * Note, that list is expected to be not empty.  On an empty list
 * (ptr)->next is the head itself, so the result would be computed from
 * the head rather than from a real element.
 */
#define list_first_entry(ptr, type, member) \
	list_entry((ptr)->next, type, member)
293
/**
 * list_for_each	-	iterate over a list
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 *
 * pos->next is read after each pass through the body, so the body must
 * not unlink @pos; use list_for_each_safe() for that.
 */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); \
		pos = pos->next)
302
/**
 * __list_for_each	-	iterate over a list
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 *
 * This is the simplest possible list iteration code, no prefetching
 * is done.  In this header list_for_each() does no prefetching either,
 * so the two macros expand to the same loop.
 */
#define __list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)
315
/**
 * list_for_each_prev	-	iterate over a list backwards
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 *
 * pos->prev is read after each pass through the body, so the body must
 * not unlink @pos.
 */
#define list_for_each_prev(pos, head) \
	for (pos = (head)->prev; pos != (head); \
		pos = pos->prev)
324
/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:	the &struct list_head to use as a loop cursor.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 *
 * The successor is saved in @n before the body runs, which is what makes
 * it safe for the body to remove @pos.  Removing the entry held in @n is
 * not covered by this.
 */
#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, n = pos->next)
334
/**
 * list_for_each_entry	-	iterate over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * pos->member.next is read after each pass through the body, so the body
 * must not unlink @pos; use list_for_each_entry_safe() for that.
 */
#define list_for_each_entry(pos, head, member)				\
	for (pos = list_entry((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))
345
/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * pos->member.prev is read after each pass through the body, so the body
 * must not unlink @pos.
 */
#define list_for_each_entry_reverse(pos, head, member)			\
	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member))
356
/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
 * @pos:	the type * to use as a start point
 * @head:	the head of the list
 * @member:	the name of the list_struct within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
 * A non-NULL @pos is returned as is; for a NULL @pos the result is the
 * pseudo-entry computed from @head, so that continuing from it starts at
 * the first real element.  Uses the two-operand "?:" form of the
 * conditional operator.
 */
#define list_prepare_entry(pos, head, member) \
	((pos) ? : list_entry(head, typeof(*pos), member))
367
/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.  The entry @pos points at on entry is itself
 * not visited.
 */
#define list_for_each_entry_continue(pos, head, member) 		\
	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))
381
/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 * @pos is parenthesized at every use so any lvalue expression may be passed.
 */
#define list_for_each_entry_from(pos, head, member)			\
	for (; &(pos)->member != (head);				\
	     (pos) = list_entry((pos)->member.next, typeof(*(pos)), member))
393
/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * The successor is stored in @n before the body runs, so the body may
 * unlink @pos.  Cursor arguments are parenthesized at every use.
 */
#define list_for_each_entry_safe(pos, n, head, member)			\
	for ((pos) = list_entry((head)->next, typeof(*(pos)), member),	\
		(n) = list_entry((pos)->member.next, typeof(*(pos)), member);	\
	     &(pos)->member != (head);					\
	     (pos) = (n), (n) = list_entry((n)->member.next, typeof(*(n)), member))
406
/**
 * list_for_each_entry_safe_continue
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 * Cursor arguments are parenthesized at every use.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)		\
	for ((pos) = list_entry((pos)->member.next, typeof(*(pos)), member),	\
		(n) = list_entry((pos)->member.next, typeof(*(pos)), member);	\
	     &(pos)->member != (head);					\
	     (pos) = (n), (n) = list_entry((n)->member.next, typeof(*(n)), member))
422
/**
 * list_for_each_entry_safe_from
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 * Cursor arguments are parenthesized at every use.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)		\
	for ((n) = list_entry((pos)->member.next, typeof(*(pos)), member);	\
	     &(pos)->member != (head);					\
	     (pos) = (n), (n) = list_entry((n)->member.next, typeof(*(n)), member))
437
/**
 * list_for_each_entry_safe_reverse
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 * Cursor arguments are parenthesized at every use.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
	for ((pos) = list_entry((head)->prev, typeof(*(pos)), member),	\
		(n) = list_entry((pos)->member.prev, typeof(*(pos)), member);	\
	     &(pos)->member != (head);					\
	     (pos) = (n), (n) = list_entry((n)->member.prev, typeof(*(n)), member))
453
454/*
455 * Double linked lists with a single pointer list head.
456 * Mostly useful for hash tables where the two pointer list head is
457 * too wasteful.
458 * You lose the ability to access the tail in O(1).
459 */
460
/* List head holding a single pointer: the first node, or NULL when empty. */
461struct hlist_head {
462 struct hlist_node *first;
463};
464
/*
 * List node.  @next is the following node (NULL at the tail).  @pprev points
 * at the pointer that references this node: either the head's ->first or the
 * previous node's ->next (see hlist_add_head()).
 */
465struct hlist_node {
466 struct hlist_node *next, **pprev;
467};
468
/* Initializer, definition helper and run-time reset for an empty hlist head. */
469#define HLIST_HEAD_INIT { .first = NULL }
470#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
471#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
472static inline void INIT_HLIST_NODE(struct hlist_node *h)
473{
474 h->next = NULL;
475 h->pprev = NULL;
476}
477
478static inline int hlist_unhashed(const struct hlist_node *h)
479{
480 return !h->pprev;
481}
482
483static inline int hlist_empty(const struct hlist_head *h)
484{
485 return !h->first;
486}
487
488static inline void __hlist_del(struct hlist_node *n)
489{
490 struct hlist_node *next = n->next;
491 struct hlist_node **pprev = n->pprev;
492 *pprev = next;
493 if (next)
494 next->pprev = pprev;
495}
496
497static inline void hlist_del(struct hlist_node *n)
498{
499 __hlist_del(n);
500 n->next = LIST_POISON1;
501 n->pprev = LIST_POISON2;
502}
503
/* Unlink @n if it is linked and leave it re-initialised; no-op otherwise. */
static inline void hlist_del_init(struct hlist_node *n)
{
	if (hlist_unhashed(n))
		return;

	__hlist_del(n);
	INIT_HLIST_NODE(n);
}
511
512static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
513{
514 struct hlist_node *first = h->first;
515 n->next = first;
516 if (first)
517 first->pprev = &n->next;
518 h->first = n;
519 n->pprev = &h->first;
520}
521
522/* next must be != NULL */
523static inline void hlist_add_before(struct hlist_node *n,
524 struct hlist_node *next)
525{
526 n->pprev = next->pprev;
527 n->next = next;
528 next->pprev = &n->next;
529 *(n->pprev) = n;
530}
531
532static inline void hlist_add_after(struct hlist_node *n,
533 struct hlist_node *next)
534{
535 next->next = n->next;
536 n->next = next;
537 next->pprev = &n->next;
538
539 if(next->next)
540 next->next->pprev = &next->next;
541}
542
/* Map a pointer to an embedded hlist_node back to its containing object. */
#define hlist_entry(ptr, type, member) \
	container_of(ptr, type, member)
544
/*
 * Walk every node of the hlist headed by @head, using @pos as the cursor.
 * @pos is parenthesized at every use so any lvalue expression may be passed.
 */
#define hlist_for_each(pos, head) \
	for ((pos) = (head)->first; (pos); \
	     (pos) = (pos)->next)
548
/*
 * Walk every node of the hlist headed by @head; the successor is saved in
 * @n before the body runs, so the body may unlink @pos.
 * Cursor arguments are parenthesized at every use.
 */
#define hlist_for_each_safe(pos, n, head) \
	for ((pos) = (head)->first; (pos) && ({ (n) = (pos)->next; 1; }); \
	     (pos) = (n))
552
/**
 * hlist_for_each_entry	- iterate over list of given type
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 *
 * Cursor arguments are parenthesized at every use.
 */
#define hlist_for_each_entry(tpos, pos, head, member)			 \
	for ((pos) = (head)->first;					 \
	     (pos) &&							 \
		({ (tpos) = hlist_entry(pos, typeof(*(tpos)), member); 1;}); \
	     (pos) = (pos)->next)
565
/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @member:	the name of the hlist_node within the struct.
 *
 * Cursor arguments are parenthesized at every use.
 */
#define hlist_for_each_entry_continue(tpos, pos, member)		 \
	for ((pos) = (pos)->next;					 \
	     (pos) &&							 \
		({ (tpos) = hlist_entry(pos, typeof(*(tpos)), member); 1;}); \
	     (pos) = (pos)->next)
577
/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @member:	the name of the hlist_node within the struct.
 *
 * Cursor arguments are parenthesized at every use.
 */
#define hlist_for_each_entry_from(tpos, pos, member)			 \
	for (; (pos) &&							 \
		({ (tpos) = hlist_entry(pos, typeof(*(tpos)), member); 1;}); \
	     (pos) = (pos)->next)
588
/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @n:		another &struct hlist_node to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 *
 * Cursor arguments are parenthesized at every use.
 */
#define hlist_for_each_entry_safe(tpos, pos, n, head, member)		 \
	for ((pos) = (head)->first;					 \
	     (pos) && ({ (n) = (pos)->next; 1; }) &&			 \
		({ (tpos) = hlist_entry(pos, typeof(*(tpos)), member); 1;}); \
	     (pos) = (n))
602
603#endif
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c
new file mode 100644
index 000000000000..a28bccae5458
--- /dev/null
+++ b/tools/perf/util/pager.c
@@ -0,0 +1,99 @@
1#include "cache.h"
2#include "run-command.h"
3#include "sigchain.h"
4
5/*
6 * This is split up from the rest of git so that we can do
7 * something different on Windows.
8 */
9
/* Set to 1 by setup_pager() once it decides to page; read by pager_in_use(). */
10static int spawned_pager;
11
#ifndef __MINGW32__
/*
 * Hook installed as pager_process.preexec_cb by setup_pager(): waits until
 * fd 0 is readable (or has an exceptional condition), then sets a default
 * for $LESS without overriding an existing value.
 */
static void pager_preexec(void)
{
	fd_set ready;

	/* "less" has a bug when started without input, so wait for real data */
	FD_ZERO(&ready);
	FD_SET(0, &ready);
	select(1, &ready, NULL, &ready, NULL);

	setenv("LESS", "FRSX", 0);
}
#endif
28
/* Command line for the pager: slot 2 is filled with the pager command by setup_pager(). */
29static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
/* Child-process descriptor for the pager; started in setup_pager(), reaped in wait_for_pager(). */
30static struct child_process pager_process;
31
32static void wait_for_pager(void)
33{
34 fflush(stdout);
35 fflush(stderr);
36 /* signal EOF to pager */
37 close(1);
38 close(2);
39 finish_command(&pager_process);
40}
41
/*
 * Signal handler registered by setup_pager(): let the pager finish first,
 * then pop this handler via sigchain_pop() and re-raise @signo.
 */
42static void wait_for_pager_signal(int signo)
43{
44 wait_for_pager();
45 sigchain_pop(signo);
46 raise(signo);
47}
48
49void setup_pager(void)
50{
51 const char *pager = getenv("PERF_PAGER");
52
53 if (!isatty(1))
54 return;
55 if (!pager) {
56 if (!pager_program)
57 perf_config(perf_default_config, NULL);
58 pager = pager_program;
59 }
60 if (!pager)
61 pager = getenv("PAGER");
62 if (!pager)
63 pager = "less";
64 else if (!*pager || !strcmp(pager, "cat"))
65 return;
66
67 spawned_pager = 1; /* means we are emitting to terminal */
68
69 /* spawn the pager */
70 pager_argv[2] = pager;
71 pager_process.argv = pager_argv;
72 pager_process.in = -1;
73#ifndef __MINGW32__
74 pager_process.preexec_cb = pager_preexec;
75#endif
76 if (start_command(&pager_process))
77 return;
78
79 /* original process continues, but writes to the pipe */
80 dup2(pager_process.in, 1);
81 if (isatty(2))
82 dup2(pager_process.in, 2);
83 close(pager_process.in);
84
85 /* this makes sure that the parent terminates after the pager */
86 sigchain_push_common(wait_for_pager_signal);
87 atexit(wait_for_pager);
88}
89
90int pager_in_use(void)
91{
92 const char *env;
93
94 if (spawned_pager)
95 return 1;
96
97 env = getenv("PERF_PAGER_IN_USE");
98 return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
99}
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
new file mode 100644
index 000000000000..9d5f1ca50e6f
--- /dev/null
+++ b/tools/perf/util/parse-events.c
@@ -0,0 +1,316 @@
1
2#include "../perf.h"
3#include "util.h"
4#include "parse-options.h"
5#include "parse-events.h"
6#include "exec_cmd.h"
7#include "string.h"
8
9extern char *strcasestr(const char *haystack, const char *needle);
10
/* Number of entries of attrs[] filled in so far by parse_events(). */
11int nr_counters;
12
/* Parsed counter descriptions, indexed by counter slot. */
13struct perf_counter_attr attrs[MAX_COUNTERS];
14
/* One symbolic event name together with the type/config pair it selects. */
15struct event_symbol {
16 __u8 type;
17 __u64 config;
18 char *symbol;
19};
20
/* Designated-initializer shorthands: C() pastes PERF_COUNT_ onto @y, CR() uses @y as-is. */
21#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
22#define CR(x, y) .type = PERF_TYPE_##x, .config = y
23
/*
 * Symbolic names accepted by parse_event_symbols() and listed by
 * print_events().  Several names may map to the same event.  Matching is
 * by prefix and takes the first hit, so table order matters.
 */
24static struct event_symbol event_symbols[] = {
25 { C(HARDWARE, HW_CPU_CYCLES), "cpu-cycles", },
26 { C(HARDWARE, HW_CPU_CYCLES), "cycles", },
27 { C(HARDWARE, HW_INSTRUCTIONS), "instructions", },
28 { C(HARDWARE, HW_CACHE_REFERENCES), "cache-references", },
29 { C(HARDWARE, HW_CACHE_MISSES), "cache-misses", },
30 { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions", },
31 { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches", },
32 { C(HARDWARE, HW_BRANCH_MISSES), "branch-misses", },
33 { C(HARDWARE, HW_BUS_CYCLES), "bus-cycles", },
34
35 { C(SOFTWARE, SW_CPU_CLOCK), "cpu-clock", },
36 { C(SOFTWARE, SW_TASK_CLOCK), "task-clock", },
37 { C(SOFTWARE, SW_PAGE_FAULTS), "page-faults", },
38 { C(SOFTWARE, SW_PAGE_FAULTS), "faults", },
39 { C(SOFTWARE, SW_PAGE_FAULTS_MIN), "minor-faults", },
40 { C(SOFTWARE, SW_PAGE_FAULTS_MAJ), "major-faults", },
41 { C(SOFTWARE, SW_CONTEXT_SWITCHES), "context-switches", },
42 { C(SOFTWARE, SW_CONTEXT_SWITCHES), "cs", },
43 { C(SOFTWARE, SW_CPU_MIGRATIONS), "cpu-migrations", },
44 { C(SOFTWARE, SW_CPU_MIGRATIONS), "migrations", },
45};
46
/*
 * Bit-field extractors: mask @config with PERF_COUNTER_<name>_MASK and shift
 * it down by PERF_COUNTER_<name>_SHIFT.
 * NOTE(review): nothing in the visible part of this file uses these macros,
 * and the _MASK/_SHIFT constants are not defined here - confirm they exist.
 */
47#define __PERF_COUNTER_FIELD(config, name) \
48 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
49
50#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
51#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
52#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
53#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
54
/* Display names for PERF_TYPE_HARDWARE events; event_name() indexes this by attr.config. */
55static char *hw_event_names[] = {
56 "cycles",
57 "instructions",
58 "cache-references",
59 "cache-misses",
60 "branches",
61 "branch-misses",
62 "bus-cycles",
63};
64
/* Display names for PERF_TYPE_SOFTWARE events; event_name() indexes this by attr.config. */
65static char *sw_event_names[] = {
66 "cpu-clock-ticks",
67 "task-clock-ticks",
68 "page-faults",
69 "context-switches",
70 "CPU-migrations",
71 "minor-faults",
72 "major-faults",
73};
74
/* Upper bound on the number of names per row in the alias tables below. */
75#define MAX_ALIASES 8
76
/*
 * Cache-type aliases.  The row index is the cache-type id stored in bits 0-7
 * of attr.config; the first name in each row is the one event_name() prints.
 * Unused slots are NULL.
 */
77static char *hw_cache [][MAX_ALIASES] = {
78 { "L1-data" , "l1-d", "l1d", "l1" },
79 { "L1-instruction" , "l1-i", "l1i" },
80 { "L2" , "l2" },
81 { "Data-TLB" , "dtlb", "d-tlb" },
82 { "Instruction-TLB" , "itlb", "i-tlb" },
83 { "Branch" , "bpu" , "btb", "bpc" },
84};
85
/* Cache-operation aliases; the row index is stored in bits 8-15 of attr.config. */
86static char *hw_cache_op [][MAX_ALIASES] = {
87 { "Load" , "read" },
88 { "Store" , "write" },
89 { "Prefetch" , "speculative-read", "speculative-load" },
90};
91
/* Cache-result aliases; the row index is stored in bits 16-23 of attr.config. */
92static char *hw_cache_result [][MAX_ALIASES] = {
93 { "Reference" , "ops", "access" },
94 { "Miss" },
95};
96
97char *event_name(int counter)
98{
99 __u64 config = attrs[counter].config;
100 int type = attrs[counter].type;
101 static char buf[32];
102
103 if (attrs[counter].type == PERF_TYPE_RAW) {
104 sprintf(buf, "raw 0x%llx", config);
105 return buf;
106 }
107
108 switch (type) {
109 case PERF_TYPE_HARDWARE:
110 if (config < PERF_COUNT_HW_MAX)
111 return hw_event_names[config];
112 return "unknown-hardware";
113
114 case PERF_TYPE_HW_CACHE: {
115 __u8 cache_type, cache_op, cache_result;
116 static char name[100];
117
118 cache_type = (config >> 0) & 0xff;
119 if (cache_type > PERF_COUNT_HW_CACHE_MAX)
120 return "unknown-ext-hardware-cache-type";
121
122 cache_op = (config >> 8) & 0xff;
123 if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
124 return "unknown-ext-hardware-cache-op";
125
126 cache_result = (config >> 16) & 0xff;
127 if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
128 return "unknown-ext-hardware-cache-result";
129
130 sprintf(name, "%s-Cache-%s-%ses",
131 hw_cache[cache_type][0],
132 hw_cache_op[cache_op][0],
133 hw_cache_result[cache_result][0]);
134
135 return name;
136 }
137
138 case PERF_TYPE_SOFTWARE:
139 if (config < PERF_COUNT_SW_MAX)
140 return sw_event_names[config];
141 return "unknown-software";
142
143 default:
144 break;
145 }
146
147 return "unknown";
148}
149
150static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
151{
152 int i, j;
153
154 for (i = 0; i < size; i++) {
155 for (j = 0; j < MAX_ALIASES; j++) {
156 if (!names[i][j])
157 break;
158 if (strcasestr(str, names[i][j]))
159 return i;
160 }
161 }
162
163 return -1;
164}
165
166static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
167{
168 int cache_type = -1, cache_op = 0, cache_result = 0;
169
170 cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
171 /*
172 * No fallback - if we cannot get a clear cache type
173 * then bail out:
174 */
175 if (cache_type == -1)
176 return -EINVAL;
177
178 cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
179 /*
180 * Fall back to reads:
181 */
182 if (cache_op == -1)
183 cache_op = PERF_COUNT_HW_CACHE_OP_READ;
184
185 cache_result = parse_aliases(str, hw_cache_result,
186 PERF_COUNT_HW_CACHE_RESULT_MAX);
187 /*
188 * Fall back to accesses:
189 */
190 if (cache_result == -1)
191 cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
192
193 attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
194 attr->type = PERF_TYPE_HW_CACHE;
195
196 return 0;
197}
198
199/*
200 * Each event can have multiple symbolic names.
201 * Symbolic names are (almost) exactly matched.
202 */
203static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
204{
205 __u64 config, id;
206 int type;
207 unsigned int i;
208 const char *sep, *pstr;
209
210 if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
211 attr->type = PERF_TYPE_RAW;
212 attr->config = config;
213
214 return 0;
215 }
216
217 pstr = str;
218 sep = strchr(pstr, ':');
219 if (sep) {
220 type = atoi(pstr);
221 pstr = sep + 1;
222 id = atoi(pstr);
223 sep = strchr(pstr, ':');
224 if (sep) {
225 pstr = sep + 1;
226 if (strchr(pstr, 'k'))
227 attr->exclude_user = 1;
228 if (strchr(pstr, 'u'))
229 attr->exclude_kernel = 1;
230 }
231 attr->type = type;
232 attr->config = id;
233
234 return 0;
235 }
236
237 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
238 if (!strncmp(str, event_symbols[i].symbol,
239 strlen(event_symbols[i].symbol))) {
240
241 attr->type = event_symbols[i].type;
242 attr->config = event_symbols[i].config;
243
244 return 0;
245 }
246 }
247
248 return parse_generic_hw_symbols(str, attr);
249}
250
251int parse_events(const struct option *opt, const char *str, int unset)
252{
253 struct perf_counter_attr attr;
254 int ret;
255
256 memset(&attr, 0, sizeof(attr));
257again:
258 if (nr_counters == MAX_COUNTERS)
259 return -1;
260
261 ret = parse_event_symbols(str, &attr);
262 if (ret < 0)
263 return ret;
264
265 attrs[nr_counters] = attr;
266 nr_counters++;
267
268 str = strstr(str, ",");
269 if (str) {
270 str++;
271 goto again;
272 }
273
274 return 0;
275}
276
/*
 * Labels printed by print_events(), indexed by (event type + 1); index 0 is
 * the empty fallback used for types without a label.
 */
277static const char * const event_type_descriptors[] = {
278 "",
279 "Hardware event",
280 "Software event",
281 "Tracepoint event",
282 "Hardware cache event",
283};
284
285/*
286 * Print the help text for the event symbols:
287 */
288void print_events(void)
289{
290 struct event_symbol *syms = event_symbols;
291 unsigned int i, type, prev_type = -1;
292
293 fprintf(stderr, "\n");
294 fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
295
296 for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
297 type = syms->type + 1;
298 if (type > ARRAY_SIZE(event_type_descriptors))
299 type = 0;
300
301 if (type != prev_type)
302 fprintf(stderr, "\n");
303
304 fprintf(stderr, " %-30s [%s]\n", syms->symbol,
305 event_type_descriptors[type]);
306
307 prev_type = type;
308 }
309
310 fprintf(stderr, "\n");
311 fprintf(stderr, " %-30s [raw hardware event descriptor]\n",
312 "rNNN");
313 fprintf(stderr, "\n");
314
315 exit(129);
316}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
new file mode 100644
index 000000000000..e3d552908e60
--- /dev/null
+++ b/tools/perf/util/parse-events.h
@@ -0,0 +1,17 @@
1
2/*
3 * Parse symbolic events/counts passed in as options:
4 */
5
6extern int nr_counters;
7
8extern struct perf_counter_attr attrs[MAX_COUNTERS];
9
10extern char *event_name(int ctr);
11
12extern int parse_events(const struct option *opt, const char *str, int unset);
13
14#define EVENTS_HELP_MAX (128*1024)
15
16extern void print_events(void);
17
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
new file mode 100644
index 000000000000..b3affb1658d2
--- /dev/null
+++ b/tools/perf/util/parse-options.c
@@ -0,0 +1,508 @@
1#include "util.h"
2#include "parse-options.h"
3#include "cache.h"
4
/* Internal flag bits: option given as a short switch / given in negated form. */
5#define OPT_SHORT 1
6#define OPT_UNSET 2
7
8static int opterror(const struct option *opt, const char *reason, int flags)
9{
10 if (flags & OPT_SHORT)
11 return error("switch `%c' %s", opt->short_name, reason);
12 if (flags & OPT_UNSET)
13 return error("option `no-%s' %s", opt->long_name, reason);
14 return error("option `%s' %s", opt->long_name, reason);
15}
16
17static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
18 int flags, const char **arg)
19{
20 if (p->opt) {
21 *arg = p->opt;
22 p->opt = NULL;
23 } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
24 *arg = (const char *)opt->defval;
25 } else if (p->argc > 1) {
26 p->argc--;
27 *arg = *++p->argv;
28 } else
29 return opterror(opt, "requires a value", flags);
30 return 0;
31}
32
/*
 * Apply option @opt to the variable it points at, according to opt->type.
 *
 * @flags carries OPT_SHORT and/or OPT_UNSET.  Negated options reset their
 * target (0 / NULL) or invoke the callback with unset=1.  Returns 0 on
 * success; on failure returns -1 or the result of opterror().
 */
33static int get_value(struct parse_opt_ctx_t *p,
34 const struct option *opt, int flags)
35{
36 const char *s, *arg = NULL;
37 const int unset = flags & OPT_UNSET;
38
39 if (unset && p->opt)
40 return opterror(opt, "takes no value", flags);
41 if (unset && (opt->flags & PARSE_OPT_NONEG))
42 return opterror(opt, "isn't available", flags);
43
/* a long option written as --name=value must be of a type that accepts a value */
44 if (!(flags & OPT_SHORT) && p->opt) {
45 switch (opt->type) {
46 case OPTION_CALLBACK:
47 if (!(opt->flags & PARSE_OPT_NOARG))
48 break;
49 /* FALLTHROUGH */
50 case OPTION_BOOLEAN:
51 case OPTION_BIT:
52 case OPTION_SET_INT:
53 case OPTION_SET_PTR:
54 return opterror(opt, "takes no value", flags);
55 default:
56 break;
57 }
58 }
59
60 switch (opt->type) {
61 case OPTION_BIT:
62 if (unset)
63 *(int *)opt->value &= ~opt->defval;
64 else
65 *(int *)opt->value |= opt->defval;
66 return 0;
67
68 case OPTION_BOOLEAN:
/* acts as a counter: each occurrence increments the target */
69 *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
70 return 0;
71
72 case OPTION_SET_INT:
73 *(int *)opt->value = unset ? 0 : opt->defval;
74 return 0;
75
76 case OPTION_SET_PTR:
77 *(void **)opt->value = unset ? NULL : (void *)opt->defval;
78 return 0;
79
80 case OPTION_STRING:
81 if (unset)
82 *(const char **)opt->value = NULL;
83 else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
84 *(const char **)opt->value = (const char *)opt->defval;
85 else
86 return get_arg(p, opt, flags, (const char **)opt->value);
87 return 0;
88
89 case OPTION_CALLBACK:
/* any non-zero callback result is normalised to -1 */
90 if (unset)
91 return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
92 if (opt->flags & PARSE_OPT_NOARG)
93 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
94 if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
95 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
96 if (get_arg(p, opt, flags, &arg))
97 return -1;
98 return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
99
100 case OPTION_INTEGER:
101 if (unset) {
102 *(int *)opt->value = 0;
103 return 0;
104 }
105 if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
106 *(int *)opt->value = opt->defval;
107 return 0;
108 }
109 if (get_arg(p, opt, flags, &arg))
110 return -1;
/* the target is written before validation; trailing non-digits are rejected below */
111 *(int *)opt->value = strtol(arg, (char **)&s, 10);
112 if (*s)
113 return opterror(opt, "expects a numerical value", flags);
114 return 0;
115
116 case OPTION_LONG:
117 if (unset) {
118 *(long *)opt->value = 0;
119 return 0;
120 }
121 if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
122 *(long *)opt->value = opt->defval;
123 return 0;
124 }
125 if (get_arg(p, opt, flags, &arg))
126 return -1;
127 *(long *)opt->value = strtol(arg, (char **)&s, 10);
128 if (*s)
129 return opterror(opt, "expects a numerical value", flags);
130 return 0;
131
132 default:
133 die("should not happen, someone must be hit on the forehead");
134 }
135}
136
137static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
138{
139 for (; options->type != OPTION_END; options++) {
140 if (options->short_name == *p->opt) {
141 p->opt = p->opt[1] ? p->opt + 1 : NULL;
142 return get_value(p, options, OPT_SHORT);
143 }
144 }
145 return -2;
146}
147
/*
 * Handle a long option; @arg is the text after the leading "--".
 *
 * Exact matches (optionally followed by "=value", or prefixed with "no-")
 * are applied immediately via get_value().  Abbreviations are remembered
 * and applied after the loop if exactly one candidate was seen; two
 * candidates produce an "Ambiguous option" error.  Returns -2 when nothing
 * matches, otherwise the result of get_value()/error()/opterror().
 */
148static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
149 const struct option *options)
150{
151 const char *arg_end = strchr(arg, '=');
152 const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
153 int abbrev_flags = 0, ambiguous_flags = 0;
154
/* arg_end marks the end of the option name: the '=' or the terminating NUL */
155 if (!arg_end)
156 arg_end = arg + strlen(arg);
157
158 for (; options->type != OPTION_END; options++) {
159 const char *rest;
160 int flags = 0;
161
162 if (!options->long_name)
163 continue;
164
165 rest = skip_prefix(arg, options->long_name);
166 if (options->type == OPTION_ARGUMENT) {
167 if (!rest)
168 continue;
169 if (*rest == '=')
170 return opterror(options, "takes no value", flags);
171 if (*rest)
172 continue;
/* pass the whole argument through, including its leading "--" */
173 p->out[p->cpidx++] = arg - 2;
174 return 0;
175 }
176 if (!rest) {
177 /* abbreviated? */
178 if (!strncmp(options->long_name, arg, arg_end - arg)) {
179is_abbreviated:
180 if (abbrev_option) {
181 /*
182 * If this is abbreviated, it is
183 * ambiguous. So when there is no
184 * exact match later, we need to
185 * error out.
186 */
187 ambiguous_option = abbrev_option;
188 ambiguous_flags = abbrev_flags;
189 }
190 if (!(flags & OPT_UNSET) && *arg_end)
191 p->opt = arg_end + 1;
192 abbrev_option = options;
193 abbrev_flags = flags;
194 continue;
195 }
196 /* negated and abbreviated very much? */
197 if (!prefixcmp("no-", arg)) {
198 flags |= OPT_UNSET;
199 goto is_abbreviated;
200 }
201 /* negated? */
202 if (strncmp(arg, "no-", 3))
203 continue;
204 flags |= OPT_UNSET;
205 rest = skip_prefix(arg + 3, options->long_name);
206 /* abbreviated and negated? */
207 if (!rest && !prefixcmp(options->long_name, arg + 3))
208 goto is_abbreviated;
209 if (!rest)
210 continue;
211 }
/* exact name match: anything that follows must be "=value" */
212 if (*rest) {
213 if (*rest != '=')
214 continue;
215 p->opt = rest + 1;
216 }
217 return get_value(p, options, flags);
218 }
219
220 if (ambiguous_option)
221 return error("Ambiguous option: %s "
222 "(could be --%s%s or --%s%s)",
223 arg,
224 (ambiguous_flags & OPT_UNSET) ? "no-" : "",
225 ambiguous_option->long_name,
226 (abbrev_flags & OPT_UNSET) ? "no-" : "",
227 abbrev_option->long_name);
228 if (abbrev_option)
229 return get_value(p, abbrev_option, abbrev_flags);
230 return -2;
231}
232
233static void check_typos(const char *arg, const struct option *options)
234{
235 if (strlen(arg) < 3)
236 return;
237
238 if (!prefixcmp(arg, "no-")) {
239 error ("did you mean `--%s` (with two dashes ?)", arg);
240 exit(129);
241 }
242
243 for (; options->type != OPTION_END; options++) {
244 if (!options->long_name)
245 continue;
246 if (!prefixcmp(options->long_name, arg)) {
247 error ("did you mean `--%s` (with two dashes ?)", arg);
248 exit(129);
249 }
250 }
251}
252
253void parse_options_start(struct parse_opt_ctx_t *ctx,
254 int argc, const char **argv, int flags)
255{
256 memset(ctx, 0, sizeof(*ctx));
257 ctx->argc = argc - 1;
258 ctx->argv = argv + 1;
259 ctx->out = argv;
260 ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
261 ctx->flags = flags;
262 if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
263 (flags & PARSE_OPT_STOP_AT_NON_OPTION))
264 die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
265}
266
267static int usage_with_options_internal(const char * const *,
268 const struct option *, int);
269
/*
 * Consume arguments from @ctx until they run out, a "--" is seen, a
 * non-option is seen with PARSE_OPT_STOP_AT_NON_OPTION, or an error occurs.
 *
 * Non-option arguments (and unknown options when PARSE_OPT_KEEP_UNKNOWN is
 * set) are copied to ctx->out.  Returns PARSE_OPT_DONE on normal completion,
 * PARSE_OPT_UNKNOWN for an unknown option that is not kept, or the result of
 * the usage helpers when help is requested or an option fails to parse.
 */
270int parse_options_step(struct parse_opt_ctx_t *ctx,
271 const struct option *options,
272 const char * const usagestr[])
273{
274 int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
275
276 /* we must reset ->opt, unknown short option leave it dangling */
277 ctx->opt = NULL;
278
279 for (; ctx->argc; ctx->argc--, ctx->argv++) {
280 const char *arg = ctx->argv[0];
281
/* not an option: anything without a leading '-', and a lone "-" */
282 if (*arg != '-' || !arg[1]) {
283 if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
284 break;
285 ctx->out[ctx->cpidx++] = ctx->argv[0];
286 continue;
287 }
288
/* short switches, possibly several aggregated behind one dash */
289 if (arg[1] != '-') {
290 ctx->opt = arg + 1;
291 if (internal_help && *ctx->opt == 'h')
292 return parse_options_usage(usagestr, options);
293 switch (parse_short_opt(ctx, options)) {
294 case -1:
295 return parse_options_usage(usagestr, options);
296 case -2:
297 goto unknown;
298 }
299 if (ctx->opt)
300 check_typos(arg + 1, options);
301 while (ctx->opt) {
302 if (internal_help && *ctx->opt == 'h')
303 return parse_options_usage(usagestr, options);
304 switch (parse_short_opt(ctx, options)) {
305 case -1:
306 return parse_options_usage(usagestr, options);
307 case -2:
308 /* fake a short option thing to hide the fact that we may have
309 * started to parse aggregated stuff
310 *
311 * This is leaky, too bad.
312 */
313 ctx->argv[0] = strdup(ctx->opt - 1);
314 *(char *)ctx->argv[0] = '-';
315 goto unknown;
316 }
317 }
318 continue;
319 }
320
321 if (!arg[2]) { /* "--" */
322 if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
323 ctx->argc--;
324 ctx->argv++;
325 }
326 break;
327 }
328
329 if (internal_help && !strcmp(arg + 2, "help-all"))
330 return usage_with_options_internal(usagestr, options, 1);
331 if (internal_help && !strcmp(arg + 2, "help"))
332 return parse_options_usage(usagestr, options);
333 switch (parse_long_opt(ctx, arg + 2, options)) {
334 case -1:
335 return parse_options_usage(usagestr, options);
336 case -2:
337 goto unknown;
338 }
339 continue;
/* reached only by goto: the current argument is not a known option */
340unknown:
341 if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
342 return PARSE_OPT_UNKNOWN;
343 ctx->out[ctx->cpidx++] = ctx->argv[0];
344 ctx->opt = NULL;
345 }
346 return PARSE_OPT_DONE;
347}
348
349int parse_options_end(struct parse_opt_ctx_t *ctx)
350{
351 memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
352 ctx->out[ctx->cpidx + ctx->argc] = NULL;
353 return ctx->cpidx + ctx->argc;
354}
355
356int parse_options(int argc, const char **argv, const struct option *options,
357 const char * const usagestr[], int flags)
358{
359 struct parse_opt_ctx_t ctx;
360
361 parse_options_start(&ctx, argc, argv, flags);
362 switch (parse_options_step(&ctx, options, usagestr)) {
363 case PARSE_OPT_HELP:
364 exit(129);
365 case PARSE_OPT_DONE:
366 break;
367 default: /* PARSE_OPT_UNKNOWN */
368 if (ctx.argv[0][1] == '-') {
369 error("unknown option `%s'", ctx.argv[0] + 2);
370 } else {
371 error("unknown switch `%c'", *ctx.opt);
372 }
373 usage_with_options(usagestr, options);
374 }
375
376 return parse_options_end(&ctx);
377}
378
/* Layout of the option list: width reserved for the option column, and the gap before the help text. */
379#define USAGE_OPTS_WIDTH 24
380#define USAGE_GAP 2
381
/*
 * Print the usage lines of @usagestr and one line per option in @opts to
 * stderr.  Options flagged PARSE_OPT_HIDDEN are listed only when @full is
 * non-zero.  Always returns PARSE_OPT_HELP; prints nothing when @usagestr
 * is NULL.
 */
382int usage_with_options_internal(const char * const *usagestr,
383 const struct option *opts, int full)
384{
385 if (!usagestr)
386 return PARSE_OPT_HELP;
387
/* first string is the main usage line; the following non-empty ones are alternatives */
388 fprintf(stderr, "\n usage: %s\n", *usagestr++);
389 while (*usagestr && **usagestr)
390 fprintf(stderr, " or: %s\n", *usagestr++);
/* everything up to the NULL terminator is printed as free-form text */
391 while (*usagestr) {
392 fprintf(stderr, "%s%s\n",
393 **usagestr ? " " : "",
394 *usagestr);
395 usagestr++;
396 }
397
398 if (opts->type != OPTION_GROUP)
399 fputc('\n', stderr);
400
401 for (; opts->type != OPTION_END; opts++) {
402 size_t pos;
403 int pad;
404
405 if (opts->type == OPTION_GROUP) {
406 fputc('\n', stderr);
407 if (*opts->help)
408 fprintf(stderr, "%s\n", opts->help);
409 continue;
410 }
411 if (!full && (opts->flags & PARSE_OPT_HIDDEN))
412 continue;
413
/* pos accumulates the number of characters printed for the option column */
414 pos = fprintf(stderr, " ");
415 if (opts->short_name)
416 pos += fprintf(stderr, "-%c", opts->short_name);
417 if (opts->long_name && opts->short_name)
418 pos += fprintf(stderr, ", ");
419 if (opts->long_name)
420 pos += fprintf(stderr, "--%s", opts->long_name);
421
422 switch (opts->type) {
423 case OPTION_ARGUMENT:
424 break;
425 case OPTION_INTEGER:
426 if (opts->flags & PARSE_OPT_OPTARG)
427 if (opts->long_name)
428 pos += fprintf(stderr, "[=<n>]");
429 else
430 pos += fprintf(stderr, "[<n>]");
431 else
432 pos += fprintf(stderr, " <n>");
433 break;
434 case OPTION_CALLBACK:
435 if (opts->flags & PARSE_OPT_NOARG)
436 break;
437 /* FALLTHROUGH */
438 case OPTION_STRING:
439 if (opts->argh) {
440 if (opts->flags & PARSE_OPT_OPTARG)
441 if (opts->long_name)
442 pos += fprintf(stderr, "[=<%s>]", opts->argh);
443 else
444 pos += fprintf(stderr, "[<%s>]", opts->argh);
445 else
446 pos += fprintf(stderr, " <%s>", opts->argh);
447 } else {
448 if (opts->flags & PARSE_OPT_OPTARG)
449 if (opts->long_name)
450 pos += fprintf(stderr, "[=...]");
451 else
452 pos += fprintf(stderr, "[...]");
453 else
454 pos += fprintf(stderr, " ...");
455 }
456 break;
457 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
458 break;
459 }
460
/* pad to the help column, or start a new line when the option text is too wide */
461 if (pos <= USAGE_OPTS_WIDTH)
462 pad = USAGE_OPTS_WIDTH - pos;
463 else {
464 fputc('\n', stderr);
465 pad = USAGE_OPTS_WIDTH;
466 }
467 fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
468 }
469 fputc('\n', stderr);
470
471 return PARSE_OPT_HELP;
472}
473
/* Print the short usage (hidden options omitted) and exit with status 129. */
474void usage_with_options(const char * const *usagestr,
475 const struct option *opts)
476{
477 usage_with_options_internal(usagestr, opts, 0);
478 exit(129);
479}
480
/* Print the short usage (hidden options omitted) and return its result instead of exiting. */
481int parse_options_usage(const char * const *usagestr,
482 const struct option *opts)
483{
484 return usage_with_options_internal(usagestr, opts, 0);
485}
486
487
488int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
489 int unset)
490{
491 int *target = opt->value;
492
493 if (unset)
494 /* --no-quiet, --no-verbose */
495 *target = 0;
496 else if (opt->short_name == 'v') {
497 if (*target >= 0)
498 (*target)++;
499 else
500 *target = 1;
501 } else {
502 if (*target <= 0)
503 (*target)--;
504 else
505 *target = -1;
506 }
507 return 0;
508}
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
new file mode 100644
index 000000000000..a1039a6ce0eb
--- /dev/null
+++ b/tools/perf/util/parse-options.h
@@ -0,0 +1,174 @@
1#ifndef PARSE_OPTIONS_H
2#define PARSE_OPTIONS_H
3
4enum parse_opt_type {
5 /* special types */
6 OPTION_END,
7 OPTION_ARGUMENT,
8 OPTION_GROUP,
9 /* options with no arguments */
10 OPTION_BIT,
11 OPTION_BOOLEAN, /* _INCR would have been a better name */
12 OPTION_SET_INT,
13 OPTION_SET_PTR,
14 /* options with arguments (usually) */
15 OPTION_STRING,
16 OPTION_INTEGER,
17 OPTION_LONG,
18 OPTION_CALLBACK,
19};
20
21enum parse_opt_flags {
22 PARSE_OPT_KEEP_DASHDASH = 1,
23 PARSE_OPT_STOP_AT_NON_OPTION = 2,
24 PARSE_OPT_KEEP_ARGV0 = 4,
25 PARSE_OPT_KEEP_UNKNOWN = 8,
26 PARSE_OPT_NO_INTERNAL_HELP = 16,
27};
28
29enum parse_opt_option_flags {
30 PARSE_OPT_OPTARG = 1,
31 PARSE_OPT_NOARG = 2,
32 PARSE_OPT_NONEG = 4,
33 PARSE_OPT_HIDDEN = 8,
34 PARSE_OPT_LASTARG_DEFAULT = 16,
35};
36
37struct option;
38typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
39
40/*
41 * `type`::
42 * holds the type of the option, you must have an OPTION_END last in your
43 * array.
44 *
45 * `short_name`::
46 * the character to use as a short option name, '\0' if none.
47 *
48 * `long_name`::
49 * the long option name, without the leading dashes, NULL if none.
50 *
51 * `value`::
52 * stores pointers to the values to be filled.
53 *
54 * `argh`::
55 * token to explain the kind of argument this option wants. Keep it
56 * homogenous across the repository.
57 *
58 * `help`::
59 * the short help associated to what the option does.
60 * Must never be NULL (except for OPTION_END).
61 * OPTION_GROUP uses this pointer to store the group header.
62 *
63 * `flags`::
64 * mask of parse_opt_option_flags.
65 * PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
66 * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
67 * PARSE_OPT_NONEG: says that this option cannot be negated
68 * PARSE_OPT_HIDDEN: this option is skipped in the default usage, shown in
69 * the long one.
70 *
71 * `callback`::
72 * pointer to the callback to use for OPTION_CALLBACK.
73 *
74 * `defval`::
75 * default value to fill (*->value) with for PARSE_OPT_OPTARG.
76 * OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
77 * the value when met.
78 * CALLBACKS can use it like they want.
79 */
80struct option {
81 enum parse_opt_type type;
82 int short_name;
83 const char *long_name;
84 void *value;
85 const char *argh;
86 const char *help;
87
88 int flags;
89 parse_opt_cb *callback;
90 intptr_t defval;
91};
92
93#define OPT_END() { OPTION_END }
94#define OPT_ARGUMENT(l, h) { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
95#define OPT_GROUP(h) { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
96#define OPT_BIT(s, l, v, h, b) { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
97#define OPT_BOOLEAN(s, l, v, h) { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
98#define OPT_SET_INT(s, l, v, h, i) { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
99#define OPT_SET_PTR(s, l, v, h, p) { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
100#define OPT_INTEGER(s, l, v, h) { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
101#define OPT_LONG(s, l, v, h) { OPTION_LONG, (s), (l), (v), NULL, (h) }
102#define OPT_STRING(s, l, v, a, h) { OPTION_STRING, (s), (l), (v), (a), (h) }
103#define OPT_DATE(s, l, v, h) \
104 { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
105 parse_opt_approxidate_cb }
106#define OPT_CALLBACK(s, l, v, a, h, f) \
107 { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
108
109/* parse_options() will filter out the processed options and leave the
110 * non-option arguments in argv[].
111 * Returns the number of arguments left in argv[].
112 */
113extern int parse_options(int argc, const char **argv,
114 const struct option *options,
115 const char * const usagestr[], int flags);
116
117extern NORETURN void usage_with_options(const char * const *usagestr,
118 const struct option *options);
119
120/*----- incremental advanced APIs -----*/
121
122enum {
123 PARSE_OPT_HELP = -1,
124 PARSE_OPT_DONE,
125 PARSE_OPT_UNKNOWN,
126};
127
128/*
129 * It's okay for the caller to consume argv/argc in the usual way.
130 * Other fields of that structure are private to parse-options and should not
131 * be modified in any way.
132 */
133struct parse_opt_ctx_t {
134 const char **argv;
135 const char **out;
136 int argc, cpidx;
137 const char *opt;
138 int flags;
139};
140
141extern int parse_options_usage(const char * const *usagestr,
142 const struct option *opts);
143
144extern void parse_options_start(struct parse_opt_ctx_t *ctx,
145 int argc, const char **argv, int flags);
146
147extern int parse_options_step(struct parse_opt_ctx_t *ctx,
148 const struct option *options,
149 const char * const usagestr[]);
150
151extern int parse_options_end(struct parse_opt_ctx_t *ctx);
152
153
154/*----- some often used options -----*/
155extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
156extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
157extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
158
159#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose")
160#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet")
161#define OPT__VERBOSITY(var) \
162 { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
163 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
164 { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
165 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
166#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run")
167#define OPT__ABBREV(var) \
168 { OPTION_CALLBACK, 0, "abbrev", (var), "n", \
169 "use <n> digits to display SHA-1s", \
170 PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
171
172extern const char *parse_options_fix_filename(const char *prefix, const char *file);
173
174#endif
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c
new file mode 100644
index 000000000000..a501a40dd2cb
--- /dev/null
+++ b/tools/perf/util/path.c
@@ -0,0 +1,353 @@
1/*
2 * I'm tired of doing "vsnprintf()" etc just to open a
3 * file, so here's a "return static buffer with printf"
4 * interface for paths.
5 *
6 * It's obviously not thread-safe. Sue me. But it's quite
7 * useful for doing things like
8 *
9 * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
10 *
11 * which is what it's designed for.
12 */
13#include "cache.h"
14
15static char bad_path[] = "/bad-path/";
16/*
17 * Two hacks:
18 */
19
/*
 * Directory that perf_vsnpath() and perf_path() prepend to the paths they
 * build. Currently hard-wired to "."; the returned string is a literal
 * and must not be modified.
 */
static char *get_perf_dir(void)
{
	return ".";
}
24
/*
 * Bounded string copy.
 *
 * Copies as much of src as fits into a destination of `size` bytes and
 * always NUL-terminates the result when size is non-zero. With size == 0
 * dest is left untouched.
 *
 * Returns strlen(src), so a return value >= size tells the caller the
 * copy was truncated.
 */
size_t strlcpy(char *dest, const char *src, size_t size)
{
	const size_t src_len = strlen(src);
	size_t ncopy;

	if (size == 0)
		return src_len;

	ncopy = (src_len < size) ? src_len : size - 1;
	memcpy(dest, src, ncopy);
	dest[ncopy] = '\0';

	return src_len;
}
36
37
/*
 * Hand out one of four static PATH_MAX-sized buffers in round-robin
 * order, so up to four results can be alive at the same time before a
 * buffer is reused.
 *
 * The slot counter is unsigned: it is incremented on every call for the
 * lifetime of the process, and unsigned wrap-around is well defined
 * whereas signed overflow is not. It is also no longer called `index`,
 * which shadowed the libc function of that name.
 */
static char *get_pathname(void)
{
	static char pathname_array[4][PATH_MAX];
	static unsigned int next_slot;

	return pathname_array[3 & ++next_slot];
}
44
/*
 * Skip a leading "./" (and any further '/' characters right after it)
 * and return a pointer to the remainder. The buffer itself is not
 * modified; the result points into `path`.
 *
 * strncmp() is used instead of memcmp() so that the comparison stops at
 * the terminating NUL: memcmp(path, "./", 2) is allowed to read two bytes
 * even when `path` is an empty or one-character string.
 */
static char *cleanup_path(char *path)
{
	/* Clean it up */
	if (!strncmp(path, "./", 2)) {
		path += 2;
		while (*path == '/')
			path++;
	}
	return path;
}
55
56char *mksnpath(char *buf, size_t n, const char *fmt, ...)
57{
58 va_list args;
59 unsigned len;
60
61 va_start(args, fmt);
62 len = vsnprintf(buf, n, fmt, args);
63 va_end(args);
64 if (len >= n) {
65 strlcpy(buf, bad_path, n);
66 return buf;
67 }
68 return cleanup_path(buf);
69}
70
71static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
72{
73 const char *perf_dir = get_perf_dir();
74 size_t len;
75
76 len = strlen(perf_dir);
77 if (n < len + 1)
78 goto bad;
79 memcpy(buf, perf_dir, len);
80 if (len && !is_dir_sep(perf_dir[len-1]))
81 buf[len++] = '/';
82 len += vsnprintf(buf + len, n - len, fmt, args);
83 if (len >= n)
84 goto bad;
85 return cleanup_path(buf);
86bad:
87 strlcpy(buf, bad_path, n);
88 return buf;
89}
90
/*
 * Varargs front end for perf_vsnpath(): formats a perf-dir based path
 * into buf (capacity n).
 *
 * Note that the pointer returned by perf_vsnpath() is deliberately
 * discarded and buf itself is returned, so the result always starts at
 * the beginning of the caller's buffer (any leading "./" is still there).
 */
char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
{
	va_list args;
	va_start(args, fmt);
	(void)perf_vsnpath(buf, n, fmt, args);
	va_end(args);
	return buf;
}
99
/*
 * Format a perf-dir based path into a local PATH_MAX buffer via
 * perf_vsnpath() and return a duplicate made with xstrdup().
 * Presumably the caller owns (and must free) the returned string --
 * confirm against xstrdup()'s definition.
 */
char *perf_pathdup(const char *fmt, ...)
{
	char path[PATH_MAX];
	va_list args;
	va_start(args, fmt);
	(void)perf_vsnpath(path, sizeof(path), fmt, args);
	va_end(args);
	return xstrdup(path);
}
109
110char *mkpath(const char *fmt, ...)
111{
112 va_list args;
113 unsigned len;
114 char *pathname = get_pathname();
115
116 va_start(args, fmt);
117 len = vsnprintf(pathname, PATH_MAX, fmt, args);
118 va_end(args);
119 if (len >= PATH_MAX)
120 return bad_path;
121 return cleanup_path(pathname);
122}
123
124char *perf_path(const char *fmt, ...)
125{
126 const char *perf_dir = get_perf_dir();
127 char *pathname = get_pathname();
128 va_list args;
129 unsigned len;
130
131 len = strlen(perf_dir);
132 if (len > PATH_MAX-100)
133 return bad_path;
134 memcpy(pathname, perf_dir, len);
135 if (len && perf_dir[len-1] != '/')
136 pathname[len++] = '/';
137 va_start(args, fmt);
138 len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
139 va_end(args);
140 if (len >= PATH_MAX)
141 return bad_path;
142 return cleanup_path(pathname);
143}
144
145
146/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
/*
 * Create a temporary file in $TMPDIR (or /tmp when TMPDIR is unset).
 *
 * "<dir>/<template>" is formatted into path (capacity len) and passed to
 * mkstemp(), whose return value is returned. If the name does not fit,
 * errno is set to ENAMETOOLONG and -1 is returned without creating
 * anything.
 */
int perf_mkstemp(char *path, size_t len, const char *template)
{
	const char *dir = getenv("TMPDIR");
	size_t needed;

	if (dir == NULL)
		dir = "/tmp";

	needed = snprintf(path, len, "%s/%s", dir, template);
	if (needed < len)
		return mkstemp(path);

	errno = ENAMETOOLONG;
	return -1;
}
162
163
/*
 * If `abs` lies below directory `base`, return the part of `abs` after
 * the base (copied into a static buffer, so the result is overwritten by
 * the next call). In every other case `abs` itself is returned:
 *   - base is NULL,
 *   - abs does not start with base,
 *   - base matches only part of a path component ("/a/b" vs "/a/bc"),
 *   - the remainder would not fit in the static buffer.
 *
 * Compared with the previous version this
 *   - no longer reads base[-1] when base is the empty string,
 *   - no longer overflows the static buffer on over-long input
 *     (the unbounded strcpy() is now guarded by a length check),
 *   - tests the prefix with strncmp() on the already computed length,
 *     which accepts exactly the same inputs as the prefix comparison.
 */
const char *make_relative_path(const char *abs, const char *base)
{
	static char buf[PATH_MAX + 1];
	size_t baselen;
	const char *rest;

	if (!base)
		return abs;
	baselen = strlen(base);
	if (strncmp(abs, base, baselen))
		return abs;
	if (abs[baselen] == '/')
		baselen++;
	else if (!baselen || base[baselen - 1] != '/')
		return abs;
	rest = abs + baselen;
	if (strlen(rest) >= sizeof(buf))
		return abs;	/* cannot be represented in buf */
	strcpy(buf, rest);
	return buf;
}
180
/*
 * It is okay if dst == src, but they should not overlap otherwise.
 *
 * Performs the following normalizations on src, storing the result in dst:
 * - Ensures that components are separated by '/' (Windows only)
 * - Squashes sequences of '/'.
 * - Removes "." components.
 * - Removes ".." components, and the components that precede them.
 * Returns failure (-1) if a ".." component appears as first path
 * component anytime during the normalization. Otherwise, returns success (0).
 * On failure dst is left without a terminating NUL being written.
 *
 * Note that this function is purely textual. It does not follow symlinks,
 * verify the existence of the path, or make any system calls.
 */
int normalize_path_copy(char *dst, const char *src)
{
	char *dst0;

	/* A two-character drive prefix is copied verbatim ... */
	if (has_dos_drive_prefix(src)) {
		*dst++ = *src++;
		*dst++ = *src++;
	}
	/* ... and ".." is never allowed to rewind before this point. */
	dst0 = dst;

	/* Leading separators collapse into a single '/'. */
	if (is_dir_sep(*src)) {
		*dst++ = '/';
		while (is_dir_sep(*src))
			src++;
	}

	for (;;) {
		char c = *src;

		/*
		 * A path component that begins with . could be
		 * special:
		 * (1) "." and ends -- ignore and terminate.
		 * (2) "./" -- ignore them, eat slash and continue.
		 * (3) ".." and ends -- strip one and terminate.
		 * (4) "../" -- strip one, eat slash and continue.
		 */
		if (c == '.') {
			if (!src[1]) {
				/* (1) */
				/* src now sits on the NUL; the copy loop below ends the walk */
				src++;
			} else if (is_dir_sep(src[1])) {
				/* (2) */
				src += 2;
				while (is_dir_sep(*src))
					src++;
				continue;
			} else if (src[1] == '.') {
				if (!src[2]) {
					/* (3) */
					src += 2;
					goto up_one;
				} else if (is_dir_sep(src[2])) {
					/* (4) */
					src += 3;
					while (is_dir_sep(*src))
						src++;
					goto up_one;
				}
			}
			/* anything else (".foo", "..bar", ...) is an ordinary name */
		}

		/* copy up to the next '/', and eat all '/' */
		while ((c = *src++) != '\0' && !is_dir_sep(c))
			*dst++ = c;
		if (is_dir_sep(c)) {
			*dst++ = '/';
			while (is_dir_sep(c))
				c = *src++;
			/* step back onto the first character after the separators */
			src--;
		} else if (!c)
			break;
		continue;

	up_one:
		/*
		 * dst0..dst is prefix portion, and dst[-1] is '/';
		 * go up one level.
		 */
		dst--;	/* go to trailing '/' */
		if (dst <= dst0)
			return -1;	/* nothing left to strip */
		/* Windows: dst[-1] cannot be backslash anymore */
		while (dst0 < dst && dst[-1] != '/')
			dst--;
	}
	*dst = '\0';
	return 0;
}
274
275/*
276 * path = Canonical absolute path
277 * prefix_list = Colon-separated list of absolute paths
278 *
279 * Determines, for each path in prefix_list, whether the "prefix" really
280 * is an ancestor directory of path. Returns the length of the longest
281 * ancestor directory, excluding any trailing slashes, or -1 if no prefix
282 * is an ancestor. (Note that this means 0 is returned if prefix_list is
283 * "/".) "/foo" is not considered an ancestor of "/foobar". Directories
284 * are not considered to be their own ancestors. path must be in a
285 * canonical form: empty components, or "." or ".." components are not
286 * allowed. prefix_list may be null, which is like "".
287 */
int longest_ancestor_length(const char *path, const char *prefix_list)
{
	char buf[PATH_MAX+1];
	const char *ceil, *colon;
	int len, max_len = -1;

	/* "/" has no ancestors; a NULL list has no candidates */
	if (prefix_list == NULL || !strcmp(path, "/"))
		return -1;

	/* ceil..colon delimits one PATH_SEP-separated entry per iteration */
	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
		len = colon - ceil;
		/* skip empty, over-long (would not fit buf) and relative entries */
		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
			continue;
		/* len+1 so that strlcpy() copies exactly len bytes plus the NUL */
		strlcpy(buf, ceil, len+1);
		if (normalize_path_copy(buf, buf) < 0)
			continue;
		len = strlen(buf);
		/* drop one trailing slash so the path[len] == '/' test below works */
		if (len > 0 && buf[len-1] == '/')
			buf[--len] = '\0';

		/*
		 * buf is an ancestor only if it matches a whole leading run of
		 * components of path, i.e. the match is followed by '/'.
		 */
		if (!strncmp(path, buf, len) &&
		    path[len] == '/' &&
		    len > max_len) {
			max_len = len;
		}
	}

	return max_len;
}
318
319/* strip arbitrary amount of directory separators at end of path */
static inline int chomp_trailing_dir_sep(const char *path, int len)
{
	int end;

	/* walk backwards over every separator at the end of path[0..len) */
	for (end = len; end && is_dir_sep(path[end - 1]); end--)
		;

	return end;
}
326
327/*
328 * If path ends with suffix (complete path components), returns the
329 * part before suffix (sans trailing directory separators).
330 * Otherwise returns NULL.
331 */
char *strip_path_suffix(const char *path, const char *suffix)
{
	int path_len = strlen(path), suffix_len = strlen(suffix);

	/* Compare both strings from their ends towards their starts. */
	while (suffix_len) {
		if (!path_len)
			return NULL;	/* path ran out before suffix did */

		if (is_dir_sep(path[path_len - 1])) {
			/* a separator in path must be matched by one in suffix ... */
			if (!is_dir_sep(suffix[suffix_len - 1]))
				return NULL;
			/* ... and runs of separators on either side compare equal */
			path_len = chomp_trailing_dir_sep(path, path_len);
			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
		}
		else if (path[--path_len] != suffix[--suffix_len])
			return NULL;
	}

	/* the suffix must start at a component boundary of path */
	if (path_len && !is_dir_sep(path[path_len - 1]))
		return NULL;
	/* result is produced by xstrndup(); presumably caller-owned -- confirm */
	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
}
diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c
new file mode 100644
index 000000000000..f18c5212bc92
--- /dev/null
+++ b/tools/perf/util/quote.c
@@ -0,0 +1,481 @@
1#include "cache.h"
2#include "quote.h"
3
4int quote_path_fully = 1;
5
6/* Help to copy the thing properly quoted for the shell safety.
7 * any single quote is replaced with '\'', any exclamation point
8 * is replaced with '\!', and the whole thing is enclosed in a
9 * single quote pair.
10 * E.g.
11 * original sq_quote result
12 * name ==> name ==> 'name'
13 * a b ==> a b ==> 'a b'
14 * a'b ==> a'\''b ==> 'a'\''b'
15 * a!b ==> a'\!'b ==> 'a'\!'b'
16 */
/*
 * Returns 1 for the two characters that cannot appear inside a
 * single-quoted shell word as-is (single quote and '!'), 0 otherwise.
 */
static inline int need_bs_quote(char c)
{
	switch (c) {
	case '\'':
	case '!':
		return 1;
	default:
		return 0;
	}
}
21
/*
 * Append the shell-quoted form of src to dst: the text is wrapped in
 * single quotes, and every ' or ! is emitted as '\'' or '\!' (close the
 * quote, backslash-escape the character, reopen the quote).
 *
 * src may be dst's own buffer; see the aliasing note below.
 */
void sq_quote_buf(struct strbuf *dst, const char *src)
{
	char *to_free = NULL;

	/*
	 * Quoting a strbuf into itself: take the old buffer away from dst
	 * so src stays readable while dst is rebuilt, and free it at the end.
	 * (Relies on strbuf_detach() leaving dst empty and usable -- confirm.)
	 */
	if (dst->buf == src)
		to_free = strbuf_detach(dst, NULL);

	strbuf_addch(dst, '\'');
	while (*src) {
		/* copy the run of characters that need no escaping in one go */
		size_t len = strcspn(src, "'!");
		strbuf_add(dst, src, len);
		src += len;
		/* then escape each special character that follows it */
		while (need_bs_quote(*src)) {
			strbuf_addstr(dst, "'\\");
			strbuf_addch(dst, *src++);
			strbuf_addch(dst, '\'');
		}
	}
	strbuf_addch(dst, '\'');
	free(to_free);
}
43
/*
 * Write the shell-quoted form of src to stream: the text is wrapped in
 * single quotes and every ' or ! is written as '\'' or '\!'.
 */
void sq_quote_print(FILE *stream, const char *src)
{
	const char *p;

	fputc('\'', stream);
	for (p = src; *p; p++) {
		if (!need_bs_quote(*p)) {
			fputc(*p, stream);
			continue;
		}
		/* close the quote, escape the character, reopen the quote */
		fputs("'\\", stream);
		fputc(*p, stream);
		fputc('\'', stream);
	}
	fputc('\'', stream);
}
60
61void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
62{
63 int i;
64
65 /* Copy into destination buffer. */
66 strbuf_grow(dst, 255);
67 for (i = 0; argv[i]; ++i) {
68 strbuf_addch(dst, ' ');
69 sq_quote_buf(dst, argv[i]);
70 if (maxlen && dst->len > maxlen)
71 die("Too many or long arguments");
72 }
73}
74
/*
 * Undo sq_quote-style quoting of the first word in arg, in place.
 *
 * arg must start with a single quote. The unquoted text is written back
 * over arg (it can only get shorter) and NUL-terminated.
 *
 * Returns arg on success, NULL if the input is malformed (arg may have
 * been partially overwritten by then).
 *
 * If `next` is non-NULL, a quoted word may be followed by whitespace and
 * further words: *next is set to the start of the following word, or to
 * NULL when the string ended. If `next` is NULL, anything after the
 * closing quote is an error.
 */
char *sq_dequote_step(char *arg, char **next)
{
	char *dst = arg;
	char *src = arg;
	char c;

	if (*src != '\'')
		return NULL;
	for (;;) {
		c = *++src;
		if (!c)
			return NULL;	/* unterminated quote */
		if (c != '\'') {
			*dst++ = c;
			continue;
		}
		/* We stepped out of sq */
		switch (*++src) {
		case '\0':
			/* closing quote was the last character: done */
			*dst = 0;
			if (next)
				*next = NULL;
			return arg;
		case '\\':
			/* expect \' or \! followed by a reopening quote */
			c = *++src;
			if (need_bs_quote(c) && *++src == '\'') {
				*dst++ = c;
				continue;
			}
		/* Fallthrough */
		default:
			/* only whitespace may separate this word from the next */
			if (!next || !isspace(*src))
				return NULL;
			do {
				c = *++src;
			} while (isspace(c));
			*dst = 0;
			*next = src;
			return arg;
		}
	}
}
117
/*
 * Dequote a string that consists of exactly one quoted word, in place.
 * Same as sq_dequote_step() with next == NULL, so trailing text after the
 * closing quote makes it return NULL.
 */
char *sq_dequote(char *arg)
{
	return sq_dequote_step(arg, NULL);
}
122
/*
 * Split a string of whitespace-separated quoted words into an argv-style
 * array.
 *
 * Each word is dequoted in place with sq_dequote_step() and a pointer to
 * it is appended to *argv; *nr and *alloc are updated through
 * ALLOC_GROW. The stored pointers point into `arg`.
 *
 * Returns 0 on success (including an empty input string, which appends
 * nothing) and -1 as soon as a word fails to dequote; words appended
 * before the failure stay in the array.
 */
int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
{
	char *next = arg;

	if (!*arg)
		return 0;
	do {
		char *dequoted = sq_dequote_step(next, &next);
		if (!dequoted)
			return -1;
		ALLOC_GROW(*argv, *nr + 1, *alloc);
		(*argv)[(*nr)++] = dequoted;
	} while (next);	/* sq_dequote_step() sets next to NULL at end of input */

	return 0;
}
139
140/* 1 means: quote as octal
141 * 0 means: quote as octal if (quote_path_fully)
142 * -1 means: never quote
143 * c: quote as "\\c"
144 */
/* Repeat one initializer 8 / 16 times to keep the table below compact. */
#define X8(x)   x, x, x, x, x, x, x, x
#define X16(x)  X8(x), X8(x)
/*
 * Per-byte quoting class, indexed by (unsigned char). Only the first 128
 * entries are spelled out; the remaining ones (0x80-0xff) are
 * zero-initialized.
 */
static signed char const sq_lookup[256] = {
	/*           0    1    2    3    4    5    6    7 */
	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
	/* 0x10 */ X16(1),
	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
	/* 0x28 */ X16(-1), X16(-1), X16(-1),
	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
	/* 0x60 */ X16(-1), X8(-1),
	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
	/* 0x80 */ /* set to 0 */
};
159
/*
 * Non-zero when byte c has to be escaped in C-style quoting.
 *
 * Table entries > 0 always qualify and entries of -1 never do. Entries
 * of 0 (the bytes 0x80-0xff) qualify only while quote_path_fully is
 * positive, because the two values are simply added together.
 */
static inline int sq_must_quote(char c)
{
	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
}
164
165/* returns the longest prefix not needing a quote up to maxlen if positive.
166 This stops at the first \0 because it's marked as a character needing an
167 escape */
static size_t next_quote_pos(const char *s, ssize_t maxlen)
{
	size_t pos = 0;

	if (maxlen < 0) {
		/* unbounded: the NUL terminator itself counts as "must quote" */
		while (!sq_must_quote(s[pos]))
			pos++;
		return pos;
	}

	/* bounded: additionally stop after maxlen bytes */
	while (pos < maxlen && !sq_must_quote(s[pos]))
		pos++;
	return pos;
}
178
179/*
180 * C-style name quoting.
181 *
182 * (1) if sb and fp are both NULL, inspect the input name and counts the
183 * number of bytes that are needed to hold c_style quoted version of name,
184 * counting the double quotes around it but not terminating NUL, and
185 * returns it.
186 * However, if name does not need c_style quoting, it returns 0.
187 *
188 * (2) if sb or fp are not NULL, it emits the c_style quoted version
189 * of name, enclosed with double quotes if asked and needed only.
190 * Return value is the same as in (1).
191 */
192static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
193 struct strbuf *sb, FILE *fp, int no_dq)
194{
195#undef EMIT
196#define EMIT(c) \
197 do { \
198 if (sb) strbuf_addch(sb, (c)); \
199 if (fp) fputc((c), fp); \
200 count++; \
201 } while (0)
202#define EMITBUF(s, l) \
203 do { \
204 int __ret; \
205 if (sb) strbuf_add(sb, (s), (l)); \
206 if (fp) __ret = fwrite((s), (l), 1, fp); \
207 count += (l); \
208 } while (0)
209
210 size_t len, count = 0;
211 const char *p = name;
212
213 for (;;) {
214 int ch;
215
216 len = next_quote_pos(p, maxlen);
217 if (len == maxlen || !p[len])
218 break;
219
220 if (!no_dq && p == name)
221 EMIT('"');
222
223 EMITBUF(p, len);
224 EMIT('\\');
225 p += len;
226 ch = (unsigned char)*p++;
227 if (sq_lookup[ch] >= ' ') {
228 EMIT(sq_lookup[ch]);
229 } else {
230 EMIT(((ch >> 6) & 03) + '0');
231 EMIT(((ch >> 3) & 07) + '0');
232 EMIT(((ch >> 0) & 07) + '0');
233 }
234 }
235
236 EMITBUF(p, len);
237 if (p == name) /* no ending quote needed */
238 return 0;
239
240 if (!no_dq)
241 EMIT('"');
242 return count;
243}
244
/*
 * C-style quote a NUL-terminated name: quote_c_style_counted() with
 * maxlen == -1 (no length limit). sb and/or fp receive the output when
 * non-NULL; nodq suppresses the surrounding double quotes.
 */
size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
{
	return quote_c_style_counted(name, -1, sb, fp, nodq);
}
249
/*
 * Append prefix immediately followed by path to sb, C-style quoting the
 * pair as one unit.
 *
 * If neither part needs quoting both are appended verbatim. Otherwise
 * both are appended in escaped form, wrapped in one pair of double
 * quotes unless nodq is set.
 */
void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
{
	/* dry run (no sinks): a non-zero result means escaping is required */
	int must_quote = quote_c_style(prefix, NULL, NULL, 0) ||
			 quote_c_style(path, NULL, NULL, 0);

	if (!must_quote) {
		strbuf_addstr(sb, prefix);
		strbuf_addstr(sb, path);
		return;
	}

	if (!nodq)
		strbuf_addch(sb, '"');
	quote_c_style(prefix, sb, NULL, 1);
	quote_c_style(path, sb, NULL, 1);
	if (!nodq)
		strbuf_addch(sb, '"');
}
265
/*
 * Write name to fp followed by the terminator byte.
 *
 * With a non-zero terminator the name goes through quote_c_style(), so
 * it is quoted if it needs to be. With terminator == 0 the name is
 * written raw and a NUL byte follows it.
 */
void write_name_quoted(const char *name, FILE *fp, int terminator)
{
	if (terminator) {
		quote_c_style(name, NULL, fp, 0);
	} else {
		fputs(name, fp);
	}
	fputc(terminator, fp);
}
275
/*
 * Write the first pfxlen bytes of pfx immediately followed by name to
 * fp, then the terminator byte.
 *
 * With a non-zero terminator, the pair is C-style quoted as one unit
 * (a single pair of double quotes) if either part contains a byte that
 * needs escaping. With terminator == 0 everything is written raw.
 */
extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
				 const char *name, FILE *fp, int terminator)
{
	int needquote = 0;

	if (terminator)
		needquote = next_quote_pos(pfx, pfxlen) < pfxlen ||
			    name[next_quote_pos(name, -1)];

	if (!needquote) {
		int ret;

		/* result intentionally unused, as before */
		ret = fwrite(pfx, pfxlen, 1, fp);
		fputs(name, fp);
	} else {
		fputc('"', fp);
		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
		quote_c_style(name, NULL, fp, 1);
		fputc('"', fp);
	}
	fputc(terminator, fp);
}
298
/*
 * quote path as relative to the given prefix
 *
 * `in` holds `len` bytes (len < 0: NUL-terminated). `out` is reset and
 * then filled with the result; out->buf is returned.
 *
 * Leading directory components shared by `in` and `prefix` are dropped,
 * one "../" is emitted for every '/' left in prefix, and the rest of
 * `in` is appended C-style escaped. The whole result is wrapped in
 * double quotes if `in` needed escaping. An empty result becomes "./".
 */
char *quote_path_relative(const char *in, int len,
			  struct strbuf *out, const char *prefix)
{
	int needquote;

	if (len < 0)
		len = strlen(in);

	/* "../" prefix itself does not need quoting, but "in" might. */
	needquote = next_quote_pos(in, len) < len;
	strbuf_setlen(out, 0);
	strbuf_grow(out, len);

	if (needquote)
		strbuf_addch(out, '"');
	if (prefix) {
		int off = 0;
		/*
		 * Walk the common prefix; every time a matching '/' is
		 * reached, commit the component by advancing both strings
		 * past it. A partially matching last component is not
		 * stripped.
		 */
		while (prefix[off] && off < len && prefix[off] == in[off])
			if (prefix[off] == '/') {
				prefix += off + 1;
				in += off + 1;
				len -= off + 1;
				off = 0;
			} else
				off++;

		/* one level up for each directory left in prefix */
		for (; *prefix; prefix++)
			if (*prefix == '/')
				strbuf_addstr(out, "../");
	}

	/* no_dq == 1: the surrounding quotes are handled here */
	quote_c_style_counted (in, len, out, NULL, 1);

	if (needquote)
		strbuf_addch(out, '"');
	if (!out->len)
		strbuf_addstr(out, "./");

	return out->buf;
}
340
341/*
342 * C-style name unquoting.
343 *
344 * Quoted should point at the opening double quote.
345 * + Returns 0 if it was able to unquote the string properly, and appends the
346 * result in the strbuf `sb'.
347 * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
348 * that this function will allocate memory in the strbuf, so calling
349 * strbuf_release is mandatory whichever result unquote_c_style returns.
350 *
351 * Updates endp pointer to point at one past the ending double quote if given.
352 */
int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
{
	/* remember the original length so the error path can roll back */
	size_t oldlen = sb->len, len;
	int ch, ac;

	if (*quoted++ != '"')
		return -1;

	for (;;) {
		/* copy the run up to the next quote or backslash verbatim */
		len = strcspn(quoted, "\"\\");
		strbuf_add(sb, quoted, len);
		quoted += len;

		switch (*quoted++) {
		case '"':
			/* closing quote: quoted now points one past it */
			if (endp)
				*endp = quoted;
			return 0;
		case '\\':
			break;
		default:
			/* ran into the terminating NUL before a closing quote */
			goto error;
		}

		/* decode the character following the backslash */
		switch ((ch = *quoted++)) {
		case 'a': ch = '\a'; break;
		case 'b': ch = '\b'; break;
		case 'f': ch = '\f'; break;
		case 'n': ch = '\n'; break;
		case 'r': ch = '\r'; break;
		case 't': ch = '\t'; break;
		case 'v': ch = '\v'; break;

		case '\\': case '"':
			break;	/* verbatim */

		/* octal values with first digit over 4 overflow */
		case '0': case '1': case '2': case '3':
					/* exactly three octal digits are required */
					ac = ((ch - '0') << 6);
			if ((ch = *quoted++) < '0' || '7' < ch)
				goto error;
					ac |= ((ch - '0') << 3);
			if ((ch = *quoted++) < '0' || '7' < ch)
				goto error;
					ac |= (ch - '0');
					ch = ac;
					break;
		default:
			goto error;
		}
		strbuf_addch(sb, ch);
	}

  error:
	/* drop everything appended so far */
	strbuf_setlen(sb, oldlen);
	return -1;
}
410
411/* quoting as a string literal for other languages */
412
/*
 * Write src to stream as a single-quoted Perl string literal:
 * single quotes and backslashes are preceded by a backslash, everything
 * else is written unchanged.
 */
void perl_quote_print(FILE *stream, const char *src)
{
	const char *p;

	fputc('\'', stream);
	for (p = src; *p != '\0'; p++) {
		switch (*p) {
		case '\'':
		case '\\':
			fputc('\\', stream);
			break;
		default:
			break;
		}
		fputc(*p, stream);
	}
	fputc('\'', stream);
}
427
/*
 * Write src to stream as a single-quoted Python string literal:
 * a newline becomes the two characters \n, single quotes and backslashes
 * are preceded by a backslash, everything else is written unchanged.
 */
void python_quote_print(FILE *stream, const char *src)
{
	const char *p;

	fputc('\'', stream);
	for (p = src; *p != '\0'; p++) {
		switch (*p) {
		case '\n':
			fputc('\\', stream);
			fputc('n', stream);
			break;
		case '\'':
		case '\\':
			fputc('\\', stream);
			fputc(*p, stream);
			break;
		default:
			fputc(*p, stream);
			break;
		}
	}
	fputc('\'', stream);
}
448
/*
 * Write src to stream as a double-quoted Tcl string:
 *  - [ ] { } $ \ and " are preceded by a backslash,
 *  - form feed, carriage return, newline, tab and vertical tab are
 *    written as \f \r \n \t \v,
 *  - everything else is written unchanged.
 */
void tcl_quote_print(FILE *stream, const char *src)
{
	const char *p;

	fputc('"', stream);
	for (p = src; *p != '\0'; p++) {
		const char *named = NULL;	/* two-character escape, if any */

		switch (*p) {
		case '\f': named = "\\f"; break;
		case '\r': named = "\\r"; break;
		case '\n': named = "\\n"; break;
		case '\t': named = "\\t"; break;
		case '\v': named = "\\v"; break;
		case '[': case ']':
		case '{': case '}':
		case '$': case '\\': case '"':
			fputc('\\', stream);
			break;
		default:
			break;
		}

		if (named)
			fputs(named, stream);
		else
			fputc(*p, stream);
	}
	fputc('"', stream);
}
diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h
new file mode 100644
index 000000000000..5dfad89816db
--- /dev/null
+++ b/tools/perf/util/quote.h
@@ -0,0 +1,68 @@
1#ifndef QUOTE_H
2#define QUOTE_H
3
4#include <stddef.h>
5#include <stdio.h>
6
7/* Help to copy the thing properly quoted for the shell safety.
8 * any single quote is replaced with '\'', any exclamation point
9 * is replaced with '\!', and the whole thing is enclosed in a
10 * single quote pair.
11 *
12 * For example, if you are passing the result to system() as an
13 * argument:
14 *
15 * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
16 *
17 * would be appropriate. If the system() is going to call ssh to
18 * run the command on the other side:
19 *
20 * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
21 * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
22 *
23 * Note that the above examples leak memory! Remember to free result from
24 * sq_quote() in a real application.
25 *
26 * sq_quote_buf() appends the quoted form of src to the given strbuf;
27 * it returns nothing. sq_quote_argv() does the same for each element
28 * of a NULL-terminated argv, putting a space before every element.
29 */
30
31extern void sq_quote_print(FILE *stream, const char *src);
32
33extern void sq_quote_buf(struct strbuf *, const char *src);
34extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
35
36/* This unwraps what sq_quote() produces in place, but returns
37 * NULL if the input does not look like what sq_quote would have
38 * produced.
39 */
40extern char *sq_dequote(char *);
41
42/*
43 * Same as the above, but can be used to unwrap many arguments in the
44 * same string separated by space. "next" is changed to point to the
45 * next argument that should be passed as first parameter. When there
46 * is no more argument to be dequoted, "next" is updated to point to NULL.
47 */
48extern char *sq_dequote_step(char *arg, char **next);
49extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
50
51extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
52extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
53extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
54
55extern void write_name_quoted(const char *name, FILE *, int terminator);
56extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
57 const char *name, FILE *, int terminator);
58
59/* quote path as relative to the given prefix */
60char *quote_path_relative(const char *in, int len,
61 struct strbuf *out, const char *prefix);
62
63/* quoting as a string literal for other languages */
64extern void perl_quote_print(FILE *stream, const char *src);
65extern void python_quote_print(FILE *stream, const char *src);
66extern void tcl_quote_print(FILE *stream, const char *src);
67
68#endif
diff --git a/tools/perf/util/rbtree.c b/tools/perf/util/rbtree.c
new file mode 100644
index 000000000000..b15ba9c7cb3f
--- /dev/null
+++ b/tools/perf/util/rbtree.c
@@ -0,0 +1,383 @@
1/*
2 Red Black Trees
3 (C) 1999 Andrea Arcangeli <andrea@suse.de>
4 (C) 2002 David Woodhouse <dwmw2@infradead.org>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 linux/lib/rbtree.c
21*/
22
23#include "rbtree.h"
24
25static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
26{
27 struct rb_node *right = node->rb_right;
28 struct rb_node *parent = rb_parent(node);
29
30 if ((node->rb_right = right->rb_left))
31 rb_set_parent(right->rb_left, node);
32 right->rb_left = node;
33
34 rb_set_parent(right, parent);
35
36 if (parent)
37 {
38 if (node == parent->rb_left)
39 parent->rb_left = right;
40 else
41 parent->rb_right = right;
42 }
43 else
44 root->rb_node = right;
45 rb_set_parent(node, right);
46}
47
48static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
49{
50 struct rb_node *left = node->rb_left;
51 struct rb_node *parent = rb_parent(node);
52
53 if ((node->rb_left = left->rb_right))
54 rb_set_parent(left->rb_right, node);
55 left->rb_right = node;
56
57 rb_set_parent(left, parent);
58
59 if (parent)
60 {
61 if (node == parent->rb_right)
62 parent->rb_right = left;
63 else
64 parent->rb_left = left;
65 }
66 else
67 root->rb_node = left;
68 rb_set_parent(node, left);
69}
70
/*
 * Restore the red-black colouring after @node has been linked into the
 * tree as a red leaf (rb_link_node() stores the parent with the colour bit
 * clear, i.e. RB_RED).  Walks upward while the parent is red, recolouring
 * and rotating as needed, and finally forces the root black.
 */
71void rb_insert_color(struct rb_node *node, struct rb_root *root)
72{
73 struct rb_node *parent, *gparent;
74
75 while ((parent = rb_parent(node)) && rb_is_red(parent))
76 {
77 gparent = rb_parent(parent);
78
/* Parent is the grandparent's left child; the uncle is on the right. */
79 if (parent == gparent->rb_left)
80 {
81 {
82 register struct rb_node *uncle = gparent->rb_right;
/* Red uncle: recolour and continue the loop from the grandparent. */
83 if (uncle && rb_is_red(uncle))
84 {
85 rb_set_black(uncle);
86 rb_set_black(parent);
87 rb_set_red(gparent);
88 node = gparent;
89 continue;
90 }
91 }
92
/* Node is an inner (right) child: rotate it outward, then swap roles. */
93 if (parent->rb_right == node)
94 {
95 register struct rb_node *tmp;
96 __rb_rotate_left(parent, root);
97 tmp = parent;
98 parent = node;
99 node = tmp;
100 }
101
/* Outer child: recolour and rotate the grandparent the other way. */
102 rb_set_black(parent);
103 rb_set_red(gparent);
104 __rb_rotate_right(gparent, root);
105 } else {
/* Mirror image: parent is the right child, uncle is on the left. */
106 {
107 register struct rb_node *uncle = gparent->rb_left;
108 if (uncle && rb_is_red(uncle))
109 {
110 rb_set_black(uncle);
111 rb_set_black(parent);
112 rb_set_red(gparent);
113 node = gparent;
114 continue;
115 }
116 }
117
118 if (parent->rb_left == node)
119 {
120 register struct rb_node *tmp;
121 __rb_rotate_right(parent, root);
122 tmp = parent;
123 parent = node;
124 node = tmp;
125 }
126
127 rb_set_black(parent);
128 rb_set_red(gparent);
129 __rb_rotate_left(gparent, root);
130 }
131 }
132
/* The root is always coloured black on exit. */
133 rb_set_black(root->rb_node);
134}
135
/*
 * Rebalance after rb_erase() has unlinked a black node.  @node is the child
 * that took the removed node's place (may be NULL) and @parent is its
 * parent, passed separately because @node may be NULL.  Loops while @node
 * is NULL or black and is not the root; "other" is node's sibling.
 */
136static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
137 struct rb_root *root)
138{
139 struct rb_node *other;
140
141 while ((!node || rb_is_black(node)) && node != root->rb_node)
142 {
/* node is the left child; the sibling is on the right. */
143 if (parent->rb_left == node)
144 {
145 other = parent->rb_right;
/* Red sibling: rotate so that the new sibling is black. */
146 if (rb_is_red(other))
147 {
148 rb_set_black(other);
149 rb_set_red(parent);
150 __rb_rotate_left(parent, root);
151 other = parent->rb_right;
152 }
/* Sibling has no red child: colour it red and move one level up. */
153 if ((!other->rb_left || rb_is_black(other->rb_left)) &&
154 (!other->rb_right || rb_is_black(other->rb_right)))
155 {
156 rb_set_red(other);
157 node = parent;
158 parent = rb_parent(node);
159 }
160 else
161 {
/* Far (right) child is not red: rotate the sibling first. */
162 if (!other->rb_right || rb_is_black(other->rb_right))
163 {
164 rb_set_black(other->rb_left);
165 rb_set_red(other);
166 __rb_rotate_right(other, root);
167 other = parent->rb_right;
168 }
/* Final recolour + rotation; node is set to the root to end the loop. */
169 rb_set_color(other, rb_color(parent));
170 rb_set_black(parent);
171 rb_set_black(other->rb_right);
172 __rb_rotate_left(parent, root);
173 node = root->rb_node;
174 break;
175 }
176 }
177 else
178 {
/* Mirror image: node is the right child, sibling is on the left. */
179 other = parent->rb_left;
180 if (rb_is_red(other))
181 {
182 rb_set_black(other);
183 rb_set_red(parent);
184 __rb_rotate_right(parent, root);
185 other = parent->rb_left;
186 }
187 if ((!other->rb_left || rb_is_black(other->rb_left)) &&
188 (!other->rb_right || rb_is_black(other->rb_right)))
189 {
190 rb_set_red(other);
191 node = parent;
192 parent = rb_parent(node);
193 }
194 else
195 {
196 if (!other->rb_left || rb_is_black(other->rb_left))
197 {
198 rb_set_black(other->rb_right);
199 rb_set_red(other);
200 __rb_rotate_left(other, root);
201 other = parent->rb_left;
202 }
203 rb_set_color(other, rb_color(parent));
204 rb_set_black(parent);
205 rb_set_black(other->rb_left);
206 __rb_rotate_right(parent, root);
207 node = root->rb_node;
208 break;
209 }
210 }
211 }
/* Whatever node the loop stopped at (if any) ends up black. */
212 if (node)
213 rb_set_black(node);
214}
215
/*
 * Unlink @node from the tree rooted at @root and rebalance if needed.
 *
 * With at most one child, that child (possibly NULL) is spliced into
 * node's place.  With two children, the leftmost node of the right
 * subtree is unlinked from its position and takes over node's parent,
 * colour and children.  If the unlinked position was black,
 * __rb_erase_color() is called to repair the tree.
 */
216void rb_erase(struct rb_node *node, struct rb_root *root)
217{
218 struct rb_node *child, *parent;
219 int color;
220
221 if (!node->rb_left)
222 child = node->rb_right;
223 else if (!node->rb_right)
224 child = node->rb_left;
225 else
226 {
/* Two children: from here on "node" is the replacement, "old" the victim. */
227 struct rb_node *old = node, *left;
228
229 node = node->rb_right;
230 while ((left = node->rb_left) != NULL)
231 node = left;
232 child = node->rb_right;
233 parent = rb_parent(node);
234 color = rb_color(node);
235
236 if (child)
237 rb_set_parent(child, parent);
/* Replacement is old's direct right child: it becomes child's parent. */
238 if (parent == old) {
239 parent->rb_right = child;
240 parent = node;
241 } else
242 parent->rb_left = child;
243
/* The replacement inherits old's parent pointer, colour and children. */
244 node->rb_parent_color = old->rb_parent_color;
245 node->rb_right = old->rb_right;
246 node->rb_left = old->rb_left;
247
248 if (rb_parent(old))
249 {
250 if (rb_parent(old)->rb_left == old)
251 rb_parent(old)->rb_left = node;
252 else
253 rb_parent(old)->rb_right = node;
254 } else
255 root->rb_node = node;
256
/* old->rb_left is non-NULL in this branch; rb_right may have just been cleared. */
257 rb_set_parent(old->rb_left, node);
258 if (old->rb_right)
259 rb_set_parent(old->rb_right, node);
260 goto color;
261 }
262
/* Zero or one child: splice child directly into node's place. */
263 parent = rb_parent(node);
264 color = rb_color(node);
265
266 if (child)
267 rb_set_parent(child, parent);
268 if (parent)
269 {
270 if (parent->rb_left == node)
271 parent->rb_left = child;
272 else
273 parent->rb_right = child;
274 }
275 else
276 root->rb_node = child;
277
278 color:
/* "color" is the colour recorded for the position that was unlinked. */
279 if (color == RB_BLACK)
280 __rb_erase_color(child, parent, root);
281}
282
283/*
284 * This function returns the first node (in sort order) of the tree.
285 */
286struct rb_node *rb_first(const struct rb_root *root)
287{
288 struct rb_node *n;
289
290 n = root->rb_node;
291 if (!n)
292 return NULL;
293 while (n->rb_left)
294 n = n->rb_left;
295 return n;
296}
297
298struct rb_node *rb_last(const struct rb_root *root)
299{
300 struct rb_node *n;
301
302 n = root->rb_node;
303 if (!n)
304 return NULL;
305 while (n->rb_right)
306 n = n->rb_right;
307 return n;
308}
309
310struct rb_node *rb_next(const struct rb_node *node)
311{
312 struct rb_node *parent;
313
314 if (rb_parent(node) == node)
315 return NULL;
316
317 /* If we have a right-hand child, go down and then left as far
318 as we can. */
319 if (node->rb_right) {
320 node = node->rb_right;
321 while (node->rb_left)
322 node=node->rb_left;
323 return (struct rb_node *)node;
324 }
325
326 /* No right-hand children. Everything down and left is
327 smaller than us, so any 'next' node must be in the general
328 direction of our parent. Go up the tree; any time the
329 ancestor is a right-hand child of its parent, keep going
330 up. First time it's a left-hand child of its parent, said
331 parent is our 'next' node. */
332 while ((parent = rb_parent(node)) && node == parent->rb_right)
333 node = parent;
334
335 return parent;
336}
337
338struct rb_node *rb_prev(const struct rb_node *node)
339{
340 struct rb_node *parent;
341
342 if (rb_parent(node) == node)
343 return NULL;
344
345 /* If we have a left-hand child, go down and then right as far
346 as we can. */
347 if (node->rb_left) {
348 node = node->rb_left;
349 while (node->rb_right)
350 node=node->rb_right;
351 return (struct rb_node *)node;
352 }
353
354 /* No left-hand children. Go up till we find an ancestor which
355 is a right-hand child of its parent */
356 while ((parent = rb_parent(node)) && node == parent->rb_left)
357 node = parent;
358
359 return parent;
360}
361
362void rb_replace_node(struct rb_node *victim, struct rb_node *new,
363 struct rb_root *root)
364{
365 struct rb_node *parent = rb_parent(victim);
366
367 /* Set the surrounding nodes to point to the replacement */
368 if (parent) {
369 if (victim == parent->rb_left)
370 parent->rb_left = new;
371 else
372 parent->rb_right = new;
373 } else {
374 root->rb_node = new;
375 }
376 if (victim->rb_left)
377 rb_set_parent(victim->rb_left, new);
378 if (victim->rb_right)
379 rb_set_parent(victim->rb_right, new);
380
381 /* Copy the pointers/colour from the victim to the replacement */
382 *new = *victim;
383}
diff --git a/tools/perf/util/rbtree.h b/tools/perf/util/rbtree.h
new file mode 100644
index 000000000000..6bdc488a47fb
--- /dev/null
+++ b/tools/perf/util/rbtree.h
@@ -0,0 +1,171 @@
1/*
2 Red Black Trees
3 (C) 1999 Andrea Arcangeli <andrea@suse.de>
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
19 linux/include/linux/rbtree.h
20
21 To use rbtrees you'll have to implement your own insert and search cores.
22 This avoids the use of callbacks, which would drop performance dramatically.
23 I know it's not the cleanest way, but in C (not in C++) that is how to get
24 both performance and genericity...
25
26 Some examples of insert and search follow here. The search is a plain
27 normal search over an ordered tree. The insert instead must be implemented
28 in two steps: first the code must insert the element in
29 order as a red leaf in the tree, then the support library function
30 rb_insert_color() must be called. That function will do the
31 non-trivial work of rebalancing the rbtree if necessary.
32
33-----------------------------------------------------------------------
34static inline struct page * rb_search_page_cache(struct inode * inode,
35 unsigned long offset)
36{
37 struct rb_node * n = inode->i_rb_page_cache.rb_node;
38 struct page * page;
39
40 while (n)
41 {
42 page = rb_entry(n, struct page, rb_page_cache);
43
44 if (offset < page->offset)
45 n = n->rb_left;
46 else if (offset > page->offset)
47 n = n->rb_right;
48 else
49 return page;
50 }
51 return NULL;
52}
53
54static inline struct page * __rb_insert_page_cache(struct inode * inode,
55 unsigned long offset,
56 struct rb_node * node)
57{
58 struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
59 struct rb_node * parent = NULL;
60 struct page * page;
61
62 while (*p)
63 {
64 parent = *p;
65 page = rb_entry(parent, struct page, rb_page_cache);
66
67 if (offset < page->offset)
68 p = &(*p)->rb_left;
69 else if (offset > page->offset)
70 p = &(*p)->rb_right;
71 else
72 return page;
73 }
74
75 rb_link_node(node, parent, p);
76
77 return NULL;
78}
79
80static inline struct page * rb_insert_page_cache(struct inode * inode,
81 unsigned long offset,
82 struct rb_node * node)
83{
84 struct page * ret;
85 if ((ret = __rb_insert_page_cache(inode, offset, node)))
86 goto out;
87 rb_insert_color(node, &inode->i_rb_page_cache);
88 out:
89 return ret;
90}
91-----------------------------------------------------------------------
92*/
93
94#ifndef _LINUX_RBTREE_H
95#define _LINUX_RBTREE_H
96
97#include <stddef.h>
98
99/**
100 * container_of - cast a member of a structure out to the containing structure
101 * @ptr: the pointer to the member.
102 * @type: the type of the container struct this is embedded in.
103 * @member: the name of the member within the struct.
104 *
 * Implemented with a statement expression ({ ... }) and typeof.  The typed
 * temporary __mptr is initialised from @ptr using the member's own type,
 * and offsetof(type, member) is then subtracted from it as a char pointer.
105 */
106#define container_of(ptr, type, member) ({ \
107 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
108 (type *)( (char *)__mptr - offsetof(type,member) );})
109
/*
 * One tree node, meant to be embedded in the user's own structure (see
 * rb_entry()).  rb_parent_color holds both the parent pointer and the
 * node colour: rb_color() reads bit 0 and rb_parent() masks off the two
 * low bits to recover the pointer.
 */
110struct rb_node
111{
112 unsigned long rb_parent_color;
/* Colour values as stored in bit 0 of rb_parent_color. */
113#define RB_RED 0
114#define RB_BLACK 1
115 struct rb_node *rb_right;
116 struct rb_node *rb_left;
117} __attribute__((aligned(sizeof(long))));
118 /* The alignment might seem pointless, but allegedly CRIS needs it */
119
/* Handle for a whole tree: rb_node is the top node, NULL when empty (see RB_EMPTY_ROOT()). */
120struct rb_root
121{
122 struct rb_node *rb_node;
123};
124
125
/*
 * Accessors for the packed rb_parent_color word.
 *  rb_parent()    - parent pointer (the two low bits masked off)
 *  rb_color()     - bit 0: RB_RED (0) or RB_BLACK (1)
 *  rb_is_red()    - non-zero when the colour bit is clear
 *  rb_is_black()  - non-zero when the colour bit is set
 *  rb_set_red()   - clear the colour bit
 *  rb_set_black() - set the colour bit
 */
126#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3))
127#define rb_color(r) ((r)->rb_parent_color & 1)
128#define rb_is_red(r) (!rb_color(r))
129#define rb_is_black(r) rb_color(r)
130#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0)
131#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0)
132
133static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
134{
135 rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
136}
137static inline void rb_set_color(struct rb_node *rb, int color)
138{
139 rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
140}
141
/* Initialiser for an empty tree. */
#define RB_ROOT	(struct rb_root) { NULL, }
/* Map a struct rb_node pointer back to the structure that embeds it. */
#define	rb_entry(ptr, type, member) container_of(ptr, type, member)

/* True when the tree has no nodes. */
#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
/*
 * A node is marked "empty" by making it its own parent.  The argument is
 * parenthesised at every use so that expressions such as "c ? a : b" can
 * be passed safely.
 */
#define RB_EMPTY_NODE(node)	(rb_parent(node) == (node))
#define RB_CLEAR_NODE(node)	(rb_set_parent((node), (node)))
148
149extern void rb_insert_color(struct rb_node *, struct rb_root *);
150extern void rb_erase(struct rb_node *, struct rb_root *);
151
152/* Find logical next and previous nodes in a tree */
153extern struct rb_node *rb_next(const struct rb_node *);
154extern struct rb_node *rb_prev(const struct rb_node *);
155extern struct rb_node *rb_first(const struct rb_root *);
156extern struct rb_node *rb_last(const struct rb_root *);
157
158/* Fast replacement of a single node without remove/rebalance/add/rebalance */
159extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
160 struct rb_root *root);
161
162static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
163 struct rb_node ** rb_link)
164{
165 node->rb_parent_color = (unsigned long )parent;
166 node->rb_left = node->rb_right = NULL;
167
168 *rb_link = node;
169}
170
171#endif /* _LINUX_RBTREE_H */
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c
new file mode 100644
index 000000000000..b2f5e854f40a
--- /dev/null
+++ b/tools/perf/util/run-command.c
@@ -0,0 +1,395 @@
1#include "cache.h"
2#include "run-command.h"
3#include "exec_cmd.h"
4
/* Close both descriptors of a pipe pair; close() results are ignored. */
static inline void close_pair(int fd[2])
{
	int i;

	for (i = 0; i < 2; i++)
		close(fd[i]);
}
10
/*
 * Make descriptor @to refer to /dev/null (opened read/write).
 *
 * Best effort: if /dev/null cannot be opened, @to is left untouched.
 * If open() happens to return @to itself (possible when @to was closed
 * beforehand), the descriptor is already in place and must not be closed
 * again, otherwise @to would end up closed.
 */
static inline void dup_devnull(int to)
{
	int fd = open("/dev/null", O_RDWR);

	if (fd < 0)
		return;
	if (fd == to)
		return;
	dup2(fd, to);
	close(fd);
}
17
/*
 * Start the child process described by @cmd without waiting for it.
 *
 * Pipes are created for every channel whose field is negative (and which
 * is not disabled by no_stdin/no_stdout/no_stderr/stdout_to_stderr); the
 * parent's end is stored back into cmd->in/out/err.  The child is created
 * with fork()+exec, or with mingw_spawnvpe() when __MINGW32__ is defined,
 * and its pid is stored in cmd->pid.
 *
 * Returns 0 on success, -ERR_RUN_COMMAND_PIPE if a pipe() call fails, and
 * -ERR_RUN_COMMAND_EXEC or -ERR_RUN_COMMAND_FORK if cmd->pid is negative
 * afterwards (EXEC when errno is ENOENT).
 */
18int start_command(struct child_process *cmd)
19{
20 int need_in, need_out, need_err;
21 int fdin[2], fdout[2], fderr[2];
22
23 /*
24 * In case of errors we must keep the promise to close FDs
25 * that have been passed in via ->in and ->out.
26 */
27
/* stdin: a pipe is needed when no fd was supplied and stdin is not disabled. */
28 need_in = !cmd->no_stdin && cmd->in < 0;
29 if (need_in) {
30 if (pipe(fdin) < 0) {
/* NOTE(review): tested with "> 0" here but with plain non-zero in the later cleanup paths. */
31 if (cmd->out > 0)
32 close(cmd->out);
33 return -ERR_RUN_COMMAND_PIPE;
34 }
/* The parent keeps the write end. */
35 cmd->in = fdin[1];
36 }
37
38 need_out = !cmd->no_stdout
39 && !cmd->stdout_to_stderr
40 && cmd->out < 0;
41 if (need_out) {
42 if (pipe(fdout) < 0) {
43 if (need_in)
44 close_pair(fdin);
45 else if (cmd->in)
46 close(cmd->in);
47 return -ERR_RUN_COMMAND_PIPE;
48 }
/* The parent keeps the read end. */
49 cmd->out = fdout[0];
50 }
51
52 need_err = !cmd->no_stderr && cmd->err < 0;
53 if (need_err) {
54 if (pipe(fderr) < 0) {
55 if (need_in)
56 close_pair(fdin);
57 else if (cmd->in)
58 close(cmd->in);
59 if (need_out)
60 close_pair(fdout);
61 else if (cmd->out)
62 close(cmd->out);
63 return -ERR_RUN_COMMAND_PIPE;
64 }
65 cmd->err = fderr[0];
66 }
67
68#ifndef __MINGW32__
/* Flush all stdio streams before forking. */
69 fflush(NULL);
70 cmd->pid = fork();
71 if (!cmd->pid) {
/* Child: wire up fds 0, 2 and 1 (in that order), then exec. */
72 if (cmd->no_stdin)
73 dup_devnull(0);
74 else if (need_in) {
75 dup2(fdin[0], 0);
76 close_pair(fdin);
77 } else if (cmd->in) {
78 dup2(cmd->in, 0);
79 close(cmd->in);
80 }
81
82 if (cmd->no_stderr)
83 dup_devnull(2);
84 else if (need_err) {
85 dup2(fderr[1], 2);
86 close_pair(fderr);
87 }
88
/* stdout is set up after stderr so that stdout_to_stderr copies the final fd 2. */
89 if (cmd->no_stdout)
90 dup_devnull(1);
91 else if (cmd->stdout_to_stderr)
92 dup2(2, 1);
93 else if (need_out) {
94 dup2(fdout[1], 1);
95 close_pair(fdout);
96 } else if (cmd->out > 1) {
97 dup2(cmd->out, 1);
98 close(cmd->out);
99 }
100
101 if (cmd->dir && chdir(cmd->dir))
102 die("exec %s: cd to %s failed (%s)", cmd->argv[0],
103 cmd->dir, strerror(errno));
/* Entries containing '=' are set with putenv(); all others are unset. */
104 if (cmd->env) {
105 for (; *cmd->env; cmd->env++) {
106 if (strchr(*cmd->env, '='))
107 putenv((char*)*cmd->env);
108 else
109 unsetenv(*cmd->env);
110 }
111 }
112 if (cmd->preexec_cb)
113 cmd->preexec_cb();
114 if (cmd->perf_cmd) {
115 execv_perf_cmd(cmd->argv);
116 } else {
117 execvp(cmd->argv[0], (char *const*) cmd->argv);
118 }
/* Reached only if exec did not replace the process; wait_or_whine() maps exit code 127 to -ERR_RUN_COMMAND_EXEC. */
119 exit(127);
120 }
121#else
/* No fork(): redirect the parent's own fds 0/1/2, spawn, then restore them from the backups. */
122 int s0 = -1, s1 = -1, s2 = -1; /* backups of stdin, stdout, stderr */
123 const char **sargv = cmd->argv;
124 char **env = environ;
125
126 if (cmd->no_stdin) {
127 s0 = dup(0);
128 dup_devnull(0);
129 } else if (need_in) {
130 s0 = dup(0);
131 dup2(fdin[0], 0);
132 } else if (cmd->in) {
133 s0 = dup(0);
134 dup2(cmd->in, 0);
135 }
136
137 if (cmd->no_stderr) {
138 s2 = dup(2);
139 dup_devnull(2);
140 } else if (need_err) {
141 s2 = dup(2);
142 dup2(fderr[1], 2);
143 }
144
145 if (cmd->no_stdout) {
146 s1 = dup(1);
147 dup_devnull(1);
148 } else if (cmd->stdout_to_stderr) {
149 s1 = dup(1);
150 dup2(2, 1);
151 } else if (need_out) {
152 s1 = dup(1);
153 dup2(fdout[1], 1);
154 } else if (cmd->out > 1) {
155 s1 = dup(1);
156 dup2(cmd->out, 1);
157 }
158
159 if (cmd->dir)
160 die("chdir in start_command() not implemented");
161 if (cmd->env) {
162 env = copy_environ();
163 for (; *cmd->env; cmd->env++)
164 env = env_setenv(env, *cmd->env);
165 }
166
167 if (cmd->perf_cmd) {
168 cmd->argv = prepare_perf_cmd(cmd->argv);
169 }
170
171 cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
172
173 if (cmd->env)
174 free_environ(env);
175 if (cmd->perf_cmd)
176 free(cmd->argv);
177
/* Restore the caller's argv and the saved standard descriptors. */
178 cmd->argv = sargv;
179 if (s0 >= 0)
180 dup2(s0, 0), close(s0);
181 if (s1 >= 0)
182 dup2(s1, 1), close(s1);
183 if (s2 >= 0)
184 dup2(s2, 2), close(s2);
185#endif
186
/* Child creation failed: release every descriptor and report the error. */
187 if (cmd->pid < 0) {
/* Save errno before the close() calls below can change it. */
188 int err = errno;
189 if (need_in)
190 close_pair(fdin);
191 else if (cmd->in)
192 close(cmd->in);
193 if (need_out)
194 close_pair(fdout);
195 else if (cmd->out)
196 close(cmd->out);
197 if (need_err)
198 close_pair(fderr);
199 return err == ENOENT ?
200 -ERR_RUN_COMMAND_EXEC :
201 -ERR_RUN_COMMAND_FORK;
202 }
203
/* Success: close the child-side pipe ends, or the fds the caller passed in. */
204 if (need_in)
205 close(fdin[0]);
206 else if (cmd->in)
207 close(cmd->in);
208
209 if (need_out)
210 close(fdout[1]);
211 else if (cmd->out)
212 close(cmd->out);
213
214 if (need_err)
215 close(fderr[1]);
216
217 return 0;
218}
219
220static int wait_or_whine(pid_t pid)
221{
222 for (;;) {
223 int status, code;
224 pid_t waiting = waitpid(pid, &status, 0);
225
226 if (waiting < 0) {
227 if (errno == EINTR)
228 continue;
229 error("waitpid failed (%s)", strerror(errno));
230 return -ERR_RUN_COMMAND_WAITPID;
231 }
232 if (waiting != pid)
233 return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
234 if (WIFSIGNALED(status))
235 return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
236
237 if (!WIFEXITED(status))
238 return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
239 code = WEXITSTATUS(status);
240 switch (code) {
241 case 127:
242 return -ERR_RUN_COMMAND_EXEC;
243 case 0:
244 return 0;
245 default:
246 return -code;
247 }
248 }
249}
250
251int finish_command(struct child_process *cmd)
252{
253 return wait_or_whine(cmd->pid);
254}
255
/*
 * Start @cmd and wait for it to finish.  A non-zero result from
 * start_command() is returned as is; otherwise the result of
 * finish_command() is returned.
 */
int run_command(struct child_process *cmd)
{
	int status = start_command(cmd);

	return status ? status : finish_command(cmd);
}
263
264static void prepare_run_command_v_opt(struct child_process *cmd,
265 const char **argv,
266 int opt)
267{
268 memset(cmd, 0, sizeof(*cmd));
269 cmd->argv = argv;
270 cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
271 cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
272 cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
273}
274
275int run_command_v_opt(const char **argv, int opt)
276{
277 struct child_process cmd;
278 prepare_run_command_v_opt(&cmd, argv, opt);
279 return run_command(&cmd);
280}
281
282int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
283{
284 struct child_process cmd;
285 prepare_run_command_v_opt(&cmd, argv, opt);
286 cmd.dir = dir;
287 cmd.env = env;
288 return run_command(&cmd);
289}
290
#ifdef __MINGW32__
/* Thread entry point used by start_async(): runs the async's proc on its fd. */
static __stdcall unsigned run_thread(void *data)
{
	struct async *job = data;
	int fd = job->fd_for_proc;

	return job->proc(fd, job->data);
}
#endif
298
299int start_async(struct async *async)
300{
301 int pipe_out[2];
302
303 if (pipe(pipe_out) < 0)
304 return error("cannot create pipe: %s", strerror(errno));
305 async->out = pipe_out[0];
306
307#ifndef __MINGW32__
308 /* Flush stdio before fork() to avoid cloning buffers */
309 fflush(NULL);
310
311 async->pid = fork();
312 if (async->pid < 0) {
313 error("fork (async) failed: %s", strerror(errno));
314 close_pair(pipe_out);
315 return -1;
316 }
317 if (!async->pid) {
318 close(pipe_out[0]);
319 exit(!!async->proc(pipe_out[1], async->data));
320 }
321 close(pipe_out[1]);
322#else
323 async->fd_for_proc = pipe_out[1];
324 async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
325 if (!async->tid) {
326 error("cannot create thread: %s", strerror(errno));
327 close_pair(pipe_out);
328 return -1;
329 }
330#endif
331 return 0;
332}
333
334int finish_async(struct async *async)
335{
336#ifndef __MINGW32__
337 int ret = 0;
338
339 if (wait_or_whine(async->pid))
340 ret = error("waitpid (async) failed");
341#else
342 DWORD ret = 0;
343 if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
344 ret = error("waiting for thread failed: %lu", GetLastError());
345 else if (!GetExitCodeThread(async->tid, &ret))
346 ret = error("cannot get thread exit code: %lu", GetLastError());
347 CloseHandle(async->tid);
348#endif
349 return ret;
350}
351
352int run_hook(const char *index_file, const char *name, ...)
353{
354 struct child_process hook;
355 const char **argv = NULL, *env[2];
356 char index[PATH_MAX];
357 va_list args;
358 int ret;
359 size_t i = 0, alloc = 0;
360
361 if (access(perf_path("hooks/%s", name), X_OK) < 0)
362 return 0;
363
364 va_start(args, name);
365 ALLOC_GROW(argv, i + 1, alloc);
366 argv[i++] = perf_path("hooks/%s", name);
367 while (argv[i-1]) {
368 ALLOC_GROW(argv, i + 1, alloc);
369 argv[i++] = va_arg(args, const char *);
370 }
371 va_end(args);
372
373 memset(&hook, 0, sizeof(hook));
374 hook.argv = argv;
375 hook.no_stdin = 1;
376 hook.stdout_to_stderr = 1;
377 if (index_file) {
378 snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
379 env[0] = index;
380 env[1] = NULL;
381 hook.env = env;
382 }
383
384 ret = start_command(&hook);
385 free(argv);
386 if (ret) {
387 warning("Could not spawn %s", argv[0]);
388 return ret;
389 }
390 ret = finish_command(&hook);
391 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
392 warning("%s exited due to uncaught signal", argv[0]);
393
394 return ret;
395}
diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h
new file mode 100644
index 000000000000..328289f23669
--- /dev/null
+++ b/tools/perf/util/run-command.h
@@ -0,0 +1,93 @@
1#ifndef RUN_COMMAND_H
2#define RUN_COMMAND_H
3
/*
 * Error codes of the run-command API; functions return them negated.
 * They start at 10000; wait_or_whine() returns a child's ordinary
 * non-zero exit status negated as well.
 */
4enum {
5 ERR_RUN_COMMAND_FORK = 10000, /* child creation failed (errno other than ENOENT) */
6 ERR_RUN_COMMAND_EXEC, /* creation failed with ENOENT, or child exited with 127 */
7 ERR_RUN_COMMAND_PIPE, /* pipe() failed */
8 ERR_RUN_COMMAND_WAITPID, /* waitpid() failed */
9 ERR_RUN_COMMAND_WAITPID_WRONG_PID, /* waitpid() returned a different pid */
10 ERR_RUN_COMMAND_WAITPID_SIGNAL, /* child was terminated by a signal */
11 ERR_RUN_COMMAND_WAITPID_NOEXIT, /* child did not exit normally */
12};
/* True when x is a (negated) run-command error code rather than a negated exit status. */
13#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
14
/* Description of a child process to be run by start_command(). */
15struct child_process {
/* Command and arguments; argv[0] is what gets executed. */
16 const char **argv;
/* Set by start_command(); used by finish_command() to wait for the child. */
17 pid_t pid;
18 /*
19 * Using .in, .out, .err:
20 * - Specify 0 for no redirections (child inherits stdin, stdout,
21 * stderr from parent).
22 * - Specify -1 to have a pipe allocated as follows:
23 * .in: returns the writable pipe end; parent writes to it,
24 * the readable pipe end becomes child's stdin
25 * .out, .err: returns the readable pipe end; parent reads from
26 * it, the writable pipe end becomes child's stdout/stderr
27 * The caller of start_command() must close the returned FDs
28 * after it has completed reading from/writing to it!
29 * - Specify > 0 to set a channel to a particular FD as follows:
30 * .in: a readable FD, becomes child's stdin
31 * .out: a writable FD, becomes child's stdout/stderr
32 * .err > 0 not supported
33 * The specified FD is closed by start_command(), even in case
34 * of errors!
35 */
36 int in;
37 int out;
38 int err;
/* If non-NULL, the child changes into this directory before exec. */
39 const char *dir;
/* NULL-terminated list: "VAR=VALUE" sets a variable, plain "VAR" unsets it. */
40 const char *const *env;
/* no_*: connect the corresponding standard stream to /dev/null. */
41 unsigned no_stdin:1;
42 unsigned no_stdout:1;
43 unsigned no_stderr:1;
44 unsigned perf_cmd:1; /* if this is to be perf sub-command */
/* Make the child's stdout a copy of its stderr. */
45 unsigned stdout_to_stderr:1;
/* If set, called in the forked child just before exec. */
46 void (*preexec_cb)(void);
47};
48
49int start_command(struct child_process *);
50int finish_command(struct child_process *);
51int run_command(struct child_process *);
52
53extern int run_hook(const char *index_file, const char *name, ...);
54
55#define RUN_COMMAND_NO_STDIN 1
56#define RUN_PERF_CMD 2 /*If this is to be perf sub-command */
57#define RUN_COMMAND_STDOUT_TO_STDERR 4
58int run_command_v_opt(const char **argv, int opt);
59
60/*
61 * env (the environment) is to be formatted like environ: "VAR=VALUE".
62 * To unset an environment variable use just "VAR".
63 */
64int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
65
66/*
67 * The purpose of the following functions is to feed a pipe by running
68 * a function asynchronously and providing output that the caller reads.
69 *
70 * It is expected that no synchronization and mutual exclusion between
71 * the caller and the feed function is necessary so that the function
72 * can run in a thread without interfering with the caller.
73 */
/* State of one asynchronous feed function; see start_async()/finish_async(). */
74struct async {
75 /*
76 * proc writes to fd and closes it;
77 * returns 0 on success, non-zero on failure
78 */
79 int (*proc)(int fd, void *data);
/* Opaque pointer handed to proc as its second argument. */
80 void *data;
81 int out; /* caller reads from here and closes it */
82#ifndef __MINGW32__
/* Forked child running proc. */
83 pid_t pid;
84#else
/* Thread running proc, and the pipe write end it is given. */
85 HANDLE tid;
86 int fd_for_proc;
87#endif
88};
89
90int start_async(struct async *async);
91int finish_async(struct async *async);
92
93#endif
diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c
new file mode 100644
index 000000000000..1118b99e57d3
--- /dev/null
+++ b/tools/perf/util/sigchain.c
@@ -0,0 +1,52 @@
1#include "sigchain.h"
2#include "cache.h"
3
4#define SIGCHAIN_MAX_SIGNALS 32
5
/*
 * Per-signal stack of previously installed handlers.
 * sigchain_push() appends the handler returned by signal();
 * sigchain_pop() re-installs the most recent one.
 */
6struct sigchain_signal {
7 sigchain_fun *old; /* saved handlers, grown with ALLOC_GROW() */
8 int n; /* number of entries in use */
9 int alloc; /* capacity bookkeeping passed to ALLOC_GROW() */
10};
/* Indexed by signal number; valid indices are checked by check_signum(). */
11static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
12
13static void check_signum(int sig)
14{
15 if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
16 die("BUG: signal out of range: %d", sig);
17}
18
19int sigchain_push(int sig, sigchain_fun f)
20{
21 struct sigchain_signal *s = signals + sig;
22 check_signum(sig);
23
24 ALLOC_GROW(s->old, s->n + 1, s->alloc);
25 s->old[s->n] = signal(sig, f);
26 if (s->old[s->n] == SIG_ERR)
27 return -1;
28 s->n++;
29 return 0;
30}
31
32int sigchain_pop(int sig)
33{
34 struct sigchain_signal *s = signals + sig;
35 check_signum(sig);
36 if (s->n < 1)
37 return 0;
38
39 if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
40 return -1;
41 s->n--;
42 return 0;
43}
44
45void sigchain_push_common(sigchain_fun f)
46{
47 sigchain_push(SIGINT, f);
48 sigchain_push(SIGHUP, f);
49 sigchain_push(SIGTERM, f);
50 sigchain_push(SIGQUIT, f);
51 sigchain_push(SIGPIPE, f);
52}
diff --git a/tools/perf/util/sigchain.h b/tools/perf/util/sigchain.h
new file mode 100644
index 000000000000..618083bce0c6
--- /dev/null
+++ b/tools/perf/util/sigchain.h
@@ -0,0 +1,11 @@
1#ifndef SIGCHAIN_H
2#define SIGCHAIN_H
3
4typedef void (*sigchain_fun)(int);
5
6int sigchain_push(int sig, sigchain_fun f);
7int sigchain_pop(int sig);
8
9void sigchain_push_common(sigchain_fun f);
10
11#endif /* SIGCHAIN_H */
diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c
new file mode 100644
index 000000000000..eaba09306802
--- /dev/null
+++ b/tools/perf/util/strbuf.c
@@ -0,0 +1,359 @@
1#include "cache.h"
2
/*
 * Return 0 when 'str' starts with 'prefix'. Otherwise return the
 * difference (prefix byte - str byte), compared as unsigned chars, at
 * the first position where the two differ.
 */
int prefixcmp(const char *str, const char *prefix)
{
	while (*prefix) {
		if (*str != *prefix)
			return (unsigned char)*prefix - (unsigned char)*str;
		str++;
		prefix++;
	}
	return 0;
}
11
12/*
13 * Used as the default ->buf value, so that people can always assume
14 * buf is non NULL and ->buf is NUL terminated even for a freshly
15 * initialized strbuf.
16 */
17char strbuf_slopbuf[1];
18
19void strbuf_init(struct strbuf *sb, size_t hint)
20{
21 sb->alloc = sb->len = 0;
22 sb->buf = strbuf_slopbuf;
23 if (hint)
24 strbuf_grow(sb, hint);
25}
26
27void strbuf_release(struct strbuf *sb)
28{
29 if (sb->alloc) {
30 free(sb->buf);
31 strbuf_init(sb, 0);
32 }
33}
34
35char *strbuf_detach(struct strbuf *sb, size_t *sz)
36{
37 char *res = sb->alloc ? sb->buf : NULL;
38 if (sz)
39 *sz = sb->len;
40 strbuf_init(sb, 0);
41 return res;
42}
43
44void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
45{
46 strbuf_release(sb);
47 sb->buf = buf;
48 sb->len = len;
49 sb->alloc = alloc;
50 strbuf_grow(sb, 0);
51 sb->buf[sb->len] = '\0';
52}
53
54void strbuf_grow(struct strbuf *sb, size_t extra)
55{
56 if (sb->len + extra + 1 <= sb->len)
57 die("you want to use way too much memory");
58 if (!sb->alloc)
59 sb->buf = NULL;
60 ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
61}
62
63void strbuf_trim(struct strbuf *sb)
64{
65 char *b = sb->buf;
66 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
67 sb->len--;
68 while (sb->len > 0 && isspace(*b)) {
69 b++;
70 sb->len--;
71 }
72 memmove(sb->buf, b, sb->len);
73 sb->buf[sb->len] = '\0';
74}
75void strbuf_rtrim(struct strbuf *sb)
76{
77 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
78 sb->len--;
79 sb->buf[sb->len] = '\0';
80}
81
82void strbuf_ltrim(struct strbuf *sb)
83{
84 char *b = sb->buf;
85 while (sb->len > 0 && isspace(*b)) {
86 b++;
87 sb->len--;
88 }
89 memmove(sb->buf, b, sb->len);
90 sb->buf[sb->len] = '\0';
91}
92
93void strbuf_tolower(struct strbuf *sb)
94{
95 int i;
96 for (i = 0; i < sb->len; i++)
97 sb->buf[i] = tolower(sb->buf[i]);
98}
99
100struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
101{
102 int alloc = 2, pos = 0;
103 char *n, *p;
104 struct strbuf **ret;
105 struct strbuf *t;
106
107 ret = calloc(alloc, sizeof(struct strbuf *));
108 p = n = sb->buf;
109 while (n < sb->buf + sb->len) {
110 int len;
111 n = memchr(n, delim, sb->len - (n - sb->buf));
112 if (pos + 1 >= alloc) {
113 alloc = alloc * 2;
114 ret = realloc(ret, sizeof(struct strbuf *) * alloc);
115 }
116 if (!n)
117 n = sb->buf + sb->len - 1;
118 len = n - p + 1;
119 t = malloc(sizeof(struct strbuf));
120 strbuf_init(t, len);
121 strbuf_add(t, p, len);
122 ret[pos] = t;
123 ret[++pos] = NULL;
124 p = ++n;
125 }
126 return ret;
127}
128
/*
 * Release every strbuf in the NULL-terminated array 'sbs', free each
 * element, then free the array itself.
 */
void strbuf_list_free(struct strbuf **sbs)
{
	struct strbuf **cur;

	for (cur = sbs; *cur != NULL; cur++) {
		strbuf_release(*cur);
		free(*cur);
	}
	free(sbs);
}
139
140int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
141{
142 int len = a->len < b->len ? a->len: b->len;
143 int cmp = memcmp(a->buf, b->buf, len);
144 if (cmp)
145 return cmp;
146 return a->len < b->len ? -1: a->len != b->len;
147}
148
149void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
150 const void *data, size_t dlen)
151{
152 if (pos + len < pos)
153 die("you want to use way too much memory");
154 if (pos > sb->len)
155 die("`pos' is too far after the end of the buffer");
156 if (pos + len > sb->len)
157 die("`pos + len' is too far after the end of the buffer");
158
159 if (dlen >= len)
160 strbuf_grow(sb, dlen - len);
161 memmove(sb->buf + pos + dlen,
162 sb->buf + pos + len,
163 sb->len - pos - len);
164 memcpy(sb->buf + pos, data, dlen);
165 strbuf_setlen(sb, sb->len + dlen - len);
166}
167
/* Insert 'len' bytes from 'data' at offset 'pos' (a splice that removes nothing). */
void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
{
	strbuf_splice(sb, pos, 0, data, len);
}
172
/* Remove 'len' bytes starting at offset 'pos' (a splice that inserts nothing). */
void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
{
	strbuf_splice(sb, pos, len, NULL, 0);
}
177
178void strbuf_add(struct strbuf *sb, const void *data, size_t len)
179{
180 strbuf_grow(sb, len);
181 memcpy(sb->buf + sb->len, data, len);
182 strbuf_setlen(sb, sb->len + len);
183}
184
185void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
186{
187 strbuf_grow(sb, len);
188 memcpy(sb->buf + sb->len, sb->buf + pos, len);
189 strbuf_setlen(sb, sb->len + len);
190}
191
192void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
193{
194 int len;
195 va_list ap;
196
197 if (!strbuf_avail(sb))
198 strbuf_grow(sb, 64);
199 va_start(ap, fmt);
200 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
201 va_end(ap);
202 if (len < 0)
203 die("your vsnprintf is broken");
204 if (len > strbuf_avail(sb)) {
205 strbuf_grow(sb, len);
206 va_start(ap, fmt);
207 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
208 va_end(ap);
209 if (len > strbuf_avail(sb)) {
210 die("this should not happen, your snprintf is broken");
211 }
212 }
213 strbuf_setlen(sb, sb->len + len);
214}
215
216void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
217 void *context)
218{
219 for (;;) {
220 const char *percent;
221 size_t consumed;
222
223 percent = strchrnul(format, '%');
224 strbuf_add(sb, format, percent - format);
225 if (!*percent)
226 break;
227 format = percent + 1;
228
229 consumed = fn(sb, format, context);
230 if (consumed)
231 format += consumed;
232 else
233 strbuf_addch(sb, '%');
234 }
235}
236
237size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
238 void *context)
239{
240 struct strbuf_expand_dict_entry *e = context;
241 size_t len;
242
243 for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
244 if (!strncmp(placeholder, e->placeholder, len)) {
245 if (e->value)
246 strbuf_addstr(sb, e->value);
247 return len;
248 }
249 }
250 return 0;
251}
252
253size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
254{
255 size_t res;
256 size_t oldalloc = sb->alloc;
257
258 strbuf_grow(sb, size);
259 res = fread(sb->buf + sb->len, 1, size, f);
260 if (res > 0)
261 strbuf_setlen(sb, sb->len + res);
262 else if (res < 0 && oldalloc == 0)
263 strbuf_release(sb);
264 return res;
265}
266
267ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
268{
269 size_t oldlen = sb->len;
270 size_t oldalloc = sb->alloc;
271
272 strbuf_grow(sb, hint ? hint : 8192);
273 for (;;) {
274 ssize_t cnt;
275
276 cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
277 if (cnt < 0) {
278 if (oldalloc == 0)
279 strbuf_release(sb);
280 else
281 strbuf_setlen(sb, oldlen);
282 return -1;
283 }
284 if (!cnt)
285 break;
286 sb->len += cnt;
287 strbuf_grow(sb, 8192);
288 }
289
290 sb->buf[sb->len] = '\0';
291 return sb->len - oldlen;
292}
293
294#define STRBUF_MAXLINK (2*PATH_MAX)
295
296int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
297{
298 size_t oldalloc = sb->alloc;
299
300 if (hint < 32)
301 hint = 32;
302
303 while (hint < STRBUF_MAXLINK) {
304 int len;
305
306 strbuf_grow(sb, hint);
307 len = readlink(path, sb->buf, hint);
308 if (len < 0) {
309 if (errno != ERANGE)
310 break;
311 } else if (len < hint) {
312 strbuf_setlen(sb, len);
313 return 0;
314 }
315
316 /* .. the buffer was too small - try again */
317 hint *= 2;
318 }
319 if (oldalloc == 0)
320 strbuf_release(sb);
321 return -1;
322}
323
324int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
325{
326 int ch;
327
328 strbuf_grow(sb, 0);
329 if (feof(fp))
330 return EOF;
331
332 strbuf_reset(sb);
333 while ((ch = fgetc(fp)) != EOF) {
334 if (ch == term)
335 break;
336 strbuf_grow(sb, 1);
337 sb->buf[sb->len++] = ch;
338 }
339 if (ch == EOF && sb->len == 0)
340 return EOF;
341
342 sb->buf[sb->len] = '\0';
343 return 0;
344}
345
/*
 * Append the whole content of the file at 'path' to 'sb'.
 *
 * Returns the number of bytes appended, or -1 if the file cannot be
 * opened or read.
 */
int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
{
	int nread;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;

	nread = strbuf_read(sb, fd, hint);
	close(fd);

	return nread < 0 ? -1 : nread;
}
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
new file mode 100644
index 000000000000..9ee908a3ec5d
--- /dev/null
+++ b/tools/perf/util/strbuf.h
@@ -0,0 +1,137 @@
1#ifndef STRBUF_H
2#define STRBUF_H
3
4/*
5 * Strbufs can be used in many ways: as a byte array, or to store arbitrarily
6 * long, overflow-safe strings.
7 *
8 * Strbufs have some invariants that are very important to keep in mind:
9 *
10 * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
11 * build complex strings/buffers whose final size isn't easily known.
12 *
13 * It is NOT legal to copy the ->buf pointer away.
14 * `strbuf_detach' is the operation that detaches a buffer from its shell
15 * while keeping the shell valid wrt its invariants.
16 *
17 * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
18 * allocated. The extra byte is used to store a '\0', allowing the ->buf
19 * member to be a valid C-string. Every strbuf function ensures this
20 * invariant is preserved.
21 *
22 * Note that it is OK to "play" with the buffer directly if you work it
23 * that way:
24 *
25 * strbuf_grow(sb, SOME_SIZE);
26 * ... Here, the memory array starting at sb->buf, and of length
27 * ... strbuf_avail(sb) is all yours, and you are sure that
28 * ... strbuf_avail(sb) is at least SOME_SIZE.
29 * strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
30 *
31 * Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
32 *
33 * Doing so is safe, though if it has to be done in many places, adding the
34 * missing API to the strbuf module is the way to go.
35 *
36 * XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
37 * even if it's true in the current implementation. Alloc is somehow a
38 * "private" member that should not be messed with.
39 */
40
41#include <assert.h>
42
/* Shared placeholder buffer that unallocated strbufs point at. */
extern char strbuf_slopbuf[];
struct strbuf {
	size_t alloc;	/* capacity bookkeeping for ->buf; 0 means nothing is allocated */
	size_t len;	/* bytes of content, not counting the terminating NUL */
	char *buf;	/* content; strbuf_slopbuf until something is allocated */
};

/* Static initializer: empty, unallocated, ->buf pointing at strbuf_slopbuf. */
#define STRBUF_INIT { 0, 0, strbuf_slopbuf }
51
52/*----- strbuf life cycle -----*/
53extern void strbuf_init(struct strbuf *, size_t);
54extern void strbuf_release(struct strbuf *);
55extern char *strbuf_detach(struct strbuf *, size_t *);
56extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
57static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
58 struct strbuf tmp = *a;
59 *a = *b;
60 *b = tmp;
61}
62
63/*----- strbuf size related -----*/
64static inline size_t strbuf_avail(const struct strbuf *sb) {
65 return sb->alloc ? sb->alloc - sb->len - 1 : 0;
66}
67
68extern void strbuf_grow(struct strbuf *, size_t);
69
70static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
71 if (!sb->alloc)
72 strbuf_grow(sb, 0);
73 assert(len < sb->alloc);
74 sb->len = len;
75 sb->buf[len] = '\0';
76}
77#define strbuf_reset(sb) strbuf_setlen(sb, 0)
78
79/*----- content related -----*/
80extern void strbuf_trim(struct strbuf *);
81extern void strbuf_rtrim(struct strbuf *);
82extern void strbuf_ltrim(struct strbuf *);
83extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
84extern void strbuf_tolower(struct strbuf *);
85
86extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
87extern void strbuf_list_free(struct strbuf **);
88
89/*----- add data in your buffer -----*/
90static inline void strbuf_addch(struct strbuf *sb, int c) {
91 strbuf_grow(sb, 1);
92 sb->buf[sb->len++] = c;
93 sb->buf[sb->len] = '\0';
94}
95
96extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
97extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
98
99/* splice pos..pos+len with given data */
100extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
101 const void *, size_t);
102
103extern void strbuf_add(struct strbuf *, const void *, size_t);
/* Append the NUL-terminated string 's' (without its terminator). */
static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
	size_t n = strlen(s);

	strbuf_add(sb, s, n);
}
107static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
108 strbuf_add(sb, sb2->buf, sb2->len);
109}
110extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
111
112typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
113extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
/* One row of the lookup table walked by strbuf_expand_dict_cb(). */
struct strbuf_expand_dict_entry {
	const char *placeholder;	/* text to match; NULL or "" ends the table */
	const char *value;		/* replacement text; NULL appends nothing */
};
118extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
119
120__attribute__((format(printf,2,3)))
121extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
122
123extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
124/* XXX: if read fails, any partial read is undone */
125extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
126extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
127extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
128
129extern int strbuf_getline(struct strbuf *, FILE *, int);
130
131extern void stripspace(struct strbuf *buf, int skip_comments);
132extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
133
134extern int strbuf_branchname(struct strbuf *sb, const char *name);
135extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
136
137#endif /* STRBUF_H */
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
new file mode 100644
index 000000000000..ec33c0c7f4e2
--- /dev/null
+++ b/tools/perf/util/string.c
@@ -0,0 +1,34 @@
1#include "string.h"
2
/*
 * Value of the hexadecimal digit 'ch' (either case), or -1 when 'ch'
 * is not a hex digit.
 */
static int hex(char ch)
{
	int digit = -1;

	if ('0' <= ch && ch <= '9')
		digit = ch - '0';
	else if ('a' <= ch && ch <= 'f')
		digit = ch - 'a' + 10;
	else if ('A' <= ch && ch <= 'F')
		digit = ch - 'A' + 10;

	return digit;
}
13
14/*
15 * While we find nice hex chars, build a long_val.
16 * Return number of chars processed.
17 */
18int hex2u64(const char *ptr, __u64 *long_val)
19{
20 const char *p = ptr;
21 *long_val = 0;
22
23 while (*p) {
24 const int hex_val = hex(*p);
25
26 if (hex_val < 0)
27 break;
28
29 *long_val = (*long_val << 4) | hex_val;
30 p++;
31 }
32
33 return p - ptr;
34}
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
new file mode 100644
index 000000000000..72812c1c9a7a
--- /dev/null
+++ b/tools/perf/util/string.h
@@ -0,0 +1,8 @@
1#ifndef _PERF_STRING_H_
2#define _PERF_STRING_H_
3
4#include <linux/types.h>
5
6int hex2u64(const char *ptr, __u64 *val);
7
8#endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
new file mode 100644
index 000000000000..49a55f813712
--- /dev/null
+++ b/tools/perf/util/symbol.c
@@ -0,0 +1,641 @@
1#include "util.h"
2#include "../perf.h"
3#include "string.h"
4#include "symbol.h"
5
6#include <libelf.h>
7#include <gelf.h>
8#include <elf.h>
9
10const char *sym_hist_filter;
11
12static struct symbol *symbol__new(__u64 start, __u64 len,
13 const char *name, unsigned int priv_size,
14 __u64 obj_start, int verbose)
15{
16 size_t namelen = strlen(name) + 1;
17 struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
18
19 if (!self)
20 return NULL;
21
22 if (verbose >= 2)
23 printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
24 (__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
25
26 self->obj_start= obj_start;
27 self->hist = NULL;
28 self->hist_sum = 0;
29
30 if (sym_hist_filter && !strcmp(name, sym_hist_filter))
31 self->hist = calloc(sizeof(__u64), len);
32
33 if (priv_size) {
34 memset(self, 0, priv_size);
35 self = ((void *)self) + priv_size;
36 }
37 self->start = start;
38 self->end = start + len - 1;
39 memcpy(self->name, name, namelen);
40
41 return self;
42}
43
/*
 * Free a symbol. The allocation starts 'priv_size' bytes before the
 * struct symbol, so step back to that address before freeing.
 */
static void symbol__delete(struct symbol *self, unsigned int priv_size)
{
	char *base = (char *)self - priv_size;

	free(base);
}
48
49static size_t symbol__fprintf(struct symbol *self, FILE *fp)
50{
51 return fprintf(fp, " %llx-%llx %s\n",
52 self->start, self->end, self->name);
53}
54
55struct dso *dso__new(const char *name, unsigned int sym_priv_size)
56{
57 struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
58
59 if (self != NULL) {
60 strcpy(self->name, name);
61 self->syms = RB_ROOT;
62 self->sym_priv_size = sym_priv_size;
63 self->find_symbol = dso__find_symbol;
64 }
65
66 return self;
67}
68
69static void dso__delete_symbols(struct dso *self)
70{
71 struct symbol *pos;
72 struct rb_node *next = rb_first(&self->syms);
73
74 while (next) {
75 pos = rb_entry(next, struct symbol, rb_node);
76 next = rb_next(&pos->rb_node);
77 rb_erase(&pos->rb_node, &self->syms);
78 symbol__delete(pos, self->sym_priv_size);
79 }
80}
81
/* Free all symbols owned by the dso, then the dso itself. */
void dso__delete(struct dso *self)
{
	dso__delete_symbols(self);
	free(self);
}
87
88static void dso__insert_symbol(struct dso *self, struct symbol *sym)
89{
90 struct rb_node **p = &self->syms.rb_node;
91 struct rb_node *parent = NULL;
92 const __u64 ip = sym->start;
93 struct symbol *s;
94
95 while (*p != NULL) {
96 parent = *p;
97 s = rb_entry(parent, struct symbol, rb_node);
98 if (ip < s->start)
99 p = &(*p)->rb_left;
100 else
101 p = &(*p)->rb_right;
102 }
103 rb_link_node(&sym->rb_node, parent, p);
104 rb_insert_color(&sym->rb_node, &self->syms);
105}
106
107struct symbol *dso__find_symbol(struct dso *self, __u64 ip)
108{
109 struct rb_node *n;
110
111 if (self == NULL)
112 return NULL;
113
114 n = self->syms.rb_node;
115
116 while (n) {
117 struct symbol *s = rb_entry(n, struct symbol, rb_node);
118
119 if (ip < s->start)
120 n = n->rb_left;
121 else if (ip > s->end)
122 n = n->rb_right;
123 else
124 return s;
125 }
126
127 return NULL;
128}
129
130size_t dso__fprintf(struct dso *self, FILE *fp)
131{
132 size_t ret = fprintf(fp, "dso: %s\n", self->name);
133
134 struct rb_node *nd;
135 for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
136 struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
137 ret += symbol__fprintf(pos, fp);
138 }
139
140 return ret;
141}
142
143static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
144{
145 struct rb_node *nd, *prevnd;
146 char *line = NULL;
147 size_t n;
148 FILE *file = fopen("/proc/kallsyms", "r");
149
150 if (file == NULL)
151 goto out_failure;
152
153 while (!feof(file)) {
154 __u64 start;
155 struct symbol *sym;
156 int line_len, len;
157 char symbol_type;
158
159 line_len = getline(&line, &n, file);
160 if (line_len < 0)
161 break;
162
163 if (!line)
164 goto out_failure;
165
166 line[--line_len] = '\0'; /* \n */
167
168 len = hex2u64(line, &start);
169
170 len++;
171 if (len + 2 >= line_len)
172 continue;
173
174 symbol_type = toupper(line[len]);
175 /*
176 * We're interested only in code ('T'ext)
177 */
178 if (symbol_type != 'T' && symbol_type != 'W')
179 continue;
180 /*
181 * Well fix up the end later, when we have all sorted.
182 */
183 sym = symbol__new(start, 0xdead, line + len + 2,
184 self->sym_priv_size, 0, verbose);
185
186 if (sym == NULL)
187 goto out_delete_line;
188
189 if (filter && filter(self, sym))
190 symbol__delete(sym, self->sym_priv_size);
191 else
192 dso__insert_symbol(self, sym);
193 }
194
195 /*
196 * Now that we have all sorted out, just set the ->end of all
197 * symbols
198 */
199 prevnd = rb_first(&self->syms);
200
201 if (prevnd == NULL)
202 goto out_delete_line;
203
204 for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
205 struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
206 *curr = rb_entry(nd, struct symbol, rb_node);
207
208 prev->end = curr->start - 1;
209 prevnd = nd;
210 }
211
212 free(line);
213 fclose(file);
214
215 return 0;
216
217out_delete_line:
218 free(line);
219out_failure:
220 return -1;
221}
222
223static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose)
224{
225 char *line = NULL;
226 size_t n;
227 FILE *file;
228 int nr_syms = 0;
229
230 file = fopen(self->name, "r");
231 if (file == NULL)
232 goto out_failure;
233
234 while (!feof(file)) {
235 __u64 start, size;
236 struct symbol *sym;
237 int line_len, len;
238
239 line_len = getline(&line, &n, file);
240 if (line_len < 0)
241 break;
242
243 if (!line)
244 goto out_failure;
245
246 line[--line_len] = '\0'; /* \n */
247
248 len = hex2u64(line, &start);
249
250 len++;
251 if (len + 2 >= line_len)
252 continue;
253
254 len += hex2u64(line + len, &size);
255
256 len++;
257 if (len + 2 >= line_len)
258 continue;
259
260 sym = symbol__new(start, size, line + len,
261 self->sym_priv_size, start, verbose);
262
263 if (sym == NULL)
264 goto out_delete_line;
265
266 if (filter && filter(self, sym))
267 symbol__delete(sym, self->sym_priv_size);
268 else {
269 dso__insert_symbol(self, sym);
270 nr_syms++;
271 }
272 }
273
274 free(line);
275 fclose(file);
276
277 return nr_syms;
278
279out_delete_line:
280 free(line);
281out_failure:
282 return -1;
283}
284
/**
 * elf_symtab__for_each_symbol - iterate thru all the symbols
 *
 * @syms: symbol table data handed to gelf_getsym()
 * @nr_syms: number of entries to visit
 * @index: uint32_t index
 * @sym: GElf_Sym iterator, refilled by gelf_getsym() on every step
 *
 * Note that gelf_getsym() is also called once with index == nr_syms
 * right before the loop condition ends the iteration.
 */
#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
	for (index = 0, gelf_getsym(syms, index, &sym);\
	     index < nr_syms; \
	     index++, gelf_getsym(syms, index, &sym))
296
297static inline uint8_t elf_sym__type(const GElf_Sym *sym)
298{
299 return GELF_ST_TYPE(sym->st_info);
300}
301
302static inline int elf_sym__is_function(const GElf_Sym *sym)
303{
304 return elf_sym__type(sym) == STT_FUNC &&
305 sym->st_name != 0 &&
306 sym->st_shndx != SHN_UNDEF &&
307 sym->st_size != 0;
308}
309
310static inline const char *elf_sym__name(const GElf_Sym *sym,
311 const Elf_Data *symstrs)
312{
313 return symstrs->d_buf + sym->st_name;
314}
315
316static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
317 GElf_Shdr *shp, const char *name,
318 size_t *index)
319{
320 Elf_Scn *sec = NULL;
321 size_t cnt = 1;
322
323 while ((sec = elf_nextscn(elf, sec)) != NULL) {
324 char *str;
325
326 gelf_getshdr(sec, shp);
327 str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
328 if (!strcmp(name, str)) {
329 if (index)
330 *index = cnt;
331 break;
332 }
333 ++cnt;
334 }
335
336 return sec;
337}
338
/*
 * Iterate over the 'nr_entries' REL entries of 'reldata'. 'pos' is set
 * to the result of gelf_getrel() for entry 'idx', with 'pos_mem' as
 * the storage it fills in.
 */
#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
	     idx < nr_entries; \
	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))

/* Same as elf_section__for_each_rel, but for RELA entries via gelf_getrela(). */
#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
	     idx < nr_entries; \
	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
348
349static int dso__synthesize_plt_symbols(struct dso *self, Elf *elf,
350 GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
351 GElf_Shdr *shdr_dynsym,
352 size_t dynsym_idx, int verbose)
353{
354 uint32_t nr_rel_entries, idx;
355 GElf_Sym sym;
356 __u64 plt_offset;
357 GElf_Shdr shdr_plt;
358 struct symbol *f;
359 GElf_Shdr shdr_rel_plt;
360 Elf_Data *reldata, *syms, *symstrs;
361 Elf_Scn *scn_plt_rel, *scn_symstrs;
362 char sympltname[1024];
363 int nr = 0, symidx;
364
365 scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
366 ".rela.plt", NULL);
367 if (scn_plt_rel == NULL) {
368 scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
369 ".rel.plt", NULL);
370 if (scn_plt_rel == NULL)
371 return 0;
372 }
373
374 if (shdr_rel_plt.sh_link != dynsym_idx)
375 return 0;
376
377 if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
378 return 0;
379
380 /*
381 * Fetch the relocation section to find the indexes to the GOT
382 * and the symbols in the .dynsym they refer to.
383 */
384 reldata = elf_getdata(scn_plt_rel, NULL);
385 if (reldata == NULL)
386 return -1;
387
388 syms = elf_getdata(scn_dynsym, NULL);
389 if (syms == NULL)
390 return -1;
391
392 scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
393 if (scn_symstrs == NULL)
394 return -1;
395
396 symstrs = elf_getdata(scn_symstrs, NULL);
397 if (symstrs == NULL)
398 return -1;
399
400 nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
401 plt_offset = shdr_plt.sh_offset;
402
403 if (shdr_rel_plt.sh_type == SHT_RELA) {
404 GElf_Rela pos_mem, *pos;
405
406 elf_section__for_each_rela(reldata, pos, pos_mem, idx,
407 nr_rel_entries) {
408 symidx = GELF_R_SYM(pos->r_info);
409 plt_offset += shdr_plt.sh_entsize;
410 gelf_getsym(syms, symidx, &sym);
411 snprintf(sympltname, sizeof(sympltname),
412 "%s@plt", elf_sym__name(&sym, symstrs));
413
414 f = symbol__new(plt_offset, shdr_plt.sh_entsize,
415 sympltname, self->sym_priv_size, 0, verbose);
416 if (!f)
417 return -1;
418
419 dso__insert_symbol(self, f);
420 ++nr;
421 }
422 } else if (shdr_rel_plt.sh_type == SHT_REL) {
423 GElf_Rel pos_mem, *pos;
424 elf_section__for_each_rel(reldata, pos, pos_mem, idx,
425 nr_rel_entries) {
426 symidx = GELF_R_SYM(pos->r_info);
427 plt_offset += shdr_plt.sh_entsize;
428 gelf_getsym(syms, symidx, &sym);
429 snprintf(sympltname, sizeof(sympltname),
430 "%s@plt", elf_sym__name(&sym, symstrs));
431
432 f = symbol__new(plt_offset, shdr_plt.sh_entsize,
433 sympltname, self->sym_priv_size, 0, verbose);
434 if (!f)
435 return -1;
436
437 dso__insert_symbol(self, f);
438 ++nr;
439 }
440 } else {
441 /*
442 * TODO: There are still one more shdr_rel_plt.sh_type
443 * I have to investigate, but probably should be ignored.
444 */
445 }
446
447 return nr;
448}
449
450static int dso__load_sym(struct dso *self, int fd, const char *name,
451 symbol_filter_t filter, int verbose)
452{
453 Elf_Data *symstrs;
454 uint32_t nr_syms;
455 int err = -1;
456 uint32_t index;
457 GElf_Ehdr ehdr;
458 GElf_Shdr shdr;
459 Elf_Data *syms;
460 GElf_Sym sym;
461 Elf_Scn *sec, *sec_dynsym;
462 Elf *elf;
463 size_t dynsym_idx;
464 int nr = 0;
465
466 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
467 if (elf == NULL) {
468 if (verbose)
469 fprintf(stderr, "%s: cannot read %s ELF file.\n",
470 __func__, name);
471 goto out_close;
472 }
473
474 if (gelf_getehdr(elf, &ehdr) == NULL) {
475 if (verbose)
476 fprintf(stderr, "%s: cannot get elf header.\n", __func__);
477 goto out_elf_end;
478 }
479
480 /*
481 * We need to check if we have a .dynsym, so that we can handle the
482 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
483 * .dynsym or .symtab)
484 */
485 sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
486 ".dynsym", &dynsym_idx);
487 if (sec_dynsym != NULL) {
488 nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
489 sec_dynsym, &shdr,
490 dynsym_idx, verbose);
491 if (nr < 0)
492 goto out_elf_end;
493 }
494
495 /*
496 * But if we have a full .symtab (that is a superset of .dynsym) we
497 * should add the symbols not in the .dynsyn
498 */
499 sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
500 if (sec == NULL) {
501 if (sec_dynsym == NULL)
502 goto out_elf_end;
503
504 sec = sec_dynsym;
505 gelf_getshdr(sec, &shdr);
506 }
507
508 syms = elf_getdata(sec, NULL);
509 if (syms == NULL)
510 goto out_elf_end;
511
512 sec = elf_getscn(elf, shdr.sh_link);
513 if (sec == NULL)
514 goto out_elf_end;
515
516 symstrs = elf_getdata(sec, NULL);
517 if (symstrs == NULL)
518 goto out_elf_end;
519
520 nr_syms = shdr.sh_size / shdr.sh_entsize;
521
522 memset(&sym, 0, sizeof(sym));
523
524 elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
525 struct symbol *f;
526 __u64 obj_start;
527
528 if (!elf_sym__is_function(&sym))
529 continue;
530
531 sec = elf_getscn(elf, sym.st_shndx);
532 if (!sec)
533 goto out_elf_end;
534
535 gelf_getshdr(sec, &shdr);
536 obj_start = sym.st_value;
537
538 sym.st_value -= shdr.sh_addr - shdr.sh_offset;
539
540 f = symbol__new(sym.st_value, sym.st_size,
541 elf_sym__name(&sym, symstrs),
542 self->sym_priv_size, obj_start, verbose);
543 if (!f)
544 goto out_elf_end;
545
546 if (filter && filter(self, f))
547 symbol__delete(f, self->sym_priv_size);
548 else {
549 dso__insert_symbol(self, f);
550 nr++;
551 }
552 }
553
554 err = nr;
555out_elf_end:
556 elf_end(elf);
557out_close:
558 return err;
559}
560
561int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
562{
563 int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
564 char *name = malloc(size);
565 int variant = 0;
566 int ret = -1;
567 int fd;
568
569 if (!name)
570 return -1;
571
572 if (strncmp(self->name, "/tmp/perf-", 10) == 0)
573 return dso__load_perf_map(self, filter, verbose);
574
575more:
576 do {
577 switch (variant) {
578 case 0: /* Fedora */
579 snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
580 break;
581 case 1: /* Ubuntu */
582 snprintf(name, size, "/usr/lib/debug%s", self->name);
583 break;
584 case 2: /* Sane people */
585 snprintf(name, size, "%s", self->name);
586 break;
587
588 default:
589 goto out;
590 }
591 variant++;
592
593 fd = open(name, O_RDONLY);
594 } while (fd < 0);
595
596 ret = dso__load_sym(self, fd, name, filter, verbose);
597 close(fd);
598
599 /*
600 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
601 */
602 if (!ret)
603 goto more;
604
605out:
606 free(name);
607 return ret;
608}
609
610static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
611 symbol_filter_t filter, int verbose)
612{
613 int err, fd = open(vmlinux, O_RDONLY);
614
615 if (fd < 0)
616 return -1;
617
618 err = dso__load_sym(self, fd, vmlinux, filter, verbose);
619 close(fd);
620
621 return err;
622}
623
624int dso__load_kernel(struct dso *self, const char *vmlinux,
625 symbol_filter_t filter, int verbose)
626{
627 int err = -1;
628
629 if (vmlinux)
630 err = dso__load_vmlinux(self, vmlinux, filter, verbose);
631
632 if (err)
633 err = dso__load_kallsyms(self, filter, verbose);
634
635 return err;
636}
637
/* One-time setup: tell libelf which ELF version this code was built for. */
void symbol__init(void)
{
	elf_version(EV_CURRENT);
}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
new file mode 100644
index 000000000000..0d1292bd8270
--- /dev/null
+++ b/tools/perf/util/symbol.h
@@ -0,0 +1,47 @@
1#ifndef _PERF_SYMBOL_
2#define _PERF_SYMBOL_ 1
3
4#include <linux/types.h>
5#include "list.h"
6#include "rbtree.h"
7
/*
 * One symbol of a dso. Allocated by symbol__new() with the name stored
 * inline after the struct and an optional private area before it.
 */
struct symbol {
	struct rb_node rb_node;	/* links the symbol into dso->syms */
	__u64 start;		/* first address covered */
	__u64 end;		/* last address covered (inclusive) */
	__u64 obj_start;	/* 'obj_start' value given to symbol__new() */
	__u64 hist_sum;
	__u64 *hist;		/* only allocated for the sym_hist_filter symbol */
	char name[0];		/* NUL-terminated name, stored inline */
};
17
/* A named object together with the symbols loaded for it. */
struct dso {
	struct list_head node;
	struct rb_root syms;		/* symbols ordered by start address */
	unsigned int sym_priv_size;	/* private bytes kept in front of each symbol */
	struct symbol *(*find_symbol)(struct dso *, __u64 ip);	/* set to dso__find_symbol by dso__new() */
	char name[0];			/* NUL-terminated name, stored inline */
};
25
/*
 * Name of the symbol for which symbol__new() allocates a histogram.
 * Defined in symbol.c; a header must only declare it, otherwise every
 * file including this header emits its own definition.
 */
extern const char *sym_hist_filter;
27
28typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
29
30struct dso *dso__new(const char *name, unsigned int sym_priv_size);
31void dso__delete(struct dso *self);
32
33static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
34{
35 return ((void *)sym) - self->sym_priv_size;
36}
37
38struct symbol *dso__find_symbol(struct dso *self, __u64 ip);
39
40int dso__load_kernel(struct dso *self, const char *vmlinux,
41 symbol_filter_t filter, int verbose);
42int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
43
44size_t dso__fprintf(struct dso *self, FILE *fp);
45
46void symbol__init(void);
47#endif /* _PERF_SYMBOL_ */
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
new file mode 100644
index 000000000000..e16bf9a707e8
--- /dev/null
+++ b/tools/perf/util/usage.c
@@ -0,0 +1,80 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 */
6#include "util.h"
7
/*
 * Format @err with @params into a fixed 1024-byte buffer (longer messages
 * are truncated by vsnprintf) and print it to stderr, preceded by @prefix
 * and followed by a newline.
 */
static void report(const char *prefix, const char *err, va_list params)
{
	char text[1024];

	vsnprintf(text, sizeof(text), err, params);
	fprintf(stderr, " %s%s\n", prefix, text);
}
14
15static NORETURN void usage_builtin(const char *err)
16{
17 fprintf(stderr, "\n Usage: %s\n", err);
18 exit(129);
19}
20
21static NORETURN void die_builtin(const char *err, va_list params)
22{
23 report(" Fatal: ", err, params);
24 exit(128);
25}
26
/* Default error handler: report the formatted message with an " Error: " prefix. */
static void error_builtin(const char *err, va_list params)
{
	report(" Error: ", err, params);
}
31
/* Default warning handler: report the formatted message with a " Warning: " prefix. */
static void warn_builtin(const char *warn, va_list params)
{
	report(" Warning: ", warn, params);
}
36
37/* If we are in a dlopen()ed .so write to a global variable would segfault
38 * (ugh), so keep things static. */
39static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
40static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
41static void (*error_routine)(const char *err, va_list params) = error_builtin;
42static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
43
44void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
45{
46 die_routine = routine;
47}
48
/* Hand the usage string @err to the currently installed usage handler. */
void usage(const char *err)
{
	usage_routine(err);
}
53
/*
 * printf-style fatal error: forward the format string and its arguments
 * to the currently installed die handler.
 */
void die(const char *err, ...)
{
	va_list ap;

	va_start(ap, err);
	die_routine(err, ap);
	va_end(ap);
}
62
/*
 * printf-style error report: forward the format string and its arguments
 * to the currently installed error handler.
 *
 * Always returns -1.
 */
int error(const char *err, ...)
{
	va_list ap;

	va_start(ap, err);
	error_routine(err, ap);
	va_end(ap);

	return -1;
}
72
/*
 * printf-style warning: forward the format string and its arguments to
 * the currently installed warning handler.
 */
void warning(const char *warn, ...)
{
	va_list ap;

	va_start(ap, warn);
	warn_routine(warn, ap);
	va_end(ap);
}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
new file mode 100644
index 000000000000..76590a16c271
--- /dev/null
+++ b/tools/perf/util/util.h
@@ -0,0 +1,410 @@
1#ifndef GIT_COMPAT_UTIL_H
2#define GIT_COMPAT_UTIL_H
3
4#define _FILE_OFFSET_BITS 64
5
6#ifndef FLEX_ARRAY
7/*
8 * See if our compiler is known to support flexible array members.
9 */
10#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
11# define FLEX_ARRAY /* empty */
12#elif defined(__GNUC__)
13# if (__GNUC__ >= 3)
14# define FLEX_ARRAY /* empty */
15# else
16# define FLEX_ARRAY 0 /* older GNU extension */
17# endif
18#endif
19
20/*
21 * Otherwise, default to safer but a bit wasteful traditional style
22 */
23#ifndef FLEX_ARRAY
24# define FLEX_ARRAY 1
25#endif
26#endif
27
/* Number of elements in array x; x must be a real array, not a pointer. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
29
30#ifdef __GNUC__
31#define TYPEOF(x) (__typeof__(x))
32#else
33#define TYPEOF(x)
34#endif
35
36#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
37#define HAS_MULTI_BITS(i) ((i) & ((i) - 1)) /* checks if an integer has more than 1 bit set */
38
39/* Approximation of the length of the decimal representation of this type. */
40#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
41
42#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
43#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
44#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
45#endif
46#define _ALL_SOURCE 1
47#define _GNU_SOURCE 1
48#define _BSD_SOURCE 1
49
50#include <unistd.h>
51#include <stdio.h>
52#include <sys/stat.h>
53#include <fcntl.h>
54#include <stddef.h>
55#include <stdlib.h>
56#include <stdarg.h>
57#include <string.h>
58#include <errno.h>
59#include <limits.h>
60#include <sys/param.h>
61#include <sys/types.h>
62#include <dirent.h>
63#include <sys/time.h>
64#include <time.h>
65#include <signal.h>
66#include <fnmatch.h>
67#include <assert.h>
68#include <regex.h>
69#include <utime.h>
70#ifndef __MINGW32__
71#include <sys/wait.h>
72#include <sys/poll.h>
73#include <sys/socket.h>
74#include <sys/ioctl.h>
75#ifndef NO_SYS_SELECT_H
76#include <sys/select.h>
77#endif
78#include <netinet/in.h>
79#include <netinet/tcp.h>
80#include <arpa/inet.h>
81#include <netdb.h>
82#include <pwd.h>
83#include <inttypes.h>
84#if defined(__CYGWIN__)
85#undef _XOPEN_SOURCE
86#include <grp.h>
87#define _XOPEN_SOURCE 600
88#include "compat/cygwin.h"
89#else
90#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
91#include <grp.h>
92#define _ALL_SOURCE 1
93#endif
94#else /* __MINGW32__ */
95/* pull in Windows compatibility stuff */
96#include "compat/mingw.h"
97#endif /* __MINGW32__ */
98
99#ifndef NO_ICONV
100#include <iconv.h>
101#endif
102
103#ifndef NO_OPENSSL
104#include <openssl/ssl.h>
105#include <openssl/err.h>
106#endif
107
108/* On most systems <limits.h> would have given us this, but
109 * not on some systems (e.g. GNU/Hurd).
110 */
111#ifndef PATH_MAX
112#define PATH_MAX 4096
113#endif
114
115#ifndef PRIuMAX
116#define PRIuMAX "llu"
117#endif
118
119#ifndef PRIu32
120#define PRIu32 "u"
121#endif
122
123#ifndef PRIx32
124#define PRIx32 "x"
125#endif
126
127#ifndef PATH_SEP
128#define PATH_SEP ':'
129#endif
130
131#ifndef STRIP_EXTENSION
132#define STRIP_EXTENSION ""
133#endif
134
135#ifndef has_dos_drive_prefix
136#define has_dos_drive_prefix(path) 0
137#endif
138
139#ifndef is_dir_sep
140#define is_dir_sep(c) ((c) == '/')
141#endif
142
143#ifdef __GNUC__
144#define NORETURN __attribute__((__noreturn__))
145#else
146#define NORETURN
147#ifndef __attribute__
148#define __attribute__(x)
149#endif
150#endif
151
152/* General helper functions */
153extern void usage(const char *err) NORETURN;
154extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
155extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
156extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
157
158extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
159
160extern int prefixcmp(const char *str, const char *prefix);
161extern time_t tm_to_time_t(const struct tm *tm);
162
/*
 * If @str begins with @prefix, return a pointer to the first character
 * of @str after the prefix; otherwise return NULL.
 */
static inline const char *skip_prefix(const char *str, const char *prefix)
{
	size_t n = strlen(prefix);

	if (strncmp(str, prefix, n) != 0)
		return NULL;

	return str + n;
}
168
169#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
170
171#ifndef PROT_READ
172#define PROT_READ 1
173#define PROT_WRITE 2
174#define MAP_PRIVATE 1
175#define MAP_FAILED ((void*)-1)
176#endif
177
178#define mmap git_mmap
179#define munmap git_munmap
180extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
181extern int git_munmap(void *start, size_t length);
182
183#else /* NO_MMAP || USE_WIN32_MMAP */
184
185#include <sys/mman.h>
186
187#endif /* NO_MMAP || USE_WIN32_MMAP */
188
189#ifdef NO_MMAP
190
191/* This value must be multiple of (pagesize * 2) */
192#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
193
194#else /* NO_MMAP */
195
196/* This value must be multiple of (pagesize * 2) */
197#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
198 (sizeof(void*) >= 8 \
199 ? 1 * 1024 * 1024 * 1024 \
200 : 32 * 1024 * 1024)
201
202#endif /* NO_MMAP */
203
204#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
205#define on_disk_bytes(st) ((st).st_size)
206#else
207#define on_disk_bytes(st) ((st).st_blocks * 512)
208#endif
209
210#define DEFAULT_PACKED_GIT_LIMIT \
211 ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
212
213#ifdef NO_PREAD
214#define pread git_pread
215extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
216#endif
217/*
218 * Forward decl that will remind us if its twin in cache.h changes.
219 * This function is used in compat/pread.c. But we can't include
220 * cache.h there.
221 */
222extern ssize_t read_in_full(int fd, void *buf, size_t count);
223
224#ifdef NO_SETENV
225#define setenv gitsetenv
226extern int gitsetenv(const char *, const char *, int);
227#endif
228
229#ifdef NO_MKDTEMP
230#define mkdtemp gitmkdtemp
231extern char *gitmkdtemp(char *);
232#endif
233
234#ifdef NO_UNSETENV
235#define unsetenv gitunsetenv
236extern void gitunsetenv(const char *);
237#endif
238
239#ifdef NO_STRCASESTR
240#define strcasestr gitstrcasestr
241extern char *gitstrcasestr(const char *haystack, const char *needle);
242#endif
243
244#ifdef NO_STRLCPY
245#define strlcpy gitstrlcpy
246extern size_t gitstrlcpy(char *, const char *, size_t);
247#endif
248
249#ifdef NO_STRTOUMAX
250#define strtoumax gitstrtoumax
251extern uintmax_t gitstrtoumax(const char *, char **, int);
252#endif
253
254#ifdef NO_HSTRERROR
255#define hstrerror githstrerror
256extern const char *githstrerror(int herror);
257#endif
258
259#ifdef NO_MEMMEM
260#define memmem gitmemmem
261void *gitmemmem(const void *haystack, size_t haystacklen,
262 const void *needle, size_t needlelen);
263#endif
264
265#ifdef FREAD_READS_DIRECTORIES
266#ifdef fopen
267#undef fopen
268#endif
269#define fopen(a,b) git_fopen(a,b)
270extern FILE *git_fopen(const char*, const char*);
271#endif
272
273#ifdef SNPRINTF_RETURNS_BOGUS
274#define snprintf git_snprintf
275extern int git_snprintf(char *str, size_t maxsize,
276 const char *format, ...);
277#define vsnprintf git_vsnprintf
278extern int git_vsnprintf(char *str, size_t maxsize,
279 const char *format, va_list ap);
280#endif
281
282#ifdef __GLIBC_PREREQ
283#if __GLIBC_PREREQ(2, 1)
284#define HAVE_STRCHRNUL
285#endif
286#endif
287
288#ifndef HAVE_STRCHRNUL
289#define strchrnul gitstrchrnul
/*
 * Fallback for strchrnul(): return a pointer to the first occurrence of
 * @c in @s, or to the terminating NUL of @s when @c does not occur.
 *
 * @c is narrowed to char before the comparison.  Comparing the plain
 * chars of @s against the un-narrowed int would never match a byte
 * >= 0x80 passed as an unsigned char value (e.g. 0xe9) on platforms
 * where char is signed.
 */
static inline char *gitstrchrnul(const char *s, int c)
{
	const char wanted = (char)c;

	while (*s && *s != wanted)
		s++;
	return (char *)s;
}
296#endif
297
298/*
299 * Wrappers:
300 */
301extern char *xstrdup(const char *str);
302extern void *xmalloc(size_t size);
303extern void *xmemdupz(const void *data, size_t len);
304extern char *xstrndup(const char *str, size_t len);
305extern void *xrealloc(void *ptr, size_t size);
306extern void *xcalloc(size_t nmemb, size_t size);
307extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
308extern ssize_t xread(int fd, void *buf, size_t len);
309extern ssize_t xwrite(int fd, const void *buf, size_t len);
310extern int xdup(int fd);
311extern FILE *xfdopen(int fd, const char *mode);
312extern int xmkstemp(char *template);
313
/* Convert an off_t length to size_t with a plain cast (no range check). */
static inline size_t xsize_t(off_t len)
{
	size_t bytes = (size_t)len;

	return bytes;
}
318
/*
 * Return 1 when @filename ends with @ext and is strictly longer than
 * @ext, 0 otherwise.
 */
static inline int has_extension(const char *filename, const char *ext)
{
	size_t flen = strlen(filename);
	size_t elen = strlen(ext);

	if (flen <= elen)
		return 0;

	return memcmp(filename + (flen - elen), ext, elen) == 0;
}
325
326/* Sane ctype - no locale, and works with signed chars */
327#undef isascii
328#undef isspace
329#undef isdigit
330#undef isalpha
331#undef isalnum
332#undef tolower
333#undef toupper
334extern unsigned char sane_ctype[256];
335#define GIT_SPACE 0x01
336#define GIT_DIGIT 0x02
337#define GIT_ALPHA 0x04
338#define GIT_GLOB_SPECIAL 0x08
339#define GIT_REGEX_SPECIAL 0x10
340#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
341#define isascii(x) (((x) & ~0x7f) == 0)
342#define isspace(x) sane_istest(x,GIT_SPACE)
343#define isdigit(x) sane_istest(x,GIT_DIGIT)
344#define isalpha(x) sane_istest(x,GIT_ALPHA)
345#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
346#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
347#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
348#define tolower(x) sane_case((unsigned char)(x), 0x20)
349#define toupper(x) sane_case((unsigned char)(x), 0)
350
351static inline int sane_case(int x, int high)
352{
353 if (sane_istest(x, GIT_ALPHA))
354 x = (x & ~0x20) | high;
355 return x;
356}
357
/*
 * Parse the whole string @s as an unsigned int in the given @base.
 *
 * Returns 0 and stores the value in *@result on success.  Returns -1,
 * leaving *@result untouched, when @s is empty, has trailing characters,
 * does not fit into an unsigned int, or contains a minus sign.
 *
 * The minus-sign check is needed because strtoul() accepts "-N" and
 * returns the negated value converted to unsigned long, so a negative
 * string could otherwise come back as a valid-looking positive number.
 */
static inline int strtoul_ui(char const *s, int base, unsigned int *result)
{
	unsigned long ul;
	char *p;

	if (strchr(s, '-'))
		return -1;

	errno = 0;
	ul = strtoul(s, &p, base);
	if (errno || *p || p == s || (unsigned int) ul != ul)
		return -1;
	*result = ul;
	return 0;
}
370
/*
 * Parse the whole string @s as an int in the given @base.
 *
 * Returns 0 and stores the value in *@result on success; returns -1,
 * leaving *@result untouched, when strtol() reports an error, when @s is
 * empty or has trailing characters, or when the value does not fit in
 * an int.
 */
static inline int strtol_i(char const *s, int base, int *result)
{
	char *end;
	long value;

	errno = 0;
	value = strtol(s, &end, base);

	if (errno)
		return -1;
	if (end == s || *end != '\0')
		return -1;
	if ((int) value != value)
		return -1;

	*result = value;
	return 0;
}
383
384#ifdef INTERNAL_QSORT
385void git_qsort(void *base, size_t nmemb, size_t size,
386 int(*compar)(const void *, const void *));
387#define qsort git_qsort
388#endif
389
390#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
391# define FORCE_DIR_SET_GID S_ISGID
392#else
393# define FORCE_DIR_SET_GID 0
394#endif
395
396#ifdef NO_NSEC
397#undef USE_NSEC
398#define ST_CTIME_NSEC(st) 0
399#define ST_MTIME_NSEC(st) 0
400#else
401#ifdef USE_ST_TIMESPEC
402#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
403#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
404#else
405#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
406#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
407#endif
408#endif
409
410#endif
diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c
new file mode 100644
index 000000000000..6350d65f6d9e
--- /dev/null
+++ b/tools/perf/util/wrapper.c
@@ -0,0 +1,206 @@
1/*
2 * Various trivial helper wrappers around standard functions
3 */
4#include "cache.h"
5
6/*
7 * There's no pack memory to release - but stay close to the Git
8 * version so wrap this away:
9 */
static inline void release_pack_memory(size_t size, int flag)
{
	/* Intentionally empty: both arguments are ignored. */
	(void)size;
	(void)flag;
}
13
/*
 * strdup() that never returns NULL: on failure release_pack_memory() is
 * called and the copy is retried once; a second failure ends in die().
 */
char *xstrdup(const char *str)
{
	char *copy = strdup(str);

	if (copy)
		return copy;

	release_pack_memory(strlen(str) + 1, -1);
	copy = strdup(str);
	if (!copy)
		die("Out of memory, strdup failed");
	return copy;
}
25
/*
 * malloc() that never returns NULL.
 *
 * A zero-sized request for which malloc() returns NULL is satisfied with
 * a 1-byte block.  If allocation fails, release_pack_memory() is called
 * and the allocation is attempted a second time; if that fails too the
 * program dies.  With XMALLOC_POISON defined, the first @size bytes of
 * the block are filled with 0xA5.
 */
void *xmalloc(size_t size)
{
	void *ret = NULL;
	int attempt;

	for (attempt = 0; attempt < 2 && !ret; attempt++) {
		if (attempt)
			release_pack_memory(size, -1);
		ret = malloc(size);
		if (!ret && !size)
			ret = malloc(1);
	}
	if (!ret)
		die("Out of memory, malloc failed");
#ifdef XMALLOC_POISON
	memset(ret, 0xA5, size);
#endif
	return ret;
}
44
45/*
46 * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
47 * "data" to the allocated memory, zero terminates the allocated memory,
48 * and returns a pointer to the allocated memory. If the allocation fails,
49 * the program dies.
50 */
void *xmemdupz(const void *data, size_t len)
{
	char *copy = xmalloc(len + 1);

	/* Terminate first, then fill in the payload in front of the NUL. */
	copy[len] = '\0';
	memcpy(copy, data, len);
	return copy;
}
58
/*
 * Duplicate at most @len bytes of @str into a freshly allocated,
 * NUL-terminated buffer.  The copy stops at the first NUL found within
 * the first @len bytes, if there is one.
 */
char *xstrndup(const char *str, size_t len)
{
	const char *nul = memchr(str, '\0', len);
	size_t n = len;

	if (nul)
		n = nul - str;

	return xmemdupz(str, n);
}
64
/*
 * realloc() that never returns NULL.
 *
 * A request for zero bytes is handled up front: realloc(ptr, 0) is
 * allowed to free @ptr and return NULL, and passing the same (now
 * freed) pointer to realloc() a second time would be a use-after-free.
 * So @ptr is released explicitly and a fresh 1-byte block is allocated
 * in its place.
 *
 * For non-zero sizes a failed realloc() leaves @ptr valid, so after
 * release_pack_memory() the call is retried once; if that fails too the
 * program dies.
 */
void *xrealloc(void *ptr, size_t size)
{
	void *ret;

	if (!size) {
		free(ptr);
		ptr = NULL;
		size = 1;
	}

	ret = realloc(ptr, size);
	if (!ret) {
		release_pack_memory(size, -1);
		ret = realloc(ptr, size);
		if (!ret)
			die("Out of memory, realloc failed");
	}
	return ret;
}
80
/*
 * calloc() that never returns NULL.
 *
 * When either dimension is zero and calloc() returns NULL, a 1x1 block
 * is allocated instead.  On failure release_pack_memory() is called and
 * the allocation is retried once; if that fails too the program dies.
 */
void *xcalloc(size_t nmemb, size_t size)
{
	int zero_sized = !nmemb || !size;
	void *ret = calloc(nmemb, size);

	if (!ret && zero_sized)
		ret = calloc(1, 1);
	if (ret)
		return ret;

	release_pack_memory(nmemb * size, -1);
	ret = calloc(nmemb, size);
	if (!ret && zero_sized)
		ret = calloc(1, 1);
	if (!ret)
		die("Out of memory, calloc failed");
	return ret;
}
96
/*
 * mmap() wrapper.  A failed mapping of zero length returns NULL; any
 * other failure triggers release_pack_memory() and one retry, and the
 * program dies if the retry fails as well.
 */
void *xmmap(void *start, size_t length,
	    int prot, int flags, int fd, off_t offset)
{
	void *map = mmap(start, length, prot, flags, fd, offset);

	if (map != MAP_FAILED)
		return map;
	if (!length)
		return NULL;

	release_pack_memory(length, fd);
	map = mmap(start, length, prot, flags, fd, offset);
	if (map == MAP_FAILED)
		die("Out of memory? mmap failed: %s", strerror(errno));
	return map;
}
111
/*
 * xread() is the same as read(), but it automatically restarts read()
 * operations with a recoverable error (EAGAIN and EINTR). xread()
 * DOES NOT GUARANTEE that "len" bytes are read even if the data is available.
 */
ssize_t xread(int fd, void *buf, size_t len)
{
	ssize_t nr;

	/* Keep retrying for as long as read() fails with EAGAIN or EINTR. */
	do {
		nr = read(fd, buf, len);
	} while (nr < 0 && (errno == EAGAIN || errno == EINTR));

	return nr;
}
127
/*
 * xwrite() is the same as write(), but it automatically restarts write()
 * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
 * GUARANTEE that "len" bytes are written even if the operation is successful.
 */
ssize_t xwrite(int fd, const void *buf, size_t len)
{
	ssize_t nr;

	/* Keep retrying for as long as write() fails with EAGAIN or EINTR. */
	do {
		nr = write(fd, buf, len);
	} while (nr < 0 && (errno == EAGAIN || errno == EINTR));

	return nr;
}
143
/*
 * Call xread() repeatedly until @count bytes have been stored in @buf.
 *
 * Returns the number of bytes read.  When xread() returns 0 or a
 * negative value, the bytes collected so far are returned if there are
 * any; otherwise that xread() result is returned as is.
 */
ssize_t read_in_full(int fd, void *buf, size_t count)
{
	char *cursor = buf;
	ssize_t done = 0;
	ssize_t got;

	for (; count > 0; count -= got, cursor += got, done += got) {
		got = xread(fd, cursor, count);
		if (got <= 0)
			return done ? done : got;
	}

	return done;
}
160
/*
 * Call xwrite() repeatedly until all @count bytes of @buf are written.
 *
 * Returns the number of bytes written, or -1 when xwrite() fails.  An
 * xwrite() that writes nothing is also treated as a failure, with errno
 * set to ENOSPC.
 */
ssize_t write_in_full(int fd, const void *buf, size_t count)
{
	const char *cursor = buf;
	ssize_t done = 0;

	while (count > 0) {
		ssize_t n = xwrite(fd, cursor, count);

		if (n <= 0) {
			if (n == 0)
				errno = ENOSPC;
			return -1;
		}
		done += n;
		cursor += n;
		count -= n;
	}

	return done;
}
181
/* dup() that dies with the strerror() text instead of returning an error. */
int xdup(int fd)
{
	int copy = dup(fd);

	if (copy < 0)
		die("dup failed: %s", strerror(errno));

	return copy;
}
189
/* fdopen() that dies with the strerror() text instead of returning NULL. */
FILE *xfdopen(int fd, const char *mode)
{
	FILE *fp = fdopen(fd, mode);

	if (!fp)
		die("Out of memory? fdopen failed: %s", strerror(errno));

	return fp;
}
197
/*
 * mkstemp() that dies with the strerror() text instead of returning an
 * error.  @template is passed to mkstemp() unchanged.
 */
int xmkstemp(char *template)
{
	int fd = mkstemp(template);

	if (fd < 0)
		die("Unable to create temporary file: %s", strerror(errno));

	return fd;
}