 arch/Kconfig | 7
 arch/x86/Kconfig | 1
 arch/x86/include/asm/Kbuild | 1
 arch/x86/include/asm/a.out-core.h | 10
 arch/x86/include/asm/debugreg.h | 33
 arch/x86/include/asm/hw_breakpoint.h | 73
 arch/x86/include/asm/processor.h | 14
 arch/x86/kernel/Makefile | 2
 arch/x86/kernel/cpu/Makefile | 1
 arch/x86/kernel/hw_breakpoint.c | 549
 arch/x86/kernel/kgdb.c | 6
 arch/x86/kernel/kprobes.c | 9
 arch/x86/kernel/machine_kexec_32.c | 2
 arch/x86/kernel/machine_kexec_64.c | 2
 arch/x86/kernel/process.c | 21
 arch/x86/kernel/process_32.c | 6
 arch/x86/kernel/process_64.c | 7
 arch/x86/kernel/ptrace.c | 293
 arch/x86/kernel/signal.c | 9
 arch/x86/kernel/traps.c | 73
 arch/x86/kvm/x86.c | 18
 arch/x86/mm/kmmio.c | 8
 arch/x86/power/cpu.c | 26
 arch/x86/tools/test_get_len.c | 17
 include/linux/ftrace_event.h | 9
 include/linux/hw_breakpoint.h | 140
 include/linux/perf_event.h | 48
 include/trace/ftrace.h | 22
 kernel/Makefile | 2
 kernel/exit.c | 5
 kernel/hw_breakpoint.c | 501
 kernel/kallsyms.c | 1
 kernel/perf_event.c | 517
 kernel/trace/Kconfig | 21
 kernel/trace/Makefile | 1
 kernel/trace/trace.h | 7
 kernel/trace/trace_entries.h | 16
 kernel/trace/trace_event_profile.c | 14
 kernel/trace/trace_kprobe.c | 50
 kernel/trace/trace_ksym.c | 554
 kernel/trace/trace_selftest.c | 55
 kernel/trace/trace_syscalls.c | 59
 samples/Kconfig | 6
 samples/Makefile | 3
 samples/hw_breakpoint/Makefile | 1
 samples/hw_breakpoint/data_breakpoint.c | 90
 tools/perf/Documentation/perf-kmem.txt | 44
 tools/perf/Documentation/perf-record.txt | 16
 tools/perf/Makefile | 49
 tools/perf/builtin-annotate.c | 12
 tools/perf/builtin-kmem.c | 833
 tools/perf/builtin-record.c | 29
 tools/perf/builtin-report.c | 21
 tools/perf/builtin-sched.c | 3
 tools/perf/builtin-timechart.c | 17
 tools/perf/builtin-top.c | 30
 tools/perf/builtin-trace.c | 3
 tools/perf/builtin.h | 1
 tools/perf/command-list.txt | 1
 tools/perf/perf.c | 27
 tools/perf/util/ctype.c | 8
 tools/perf/util/data_map.c | 86
 tools/perf/util/data_map.h | 2
 tools/perf/util/event.h | 10
 tools/perf/util/header.c | 163
 tools/perf/util/header.h | 9
 tools/perf/util/include/linux/bitops.h | 2
 tools/perf/util/map.c | 23
 tools/perf/util/parse-events.c | 84
 tools/perf/util/symbol.c | 505
 tools/perf/util/symbol.h | 11
 tools/perf/util/thread.h | 3
 tools/perf/util/trace-event-info.c | 22
 tools/perf/util/trace-event-read.c | 4
 tools/perf/util/trace-event.h | 2
 tools/perf/util/util.h | 3
 76 files changed, 4431 insertions(+), 902 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bbc261a..eef3bbb97075 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_HW_BREAKPOINT
+	bool
+	depends on HAVE_PERF_EVENTS
+	select ANON_INODES
+	select PERF_EVENTS
+
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 72ace9515a07..178084b4377c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,6 +49,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_HW_BREAKPOINT
 	select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <asm/debugreg.h>
 
 /*
  * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
 			>> PAGE_SHIFT;
 	dump->u_dsize -= dump->u_tsize;
 	dump->u_ssize = 0;
-	dump->u_debugreg[0] = current->thread.debugreg0;
-	dump->u_debugreg[1] = current->thread.debugreg1;
-	dump->u_debugreg[2] = current->thread.debugreg2;
-	dump->u_debugreg[3] = current->thread.debugreg3;
-	dump->u_debugreg[4] = 0;
-	dump->u_debugreg[5] = 0;
-	dump->u_debugreg[6] = current->thread.debugreg6;
-	dump->u_debugreg[7] = current->thread.debugreg7;
+	aout_dump_debugregs(dump);
 
 	if (dump->start_stack < TASK_SIZE)
 		dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..fdabd8435765 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
 #define DR_TRAP1	(0x2)		/* db1 */
 #define DR_TRAP2	(0x4)		/* db2 */
 #define DR_TRAP3	(0x8)		/* db3 */
+#define DR_TRAP_BITS	(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
 #define DR_STEP		(0x4000)	/* single-step */
 #define DR_SWITCH	(0x8000)	/* task switch */
@@ -49,6 +50,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set local bits for all 4 regs */
@@ -67,4 +70,34 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+	/* Zero the control register for HW Breakpoint */
+	set_debugreg(0UL, 7);
+
+	/* Zero-out the individual HW breakpoint address registers */
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+	return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
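The DR7 layout behind these macros is easiest to see with a worked example. The stand-alone sketch below is not part of the patch: it restates the arithmetic of encode_dr7() from the new arch/x86/kernel/hw_breakpoint.c further down, assuming the standard x86 field parameters from this header (DR_CONTROL_SHIFT = 16, DR_CONTROL_SIZE = 4, DR_ENABLE_SIZE = 2).

    #include <stdio.h>

    /* Field parameters restated from asm/debugreg.h (standard x86 values) */
    #define DR_CONTROL_SHIFT   16
    #define DR_CONTROL_SIZE     4
    #define DR_ENABLE_SIZE      2
    #define DR_GLOBAL_ENABLE    0x2
    #define DR_GLOBAL_SLOWDOWN  0x200

    /* Same arithmetic as encode_dr7() introduced later in this patch */
    static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
    {
        unsigned long bp_info = (len | type) & 0xf;

        bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
        bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
                   DR_GLOBAL_SLOWDOWN;
        return bp_info;
    }

    int main(void)
    {
        /* 4-byte write watchpoint in slot 0: LEN0 = 11b, R/W0 = 01b */
        printf("dr7 = %#lx\n", encode_dr7(0, 0x4c, 0x81)); /* prints 0xd0202 */
        return 0;
    }

The 0x4c and 0x81 arguments are X86_BREAKPOINT_LEN_4 and X86_BREAKPOINT_WRITE from the new asm/hw_breakpoint.h just below; only their low nibbles ever reach DR7.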
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..0675a7c4c20e
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+#ifndef _I386_HW_BREAKPOINT_H
+#define _I386_HW_BREAKPOINT_H
+
+#ifdef __KERNEL__
+#define __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1		0x40
+#define X86_BREAKPOINT_LEN_2		0x44
+#define X86_BREAKPOINT_LEN_4		0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE	0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8		0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+					 struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+					   unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+				  int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
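The 0x40/0x80 bases on the LEN and TYPE constants above are tag bits that never reach the hardware: encode_dr7() masks everything to the low nibble, and decode_dr7() ors the tags back in so decoded values compare equal to the X86_BREAKPOINT_* constants. A round-trip sketch (a userspace restatement for illustration, not part of the patch):

    #include <assert.h>

    /* Constants restated from the header above */
    #define X86_BREAKPOINT_LEN_4 0x4c
    #define X86_BREAKPOINT_WRITE 0x81

    /* Same arithmetic as decode_dr7() in arch/x86/kernel/hw_breakpoint.c */
    static int decode_dr7(unsigned long dr7, int bpnum,
                          unsigned *len, unsigned *type)
    {
        int bp_info = dr7 >> (16 + bpnum * 4);

        *len  = (bp_info & 0xc) | 0x40;
        *type = (bp_info & 0x3) | 0x80;
        return (dr7 >> (bpnum * 2)) & 0x3;
    }

    int main(void)
    {
        unsigned len, type;

        /* 0xd0202 is the encode_dr7() result from the earlier example */
        assert(decode_dr7(0xd0202, 0, &len, &type)); /* enable bits set */
        assert(len  == X86_BREAKPOINT_LEN_4);
        assert(type == X86_BREAKPOINT_WRITE);
        return 0;
    }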
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c9786480f0fe..6f8ec1c37e0a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
 #include <linux/math64.h>
 #include <linux/init.h>
 
+#define HBP_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Save middle states of ptrace breakpoints */
+	struct perf_event	*ptrace_bps[HBP_NUM];
+	/* Debug status used for traps, single steps, etc... */
+	unsigned long		debugreg6;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..4d267fb77828
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,549 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, dr7);
+EXPORT_PER_CPU_SYMBOL(dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	unsigned long bp_info;
+
+	bp_info = (len | type) & 0xf;
+	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return bp_info;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (bp_info & 0xc) | 0x40;
+	*type = (bp_info & 0x3) | 0x80;
+
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (!*slot) {
+			*slot = bp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	set_debugreg(info->address, i);
+	__get_cpu_var(cpu_debugreg[i]) = info->address;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (*slot == bp) {
+			*slot = NULL;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 &= ~encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+	unsigned int len_in_bytes = 0;
+
+	switch (hbp_len) {
+	case X86_BREAKPOINT_LEN_1:
+		len_in_bytes = 1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		len_in_bytes = 2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		len_in_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		len_in_bytes = 8;
+		break;
+#endif
+	}
+	return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	/*
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (info->name)
+		info->address = (unsigned long)
+				kallsyms_lookup_name(info->name);
+	if (info->address)
+		return 0;
+
+	return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
+{
+	/* Len */
+	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_1:
+		*gen_len = HW_BREAKPOINT_LEN_1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		*gen_len = HW_BREAKPOINT_LEN_2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		*gen_len = HW_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		*gen_len = HW_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (x86_type) {
+	case X86_BREAKPOINT_EXECUTE:
+		*gen_type = HW_BREAKPOINT_X;
+		break;
+	case X86_BREAKPOINT_WRITE:
+		*gen_type = HW_BREAKPOINT_W;
+		break;
+	case X86_BREAKPOINT_RW:
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	info->address = bp->attr.bp_addr;
+
+	/* Len */
+	switch (bp->attr.bp_len) {
+	case HW_BREAKPOINT_LEN_1:
+		info->len = X86_BREAKPOINT_LEN_1;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		info->len = X86_BREAKPOINT_LEN_2;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		info->len = X86_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		info->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+				  struct task_struct *tsk)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned int align;
+	int ret;
+
+
+	ret = arch_build_bp_info(bp);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+
+	if (info->type == X86_BREAKPOINT_EXECUTE)
+		/*
+		 * Ptrace-refactoring code
+		 * For now, we'll allow instruction breakpoint only for user-space
+		 * addresses
+		 */
+		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+			info->len != X86_BREAKPOINT_EXECUTE)
+			return ret;
+
+	switch (info->len) {
+	case X86_BREAKPOINT_LEN_1:
+		align = 0;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		align = 7;
+		break;
+#endif
+	default:
+		return ret;
+	}
+
+	if (bp->callback)
+		ret = arch_store_info(bp);
+
+	if (ret < 0)
+		return ret;
+	/*
+	 * Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (info->address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(info->address, info->len))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(info->address, info->len))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+	int i;
+	int dr7 = 0;
+	struct perf_event *bp;
+	struct arch_hw_breakpoint *info;
+	struct thread_struct *thread = &current->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		bp = thread->ptrace_bps[i];
+
+		if (bp && !bp->attr.disabled) {
+			dump->u_debugreg[i] = bp->attr.bp_addr;
+			info = counter_arch_bp(bp);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		} else {
+			dump->u_debugreg[i] = 0;
+		}
+	}
+
+	dump->u_debugreg[4] = 0;
+	dump->u_debugreg[5] = 0;
+	dump->u_debugreg[6] = current->thread.debugreg6;
+
+	dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		unregister_hw_breakpoint(t->ptrace_bps[i]);
+		t->ptrace_bps[i] = NULL;
+	}
+}
+
+void hw_breakpoint_restore(void)
+{
+	set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+	set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+	set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+	set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__get_cpu_var(dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i, cpu, rc = NOTIFY_STOP;
+	struct perf_event *bp;
+	unsigned long dr7, dr6;
+	unsigned long *dr6_p;
+
+	/* The DR6 value is pointed by args->err */
+	dr6_p = (unsigned long *)ERR_PTR(args->err);
+	dr6 = *dr6_p;
+
+	/* Do an early return if no trap bits are set in DR6 */
+	if ((dr6 & DR_TRAP_BITS) == 0)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+	/*
+	 * Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.debugreg6 &= ~DR_TRAP_BITS;
+	cpu = get_cpu();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HBP_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/*
+		 * The counter may be concurrently released but that can only
+		 * occur from a call_rcu() path. We can then safely fetch
+		 * the breakpoint, use its callback, touch its counter
+		 * while we are in an rcu_read_lock() path.
+		 */
+		rcu_read_lock();
+
+		bp = per_cpu(bp_per_reg[i], cpu);
+		if (bp)
+			rc = NOTIFY_DONE;
+		/*
+		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
+		 * exception handling
+		 */
+		(*dr6_p) &= ~(DR_TRAP0 << i);
+		/*
+		 * bp can be NULL due to lazy debug register switching
+		 * or due to concurrent perf counter removing.
+		 */
+		if (!bp) {
+			rcu_read_unlock();
+			break;
+		}
+
+		(bp->callback)(bp, args->regs);
+
+		rcu_read_unlock();
+	}
+	if (dr6 & (~DR_TRAP_BITS))
+		rc = NOTIFY_DONE;
+
+	set_debugreg(dr7, 7);
+	put_cpu();
+
+	return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+
+	return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+	/* TODO */
+}
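For orientation, here is a sketch of how a caller would arm a breakpoint through this layer. It is not part of the patch: it combines the register_user_hw_breakpoint() prototype declared in include/linux/hw_breakpoint.h later in this commit with a callback modeled on ptrace_triggered() from the ptrace changes below; the function names and the pr_info() body are purely illustrative.

    #include <linux/hw_breakpoint.h>
    #include <linux/perf_event.h>
    #include <linux/kernel.h>
    #include <linux/err.h>
    #include <linux/sched.h>

    /* Invoked from hw_breakpoint_handler() above when the watchpoint fires */
    static void sample_triggered(struct perf_event *bp, void *data)
    {
        pr_info("watchpoint at 0x%lx hit\n", hw_breakpoint_addr(bp));
    }

    /*
     * Arm a 4-byte write watchpoint on @addr in @tsk, active immediately
     * (the trailing 'true'). On success the perf layer ends up calling
     * arch_install_hw_breakpoint() above on the target cpu.
     */
    static struct perf_event *sample_arm(struct task_struct *tsk,
                                         unsigned long addr)
    {
        struct perf_event *bp;

        bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
                                         HW_BREAKPOINT_W,
                                         sample_triggered, tsk, true);
        return IS_ERR(bp) ? NULL : bp;
    }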
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..34e86b67550c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 
+#include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
 
@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
 			"resuming...\n");
 	kgdb_arch_handle_exception(args->trapnr, args->signr,
 				   args->err, "c", "", regs);
+	/*
+	 * Reset the BS bit in dr6 (pointed by args->err) to
+	 * denote completion of processing
+	 */
+	(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 
 	return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c5f1f117e0c0..3fe86d706a14 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -56,6 +56,7 @@
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/insn.h>
+#include <asm/debugreg.h>
 
 void jprobe_return_end(void);
 
@@ -945,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		ret = NOTIFY_STOP;
 		break;
 	case DIE_DEBUG:
-		if (post_kprobe_handler(args->regs))
+		if (post_kprobe_handler(args->regs)) {
+			/*
+			 * Reset the BS bit in dr6 (pointed by args->err) to
+			 * denote completion of processing
+			 */
+			(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 			ret = NOTIFY_STOP;
+		}
 		break;
 	case DIE_GPF:
 		/*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..c843f8406da2 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 				unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..744508e7cfdd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -17,6 +18,7 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@ void flush_thread(void)
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
+	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..d5bd3132ee70 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
+	err = -ENOMEM;
+
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						  IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..70cf15873f3d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+
 	return err;
 }
 
@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	if (preload_fpu)
 		__math_state_restore();
+
 	return prev_p;
 }
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4f76d275ee4..b25f8947ed7a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -34,6 +36,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -249,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -378,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -566,99 +555,229 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
+static void ptrace_triggered(struct perf_event *bp, void *data)
+{
+	int i;
+	struct thread_struct *thread = &(current->thread);
+
+	/*
+	 * Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->ptrace_bps[i] == bp)
+			break;
+	}
+
+	thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
+	int i;
+	int dr7 = 0;
+	struct arch_hw_breakpoint *info;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (bp[i] && !bp[i]->attr.disabled) {
+			info = counter_arch_bp(bp[i]);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		}
 	}
-	return 0;
+
+	return dr7;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
-	int i;
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long old_dr7;
+	int i, orig_ret = 0, rc = 0;
+	int enabled, second_pass = 0;
+	unsigned len, type;
+	int gen_len, gen_type;
+	struct perf_event *bp;
+
+	data &= ~DR_CONTROL_RESERVED;
+	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+	/*
+	 * Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+	for (i = 0; i < HBP_NUM; i++) {
+		enabled = decode_dr7(data, i, &len, &type);
+		bp = thread->ptrace_bps[i];
+
+		if (!enabled) {
+			if (bp) {
+				/*
+				 * Don't unregister the breakpoints right-away,
+				 * unless all register_user_hw_breakpoint()
+				 * requests have succeeded. This prevents
+				 * any window of opportunity for debug
+				 * register grabbing by other users.
+				 */
+				if (!second_pass)
+					continue;
+				thread->ptrace_bps[i] = NULL;
+				unregister_hw_breakpoint(bp);
+			}
+			continue;
+		}
 
-	if (unlikely(n == 4 || n == 5))
-		return -EIO;
+		/*
+		 * We should have at least an inactive breakpoint at this
+		 * slot. It means the user is writing dr7 without having
+		 * written the address register first
+		 */
+		if (!bp) {
+			rc = -EINVAL;
+			break;
+		}
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
+		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+		if (rc)
+			break;
 
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
+		/*
+		 * This is a temporary thing as bp is unregistered/registered
+		 * to simulate modification
+		 */
+		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
+					       gen_type, bp->callback,
+					       tsk, true);
+		thread->ptrace_bps[i] = NULL;
 
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
+		if (!bp) { /* incorrect bp, or we have a bug in bp API */
+			rc = -EINVAL;
+			break;
+		}
+		if (IS_ERR(bp)) {
+			rc = PTR_ERR(bp);
+			bp = NULL;
+			break;
+		}
+		thread->ptrace_bps[i] = bp;
+	}
+	/*
+	 * Make a second pass to free the remaining unused breakpoints
+	 * or to restore the original breakpoints if an error occurred.
+	 */
+	if (!second_pass) {
+		second_pass = 1;
+		if (rc < 0) {
+			orig_ret = rc;
+			data = old_dr7;
+		}
+		goto restore;
+	}
+	return ((orig_ret < 0) ? orig_ret : rc);
+}
 
-	case 7:
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long val = 0;
+
+	if (n < HBP_NUM) {
+		struct perf_event *bp;
+		bp = thread->ptrace_bps[n];
+		if (!bp)
+			return 0;
+		val = bp->hw.info.address;
+	} else if (n == 6) {
+		val = thread->debugreg6;
+	} else if (n == 7) {
+		val = ptrace_get_dr7(thread->ptrace_bps);
+	}
+	return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+				      unsigned long addr)
+{
+	struct perf_event *bp;
+	struct thread_struct *t = &tsk->thread;
+
+	if (!t->ptrace_bps[nr]) {
 		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
+		 * Put stub len and type to register (reserve) an inactive but
+		 * correct bp
 		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
-		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
+						 HW_BREAKPOINT_W,
+						 ptrace_triggered, tsk,
+						 false);
+	} else {
+		bp = t->ptrace_bps[nr];
+		t->ptrace_bps[nr] = NULL;
+		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
+					       bp->attr.bp_type,
+					       bp->callback,
+					       tsk,
+					       bp->attr.disabled);
 	}
 
+	if (!bp)
+		return -EIO;
+	/*
+	 * CHECKME: the previous code returned -EIO if the addr wasn't a
+	 * valid task virtual addr. The new one will return -EINVAL in this
+	 * case.
+	 * -EINVAL may be what we want for in-kernel breakpoints users, but
+	 * -EIO looks better for ptrace, since we refuse a register writing
+	 * for the user. And anyway this is the previous behaviour.
+	 */
+	if (IS_ERR(bp))
+		return PTR_ERR(bp);
+
+	t->ptrace_bps[nr] = bp;
+
 	return 0;
 }
 
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc = 0;
+
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
+		return -EIO;
+
+	if (n == 6) {
+		thread->debugreg6 = val;
+		goto ret_path;
+	}
+	if (n < HBP_NUM) {
+		rc = ptrace_set_breakpoint_addr(tsk, n, val);
+		if (rc)
+			return rc;
+	}
+	/* All that's left is DR7 */
+	if (n == 7)
+		rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+	return rc;
+}
+
 /*
  * These access the current or another (stopped) task's io permission
  * bitmap for debugging or core dump.
  */
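From the debugger's side nothing changes: the rewritten code keeps servicing the classic PTRACE_PEEKUSR/PTRACE_POKEUSR interface. A minimal userspace illustration (not part of the patch) of the paths above — writing u_debugreg[0] lands in ptrace_set_breakpoint_addr(), writing u_debugreg[7] lands in ptrace_write_dr7():

    #include <stddef.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/user.h>

    /*
     * Arm a 4-byte write watchpoint in slot 0 of a stopped tracee:
     * dr7 = 0xd0001 sets L0 (local enable), R/W0 = 01b (write) and
     * LEN0 = 11b (4 bytes).
     */
    static int set_watchpoint(pid_t pid, unsigned long addr)
    {
        if (ptrace(PTRACE_POKEUSER, pid,
                   offsetof(struct user, u_debugreg[0]), addr) < 0)
            return -1;
        return ptrace(PTRACE_POKEUSER, pid,
                      offsetof(struct user, u_debugreg[7]), 0xd0001UL);
    }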
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..fbf3b07c8567 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal. */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
 		return;
 
+	/* DR6 may or may not be cleared by the CPU */
+	set_debugreg(0, 6);
 	/*
 	 * The processor cleared BTF, so don't mark that we need it set.
 	 */
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-						SIGTRAP) == NOTIFY_STOP)
+	/* Store the virtualized DR6 value */
+	tsk->thread.debugreg6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+							SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
 	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.debugreg6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
 	}
-
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
-	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
-	 */
-clear_dr7:
-	set_debugreg(0, 7);
+	si_code = get_si_code(tsk->thread.debugreg6);
+	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
-	return;
 
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
-	preempt_conditional_cli(regs);
 	return;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d261527c..4fc80174191c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
+	/*
+	 * If the guest has used debug registers, at least dr7
+	 * will be disabled while returning to the host.
+	 * If we don't have active breakpoints in the host, we don't
+	 * care about the messed up debug address registers. But if
+	 * we have some of them active, restore the old state.
+	 */
+	if (hw_breakpoint_active())
+		hw_breakpoint_restore();
 
 	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..11a4ad4d6253 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 	struct die_args *arg = args;
 
 	if (val == DIE_DEBUG && (arg->err & DR_STEP))
-		if (post_kmmio_handler(arg->err, arg->regs) == 1)
+		if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+			/*
+			 * Reset the BS bit in dr6 (pointed by args->err) to
+			 * denote completion of processing
+			 */
+			(*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
 			return NOTIFY_STOP;
+		}
 
 	return NOTIFY_DONE;
 }
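This is the third place (after kgdb.c and kprobes.c above) that gains the same idiom, so it is worth spelling out once: do_debug() now passes a pointer to the virtualized DR6 value through the die_args error field, disguised with PTR_ERR(), and every DIE_DEBUG consumer clears the bits it handled so later handlers and do_debug() itself only see the conditions still outstanding. A sketch of a notifier following that contract (illustrative, not part of the patch):

    #include <linux/kdebug.h>
    #include <linux/notifier.h>
    #include <linux/err.h>
    #include <asm/debugreg.h>

    static int sample_die_notifier(struct notifier_block *nb,
                                   unsigned long val, void *data)
    {
        struct die_args *args = data;
        unsigned long *dr6_p;

        if (val != DIE_DEBUG)
            return NOTIFY_DONE;

        /* The DR6 value is pointed to by args->err, as in do_debug() */
        dr6_p = (unsigned long *)ERR_PTR(args->err);
        if (*dr6_p & DR_STEP) {
            /* ... consume the single-step condition here ... */
            *dr6_p &= ~DR_STEP; /* mark it handled */
            return NOTIFY_STOP;
        }
        return NOTIFY_DONE;
    }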
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
 #include <asm/mce.h>
 #include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/debugreg.h>
 
 #ifdef CONFIG_X86_32
 static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
-
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-#else
-		/* CONFIG_X86_64 */
-		loaddebug(&current->thread, 0);
-		loaddebug(&current->thread, 1);
-		loaddebug(&current->thread, 2);
-		loaddebug(&current->thread, 3);
-		/* no 4 and 5 */
-		loaddebug(&current->thread, 6);
-		loaddebug(&current->thread, 7);
-#endif
-	}
-
 }
 
 /**
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index af75e07217ba..d8214dc03fa7 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -114,6 +114,7 @@ int main(int argc, char **argv)
114 unsigned char insn_buf[16]; 114 unsigned char insn_buf[16];
115 struct insn insn; 115 struct insn insn;
116 int insns = 0, c; 116 int insns = 0, c;
117 int warnings = 0;
117 118
118 parse_args(argc, argv); 119 parse_args(argc, argv);
119 120
@@ -151,18 +152,22 @@ int main(int argc, char **argv)
151 insn_init(&insn, insn_buf, x86_64); 152 insn_init(&insn, insn_buf, x86_64);
152 insn_get_length(&insn); 153 insn_get_length(&insn);
153 if (insn.length != nb) { 154 if (insn.length != nb) {
154 fprintf(stderr, "Error: %s found a difference at %s\n", 155 warnings++;
156 fprintf(stderr, "Warning: %s found a difference at %s\n",
155 prog, sym); 157 prog, sym);
156 fprintf(stderr, "Error: %s", line); 158 fprintf(stderr, "Warning: %s", line);
157 fprintf(stderr, "Error: objdump says %d bytes, but " 159 fprintf(stderr, "Warning: objdump says %d bytes, but "
158 "insn_get_length() says %d\n", nb, 160 "insn_get_length() says %d\n", nb,
159 insn.length); 161 insn.length);
160 if (verbose) 162 if (verbose)
161 dump_insn(stderr, &insn); 163 dump_insn(stderr, &insn);
162 exit(2);
163 } 164 }
164 } 165 }
165 fprintf(stderr, "Succeed: decoded and checked %d instructions\n", 166 if (warnings)
166 insns); 167 fprintf(stderr, "Warning: decoded and checked %d"
168 " instructions with %d warnings\n", insns, warnings);
169 else
170 fprintf(stderr, "Success: decoded and checked %d"
171 " instructions\n", insns);
167 return 0; 172 return 0;
168} 173}
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 43360c1d8f70..47bbdf9c38d0 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,13 +137,8 @@ struct ftrace_event_call {
137 137
138#define FTRACE_MAX_PROFILE_SIZE 2048 138#define FTRACE_MAX_PROFILE_SIZE 2048
139 139
140struct perf_trace_buf { 140extern char *perf_trace_buf;
141 char buf[FTRACE_MAX_PROFILE_SIZE]; 141extern char *perf_trace_buf_nmi;
142 int recursion;
143};
144
145extern struct perf_trace_buf *perf_trace_buf;
146extern struct perf_trace_buf *perf_trace_buf_nmi;
147 142
148#define MAX_FILTER_PRED 32 143#define MAX_FILTER_PRED 32
149#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ 144#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
new file mode 100644
index 000000000000..c9f7f7c7b0e0
--- /dev/null
+++ b/include/linux/hw_breakpoint.h
@@ -0,0 +1,140 @@
1#ifndef _LINUX_HW_BREAKPOINT_H
2#define _LINUX_HW_BREAKPOINT_H
3
4enum {
5 HW_BREAKPOINT_LEN_1 = 1,
6 HW_BREAKPOINT_LEN_2 = 2,
7 HW_BREAKPOINT_LEN_4 = 4,
8 HW_BREAKPOINT_LEN_8 = 8,
9};
10
11enum {
12 HW_BREAKPOINT_R = 1,
13 HW_BREAKPOINT_W = 2,
14 HW_BREAKPOINT_X = 4,
15};
16
17#ifdef __KERNEL__
18
19#include <linux/perf_event.h>
20
21#ifdef CONFIG_HAVE_HW_BREAKPOINT
22
23static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
24{
25 return bp->attr.bp_addr;
26}
27
28static inline int hw_breakpoint_type(struct perf_event *bp)
29{
30 return bp->attr.bp_type;
31}
32
33static inline int hw_breakpoint_len(struct perf_event *bp)
34{
35 return bp->attr.bp_len;
36}
37
38extern struct perf_event *
39register_user_hw_breakpoint(unsigned long addr,
40 int len,
41 int type,
42 perf_callback_t triggered,
43 struct task_struct *tsk,
44 bool active);
45
46/* FIXME: only change from the attr, and don't unregister */
47extern struct perf_event *
48modify_user_hw_breakpoint(struct perf_event *bp,
49 unsigned long addr,
50 int len,
51 int type,
52 perf_callback_t triggered,
53 struct task_struct *tsk,
54 bool active);
55
56/*
57 * Kernel breakpoints are not associated with any particular thread.
58 */
59extern struct perf_event *
60register_wide_hw_breakpoint_cpu(unsigned long addr,
61 int len,
62 int type,
63 perf_callback_t triggered,
64 int cpu,
65 bool active);
66
67extern struct perf_event **
68register_wide_hw_breakpoint(unsigned long addr,
69 int len,
70 int type,
71 perf_callback_t triggered,
72 bool active);
73
74extern int register_perf_hw_breakpoint(struct perf_event *bp);
75extern int __register_perf_hw_breakpoint(struct perf_event *bp);
76extern void unregister_hw_breakpoint(struct perf_event *bp);
77extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
78
79extern int reserve_bp_slot(struct perf_event *bp);
80extern void release_bp_slot(struct perf_event *bp);
81
82extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
83
84static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
85{
86 return &bp->hw.info;
87}
88
89#else /* !CONFIG_HAVE_HW_BREAKPOINT */
90
91static inline struct perf_event *
92register_user_hw_breakpoint(unsigned long addr,
93 int len,
94 int type,
95 perf_callback_t triggered,
96 struct task_struct *tsk,
97 bool active) { return NULL; }
98static inline struct perf_event *
99modify_user_hw_breakpoint(struct perf_event *bp,
100 unsigned long addr,
101 int len,
102 int type,
103 perf_callback_t triggered,
104 struct task_struct *tsk,
105 bool active) { return NULL; }
106static inline struct perf_event *
107register_wide_hw_breakpoint_cpu(unsigned long addr,
108 int len,
109 int type,
110 perf_callback_t triggered,
111 int cpu,
112 bool active) { return NULL; }
113static inline struct perf_event **
114register_wide_hw_breakpoint(unsigned long addr,
115 int len,
116 int type,
117 perf_callback_t triggered,
118 bool active) { return NULL; }
119static inline int
120register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; }
121static inline int
122__register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; }
123static inline void unregister_hw_breakpoint(struct perf_event *bp) { }
124static inline void
125unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { }
126static inline int
127reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; }
128static inline void release_bp_slot(struct perf_event *bp) { }
129
130static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { }
131
132static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
133{
134 return NULL;
135}
136
137#endif /* CONFIG_HAVE_HW_BREAKPOINT */
138#endif /* __KERNEL__ */
139
140#endif /* _LINUX_HW_BREAKPOINT_H */
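
The registration API declared above is meant to be driven like the samples/hw_breakpoint/data_breakpoint.c module added elsewhere in this patch. A minimal consumer sketch; the watched symbol name and the handler body are purely illustrative (kallsyms_lookup_name() is exported by the kernel/kallsyms.c hunk further down):

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/kallsyms.h>
	#include <linux/perf_event.h>
	#include <linux/hw_breakpoint.h>

	static struct perf_event **sample_hbp;

	/* matches perf_callback_t from the perf_event.h hunk below */
	static void sample_hbp_handler(struct perf_event *bp, void *data)
	{
		printk(KERN_INFO "watched symbol was written to\n");
		dump_stack();
	}

	static int __init hw_break_module_init(void)
	{
		/* "some_kernel_symbol" is a placeholder */
		unsigned long addr = kallsyms_lookup_name("some_kernel_symbol");

		/* one 4-byte write watchpoint on every cpu, armed immediately */
		sample_hbp = register_wide_hw_breakpoint(addr,
							 HW_BREAKPOINT_LEN_4,
							 HW_BREAKPOINT_W,
							 sample_hbp_handler,
							 true);
		if (IS_ERR(sample_hbp))
			return PTR_ERR(sample_hbp);
		return 0;
	}

	static void __exit hw_break_module_exit(void)
	{
		unregister_wide_hw_breakpoint(sample_hbp);
	}

	module_init(hw_break_module_init);
	module_exit(hw_break_module_exit);
	MODULE_LICENSE("GPL");
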
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7f87563c8485..43adbd7f0010 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -18,6 +18,10 @@
18#include <linux/ioctl.h> 18#include <linux/ioctl.h>
19#include <asm/byteorder.h> 19#include <asm/byteorder.h>
20 20
21#ifdef CONFIG_HAVE_HW_BREAKPOINT
22#include <asm/hw_breakpoint.h>
23#endif
24
21/* 25/*
22 * User-space ABI bits: 26 * User-space ABI bits:
23 */ 27 */
@@ -31,6 +35,7 @@ enum perf_type_id {
31 PERF_TYPE_TRACEPOINT = 2, 35 PERF_TYPE_TRACEPOINT = 2,
32 PERF_TYPE_HW_CACHE = 3, 36 PERF_TYPE_HW_CACHE = 3,
33 PERF_TYPE_RAW = 4, 37 PERF_TYPE_RAW = 4,
38 PERF_TYPE_BREAKPOINT = 5,
34 39
35 PERF_TYPE_MAX, /* non-ABI */ 40 PERF_TYPE_MAX, /* non-ABI */
36}; 41};
@@ -209,6 +214,15 @@ struct perf_event_attr {
209 __u32 wakeup_events; /* wakeup every n events */ 214 __u32 wakeup_events; /* wakeup every n events */
210 __u32 wakeup_watermark; /* bytes before wakeup */ 215 __u32 wakeup_watermark; /* bytes before wakeup */
211 }; 216 };
217
218 union {
219 struct { /* Hardware breakpoint info */
220 __u64 bp_addr;
221 __u32 bp_type;
222 __u32 bp_len;
223 };
224 };
225
212 __u32 __reserved_2; 226 __u32 __reserved_2;
213 227
214 __u64 __reserved_3; 228 __u64 __reserved_3;
@@ -478,6 +492,11 @@ struct hw_perf_event {
478 s64 remaining; 492 s64 remaining;
479 struct hrtimer hrtimer; 493 struct hrtimer hrtimer;
480 }; 494 };
495#ifdef CONFIG_HAVE_HW_BREAKPOINT
496 union { /* breakpoint */
497 struct arch_hw_breakpoint info;
498 };
499#endif
481 }; 500 };
482 atomic64_t prev_count; 501 atomic64_t prev_count;
483 u64 sample_period; 502 u64 sample_period;
@@ -546,6 +565,10 @@ struct perf_pending_entry {
546 void (*func)(struct perf_pending_entry *); 565 void (*func)(struct perf_pending_entry *);
547}; 566};
548 567
568typedef void (*perf_callback_t)(struct perf_event *, void *);
569
570struct perf_sample_data;
571
549/** 572/**
550 * struct perf_event - performance event kernel representation: 573 * struct perf_event - performance event kernel representation:
551 */ 574 */
@@ -588,7 +611,7 @@ struct perf_event {
588 u64 tstamp_running; 611 u64 tstamp_running;
589 u64 tstamp_stopped; 612 u64 tstamp_stopped;
590 613
591 struct perf_event_attr attr; 614 struct perf_event_attr attr;
592 struct hw_perf_event hw; 615 struct hw_perf_event hw;
593 616
594 struct perf_event_context *ctx; 617 struct perf_event_context *ctx;
@@ -637,10 +660,18 @@ struct perf_event {
637 struct pid_namespace *ns; 660 struct pid_namespace *ns;
638 u64 id; 661 u64 id;
639 662
663 void (*overflow_handler)(struct perf_event *event,
664 int nmi, struct perf_sample_data *data,
665 struct pt_regs *regs);
666
640#ifdef CONFIG_EVENT_PROFILE 667#ifdef CONFIG_EVENT_PROFILE
641 struct event_filter *filter; 668 struct event_filter *filter;
642#endif 669#endif
643 670
671 perf_callback_t callback;
672
673 perf_callback_t event_callback;
674
644#endif /* CONFIG_PERF_EVENTS */ 675#endif /* CONFIG_PERF_EVENTS */
645}; 676};
646 677
@@ -745,6 +776,14 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader,
745 struct perf_cpu_context *cpuctx, 776 struct perf_cpu_context *cpuctx,
746 struct perf_event_context *ctx, int cpu); 777 struct perf_event_context *ctx, int cpu);
747extern void perf_event_update_userpage(struct perf_event *event); 778extern void perf_event_update_userpage(struct perf_event *event);
779extern int perf_event_release_kernel(struct perf_event *event);
780extern struct perf_event *
781perf_event_create_kernel_counter(struct perf_event_attr *attr,
782 int cpu,
783 pid_t pid,
784 perf_callback_t callback);
785extern u64 perf_event_read_value(struct perf_event *event,
786 u64 *enabled, u64 *running);
748 787
749struct perf_sample_data { 788struct perf_sample_data {
750 u64 type; 789 u64 type;
@@ -821,6 +860,7 @@ extern int sysctl_perf_event_sample_rate;
821extern void perf_event_init(void); 860extern void perf_event_init(void);
822extern void perf_tp_event(int event_id, u64 addr, u64 count, 861extern void perf_tp_event(int event_id, u64 addr, u64 count,
823 void *record, int entry_size); 862 void *record, int entry_size);
863extern void perf_bp_event(struct perf_event *event, void *data);
824 864
825#ifndef perf_misc_flags 865#ifndef perf_misc_flags
826#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ 866#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@ -834,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
834extern void perf_output_end(struct perf_output_handle *handle); 874extern void perf_output_end(struct perf_output_handle *handle);
835extern void perf_output_copy(struct perf_output_handle *handle, 875extern void perf_output_copy(struct perf_output_handle *handle,
836 const void *buf, unsigned int len); 876 const void *buf, unsigned int len);
877extern int perf_swevent_get_recursion_context(void);
878extern void perf_swevent_put_recursion_context(int rctx);
837#else 879#else
838static inline void 880static inline void
839perf_event_task_sched_in(struct task_struct *task, int cpu) { } 881perf_event_task_sched_in(struct task_struct *task, int cpu) { }
@@ -855,11 +897,15 @@ static inline int perf_event_task_enable(void) { return -EINVAL; }
855static inline void 897static inline void
856perf_sw_event(u32 event_id, u64 nr, int nmi, 898perf_sw_event(u32 event_id, u64 nr, int nmi,
857 struct pt_regs *regs, u64 addr) { } 899 struct pt_regs *regs, u64 addr) { }
900static inline void
901perf_bp_event(struct perf_event *event, void *data) { }
858 902
859static inline void perf_event_mmap(struct vm_area_struct *vma) { } 903static inline void perf_event_mmap(struct vm_area_struct *vma) { }
860static inline void perf_event_comm(struct task_struct *tsk) { } 904static inline void perf_event_comm(struct task_struct *tsk) { }
861static inline void perf_event_fork(struct task_struct *tsk) { } 905static inline void perf_event_fork(struct task_struct *tsk) { }
862static inline void perf_event_init(void) { } 906static inline void perf_event_init(void) { }
907static inline int perf_swevent_get_recursion_context(void) { return -1; }
908static inline void perf_swevent_put_recursion_context(int rctx) { }
863 909
864#endif 910#endif
865 911
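
Taken together, the new PERF_TYPE_BREAKPOINT type, the bp_* fields in perf_event_attr and perf_event_create_kernel_counter() compose as below. This mirrors the register_user_hw_breakpoint_cpu() helper added in kernel/hw_breakpoint.c further down; addr, cpu and my_callback are assumed locals in a caller:

	struct perf_event_attr attr;
	struct perf_event *bp;

	memset(&attr, 0, sizeof(attr));
	attr.type    = PERF_TYPE_BREAKPOINT;
	attr.size    = sizeof(attr);
	attr.bp_addr = addr;			/* data address to watch */
	attr.bp_type = HW_BREAKPOINT_W;		/* fire on writes */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;	/* watch 4 bytes */
	attr.pinned  = 1;			/* breakpoints must not be rotated out */

	/* pid == -1 with cpu >= 0: count for every task on that cpu */
	bp = perf_event_create_kernel_counter(&attr, cpu, -1, my_callback);
	if (IS_ERR(bp))
		return PTR_ERR(bp);
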
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 4945d1c99864..c3417c13e3ed 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,17 +724,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
724static void ftrace_profile_##call(proto) \ 724static void ftrace_profile_##call(proto) \
725{ \ 725{ \
726 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 726 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
727 extern int perf_swevent_get_recursion_context(void); \
728 extern void perf_swevent_put_recursion_context(int rctx); \
727 struct ftrace_event_call *event_call = &event_##call; \ 729 struct ftrace_event_call *event_call = &event_##call; \
728 extern void perf_tp_event(int, u64, u64, void *, int); \ 730 extern void perf_tp_event(int, u64, u64, void *, int); \
729 struct ftrace_raw_##call *entry; \ 731 struct ftrace_raw_##call *entry; \
730 struct perf_trace_buf *trace_buf; \
731 u64 __addr = 0, __count = 1; \ 732 u64 __addr = 0, __count = 1; \
732 unsigned long irq_flags; \ 733 unsigned long irq_flags; \
733 struct trace_entry *ent; \ 734 struct trace_entry *ent; \
734 int __entry_size; \ 735 int __entry_size; \
735 int __data_size; \ 736 int __data_size; \
737 char *trace_buf; \
736 char *raw_data; \ 738 char *raw_data; \
737 int __cpu; \ 739 int __cpu; \
740 int rctx; \
738 int pc; \ 741 int pc; \
739 \ 742 \
740 pc = preempt_count(); \ 743 pc = preempt_count(); \
@@ -749,6 +752,11 @@ static void ftrace_profile_##call(proto) \
749 return; \ 752 return; \
750 \ 753 \
751 local_irq_save(irq_flags); \ 754 local_irq_save(irq_flags); \
755 \
756 rctx = perf_swevent_get_recursion_context(); \
757 if (rctx < 0) \
758 goto end_recursion; \
759 \
752 __cpu = smp_processor_id(); \ 760 __cpu = smp_processor_id(); \
753 \ 761 \
754 if (in_nmi()) \ 762 if (in_nmi()) \
@@ -759,13 +767,7 @@ static void ftrace_profile_##call(proto) \
759 if (!trace_buf) \ 767 if (!trace_buf) \
760 goto end; \ 768 goto end; \
761 \ 769 \
762 trace_buf = per_cpu_ptr(trace_buf, __cpu); \ 770 raw_data = per_cpu_ptr(trace_buf, __cpu); \
763 if (trace_buf->recursion++) \
764 goto end_recursion; \
765 \
766 barrier(); \
767 \
768 raw_data = trace_buf->buf; \
769 \ 771 \
770 *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ 772 *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
771 entry = (struct ftrace_raw_##call *)raw_data; \ 773 entry = (struct ftrace_raw_##call *)raw_data; \
@@ -780,9 +782,9 @@ static void ftrace_profile_##call(proto) \
780 perf_tp_event(event_call->id, __addr, __count, entry, \ 782 perf_tp_event(event_call->id, __addr, __count, entry, \
781 __entry_size); \ 783 __entry_size); \
782 \ 784 \
783end_recursion: \
784 trace_buf->recursion--; \
785end: \ 785end: \
786 perf_swevent_put_recursion_context(rctx); \
787end_recursion: \
786 local_irq_restore(irq_flags); \ 788 local_irq_restore(irq_flags); \
787 \ 789 \
788} 790}
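
The rewritten macro reduces to the following get/put discipline around the per-cpu buffer; the rctx returned by perf_swevent_get_recursion_context() identifies which preemption level (task, softirq, irq or NMI) the caller runs in, and a negative value means that level is already inside a software event and must drop this one:

	int rctx;

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;		/* recursion detected at this level: bail out */

	/* ... fill and submit the per-cpu trace buffer for level rctx ... */

	perf_swevent_put_recursion_context(rctx);
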
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..6b7ce8173dfd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -95,6 +96,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_PERF_EVENTS) += perf_event.o 98obj-$(CONFIG_PERF_EVENTS) += perf_event.o
99obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
98 100
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code)
978 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
979 980
980 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
981 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
983 */ 988 */
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..06d372fc026d
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,501 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the total number of pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
71 * has in this cpu
72 */
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
77
78 for (i = HBP_NUM - 1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
86/*
87 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
91{
92 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96
97 return;
98 }
99
100 for_each_online_cpu(cpu) {
101 unsigned int nr;
102
103 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu);
105
106 if (nr > slots->pinned)
107 slots->pinned = nr;
108
109 nr = per_cpu(nr_bp_flexible, cpu);
110
111 if (nr > slots->flexible)
112 slots->flexible = nr;
113 }
114}
115
116/*
117 * Add or remove a pinned breakpoint for the given task in our constraint table
118 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *task_bp_pinned;
125 struct list_head *list;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143
144 spin_unlock_irqrestore(&ctx->lock, flags);
145
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
147 return;
148
149 task_bp_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) {
151 task_bp_pinned[count]++;
152 if (count > 0)
153 task_bp_pinned[count-1]--;
154 } else {
155 task_bp_pinned[count]--;
156 if (count > 0)
157 task_bp_pinned[count-1]++;
158 }
159}
160
161/*
162 * Add/remove the given breakpoint in our constraint table
163 */
164static void toggle_bp_slot(struct perf_event *bp, bool enable)
165{
166 int cpu = bp->cpu;
167 struct task_struct *tsk = bp->ctx->task;
168
169 /* Pinned counter task profiling */
170 if (tsk) {
171 if (cpu >= 0) {
172 toggle_bp_task_slot(tsk, cpu, enable);
173 return;
174 }
175
176 for_each_online_cpu(cpu)
177 toggle_bp_task_slot(tsk, cpu, enable);
178 return;
179 }
180
181 /* Pinned counter cpu profiling */
182 if (enable)
183 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
184 else
185 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
186}
187
188/*
189 * Constraints to check before allowing this new breakpoint counter:
190 *
191 * == Non-pinned counter == (Considered as pinned for now)
192 *
193 * - If attached to a single cpu, check:
194 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
197 *
198 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them.
200 * Otherwise, we check that the maximum number of per task
201 * breakpoints (for this cpu) plus the number of per cpu breakpoints
202 * (for this cpu) doesn't cover every register.
203 *
204 * - If attached to every cpu, check:
205 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
208 *
209 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per task
211 * breakpoints.
212 *
213 *
214 * == Pinned counter ==
215 *
216 * - If attached to a single cpu, check:
217 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
220 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * at least one register (or they will never be fed).
223 *
224 * - If attached to every cpu, check:
225 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
228 */
229int reserve_bp_slot(struct perf_event *bp)
230{
231 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235
236 fetch_bp_busy_slots(&slots, bp->cpu);
237
238 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
240 ret = -ENOSPC;
241 goto end;
242 }
243
244 toggle_bp_slot(bp, true);
245
246end:
247 mutex_unlock(&nr_bp_mutex);
248
249 return ret;
250}
251
252void release_bp_slot(struct perf_event *bp)
253{
254 mutex_lock(&nr_bp_mutex);
255
256 toggle_bp_slot(bp, false);
257
258 mutex_unlock(&nr_bp_mutex);
259}
260
261
262int __register_perf_hw_breakpoint(struct perf_event *bp)
263{
264 int ret;
265
266 ret = reserve_bp_slot(bp);
267 if (ret)
268 return ret;
269
270 /*
271 * Ptrace breakpoints can be temporary perf events, only
272 * meant to reserve a slot. In that case they are created disabled and
273 * we don't want to check the params right now (as we put a null addr).
274 * But perf tools create events as disabled and we want to check
275 * the params for them.
276 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace
278 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281
282 return ret;
283}
284
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288
289 return __register_perf_hw_breakpoint(bp);
290}
291
292/*
293 * Register a breakpoint bound to a task and a given cpu.
294 * If cpu is -1, the breakpoint is active for the task on every cpu.
295 * If pid is -1, the breakpoint is active for every task on the given
296 * cpu.
297 */
298static struct perf_event *
299register_user_hw_breakpoint_cpu(unsigned long addr,
300 int len,
301 int type,
302 perf_callback_t triggered,
303 pid_t pid,
304 int cpu,
305 bool active)
306{
307 struct perf_event_attr *attr;
308 struct perf_event *bp;
309
310 attr = kzalloc(sizeof(*attr), GFP_KERNEL);
311 if (!attr)
312 return ERR_PTR(-ENOMEM);
313
314 attr->type = PERF_TYPE_BREAKPOINT;
315 attr->size = sizeof(*attr);
316 attr->bp_addr = addr;
317 attr->bp_len = len;
318 attr->bp_type = type;
319 /*
320 * Such breakpoints are used by debuggers to trigger signals when
321 * we hit the expected memory op. We can't miss such events, they
322 * must be pinned.
323 */
324 attr->pinned = 1;
325
326 if (!active)
327 attr->disabled = 1;
328
329 bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
330 kfree(attr);
331
332 return bp;
333}
334
335/**
336 * register_user_hw_breakpoint - register a hardware breakpoint for user space
337 * @addr: the memory address that triggers the breakpoint
338 * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
339 * @type: the type of the access to the memory (read/write/exec)
340 * @triggered: callback to trigger when we hit the breakpoint
341 * @tsk: pointer to 'task_struct' of the process to which the address belongs
342 * @active: should we activate it while registering it
343 *
344 */
345struct perf_event *
346register_user_hw_breakpoint(unsigned long addr,
347 int len,
348 int type,
349 perf_callback_t triggered,
350 struct task_struct *tsk,
351 bool active)
352{
353 return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
354 tsk->pid, -1, active);
355}
356EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
357
358/**
359 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
360 * @bp: the breakpoint structure to modify
361 * @addr: the memory address that triggers the breakpoint
362 * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
363 * @type: the type of the access to the memory (read/write/exec)
364 * @triggered: callback to trigger when we hit the breakpoint
365 * @tsk: pointer to 'task_struct' of the process to which the address belongs
366 * @active: should we activate it while registering it
367 */
368struct perf_event *
369modify_user_hw_breakpoint(struct perf_event *bp,
370 unsigned long addr,
371 int len,
372 int type,
373 perf_callback_t triggered,
374 struct task_struct *tsk,
375 bool active)
376{
377 /*
378 * FIXME: do it without unregistering
379 * - We don't want to lose our slot
380 * - If the new bp is incorrect, don't lose the older one
381 */
382 unregister_hw_breakpoint(bp);
383
384 return register_user_hw_breakpoint(addr, len, type, triggered,
385 tsk, active);
386}
387EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
388
389/**
390 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
391 * @bp: the breakpoint structure to unregister
392 */
393void unregister_hw_breakpoint(struct perf_event *bp)
394{
395 if (!bp)
396 return;
397 perf_event_release_kernel(bp);
398}
399EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
400
401static struct perf_event *
402register_kernel_hw_breakpoint_cpu(unsigned long addr,
403 int len,
404 int type,
405 perf_callback_t triggered,
406 int cpu,
407 bool active)
408{
409 return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
410 -1, cpu, active);
411}
412
413/**
414 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
415 * @addr: the memory address that triggers the breakpoint
416 * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
417 * @type: the type of the access to the memory (read/write/exec)
418 * @triggered: callback to trigger when we hit the breakpoint
419 * @active: should we activate it while registering it
420 *
421 * @return a set of per_cpu pointers to perf events
422 */
423struct perf_event **
424register_wide_hw_breakpoint(unsigned long addr,
425 int len,
426 int type,
427 perf_callback_t triggered,
428 bool active)
429{
430 struct perf_event **cpu_events, **pevent, *bp;
431 long err;
432 int cpu;
433
434 cpu_events = alloc_percpu(typeof(*cpu_events));
435 if (!cpu_events)
436 return ERR_PTR(-ENOMEM);
437
438 for_each_possible_cpu(cpu) {
439 pevent = per_cpu_ptr(cpu_events, cpu);
440 bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
441 triggered, cpu, active);
442
443 *pevent = bp;
444
445 if (IS_ERR(bp) || !bp) {
446 err = PTR_ERR(bp);
447 goto fail;
448 }
449 }
450
451 return cpu_events;
452
453fail:
454 for_each_possible_cpu(cpu) {
455 pevent = per_cpu_ptr(cpu_events, cpu);
456 if (IS_ERR(*pevent) || !*pevent)
457 break;
458 unregister_hw_breakpoint(*pevent);
459 }
460 free_percpu(cpu_events);
461 /* return the error if any */
462 return ERR_PTR(err);
463}
464EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
465
466/**
467 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
468 * @cpu_events: the per cpu set of events to unregister
469 */
470void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
471{
472 int cpu;
473 struct perf_event **pevent;
474
475 for_each_possible_cpu(cpu) {
476 pevent = per_cpu_ptr(cpu_events, cpu);
477 unregister_hw_breakpoint(*pevent);
478 }
479 free_percpu(cpu_events);
480}
481EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
482
483static struct notifier_block hw_breakpoint_exceptions_nb = {
484 .notifier_call = hw_breakpoint_exceptions_notify,
485 /* we need to be notified first */
486 .priority = 0x7fffffff
487};
488
489static int __init init_hw_breakpoint(void)
490{
491 return register_die_notifier(&hw_breakpoint_exceptions_nb);
492}
493core_initcall(init_hw_breakpoint);
494
495
496struct pmu perf_ops_bp = {
497 .enable = arch_install_hw_breakpoint,
498 .disable = arch_uninstall_hw_breakpoint,
499 .read = hw_breakpoint_pmu_read,
500 .unthrottle = hw_breakpoint_pmu_unthrottle
501};
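
To make the reservation rule above concrete, a worked example assuming the x86 value HBP_NUM == 4: if a cpu already carries one cpu-wide pinned breakpoint (nr_cpu_bp_pinned == 1), its busiest task holds two pinned breakpoints (max_task_bp_pinned() == 2) and at least one flexible counter exists (nr_bp_flexible == 1), then fetch_bp_busy_slots() reports pinned = 1 + 2 = 3, and reserve_bp_slot() computes 3 + !!1 == 4 == HBP_NUM, so the new pinned breakpoint is refused with -ENOSPC: granting it would leave the flexible counters with no debug register to ever run on.
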
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3256e36ad251..accfd7bfe387 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -29,6 +29,7 @@
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h> 31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
32 33
33#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
34 35
@@ -245,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
245 put_ctx(ctx); 246 put_ctx(ctx);
246} 247}
247 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for an event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
248/* 292/*
249 * Add an event to the lists for its context. 293 * Add an event to the lists for its context.
250 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -293,6 +337,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
293 if (event->group_leader != event) 337 if (event->group_leader != event)
294 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
295 339
340 update_event_times(event);
341 event->state = PERF_EVENT_STATE_OFF;
342
296 /* 343 /*
297 * If this was a group event with sibling events then 344 * If this was a group event with sibling events then
298 * upgrade the siblings to singleton events by adding them 345 * upgrade the siblings to singleton events by adding them
@@ -446,50 +493,11 @@ retry:
446 * can remove the event safely, if the call above did not 493 * can remove the event safely, if the call above did not
447 * succeed. 494 * succeed.
448 */ 495 */
449 if (!list_empty(&event->group_entry)) { 496 if (!list_empty(&event->group_entry))
450 list_del_event(event, ctx); 497 list_del_event(event, ctx);
451 }
452 spin_unlock_irq(&ctx->lock); 498 spin_unlock_irq(&ctx->lock);
453} 499}
454 500
455static inline u64 perf_clock(void)
456{
457 return cpu_clock(smp_processor_id());
458}
459
460/*
461 * Update the record of the current time in a context.
462 */
463static void update_context_time(struct perf_event_context *ctx)
464{
465 u64 now = perf_clock();
466
467 ctx->time += now - ctx->timestamp;
468 ctx->timestamp = now;
469}
470
471/*
472 * Update the total_time_enabled and total_time_running fields for a event.
473 */
474static void update_event_times(struct perf_event *event)
475{
476 struct perf_event_context *ctx = event->ctx;
477 u64 run_end;
478
479 if (event->state < PERF_EVENT_STATE_INACTIVE ||
480 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
481 return;
482
483 event->total_time_enabled = ctx->time - event->tstamp_enabled;
484
485 if (event->state == PERF_EVENT_STATE_INACTIVE)
486 run_end = event->tstamp_stopped;
487 else
488 run_end = ctx->time;
489
490 event->total_time_running = run_end - event->tstamp_running;
491}
492
493/* 501/*
494 * Update total_time_enabled and total_time_running for all events in a group. 502 * Update total_time_enabled and total_time_running for all events in a group.
495 */ 503 */
@@ -1032,10 +1040,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1032 update_context_time(ctx); 1040 update_context_time(ctx);
1033 1041
1034 perf_disable(); 1042 perf_disable();
1035 if (ctx->nr_active) 1043 if (ctx->nr_active) {
1036 list_for_each_entry(event, &ctx->group_list, group_entry) 1044 list_for_each_entry(event, &ctx->group_list, group_entry)
1037 group_sched_out(event, cpuctx, ctx); 1045 group_sched_out(event, cpuctx, ctx);
1038 1046 }
1039 perf_enable(); 1047 perf_enable();
1040 out: 1048 out:
1041 spin_unlock(&ctx->lock); 1049 spin_unlock(&ctx->lock);
@@ -1060,8 +1068,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1060 && !ctx1->pin_count && !ctx2->pin_count; 1068 && !ctx1->pin_count && !ctx2->pin_count;
1061} 1069}
1062 1070
1063static void __perf_event_read(void *event);
1064
1065static void __perf_event_sync_stat(struct perf_event *event, 1071static void __perf_event_sync_stat(struct perf_event *event,
1066 struct perf_event *next_event) 1072 struct perf_event *next_event)
1067{ 1073{
@@ -1079,8 +1085,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1079 */ 1085 */
1080 switch (event->state) { 1086 switch (event->state) {
1081 case PERF_EVENT_STATE_ACTIVE: 1087 case PERF_EVENT_STATE_ACTIVE:
1082 __perf_event_read(event); 1088 event->pmu->read(event);
1083 break; 1089 /* fall-through */
1084 1090
1085 case PERF_EVENT_STATE_INACTIVE: 1091 case PERF_EVENT_STATE_INACTIVE:
1086 update_event_times(event); 1092 update_event_times(event);
@@ -1119,6 +1125,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1119 if (!ctx->nr_stat) 1125 if (!ctx->nr_stat)
1120 return; 1126 return;
1121 1127
1128 update_context_time(ctx);
1129
1122 event = list_first_entry(&ctx->event_list, 1130 event = list_first_entry(&ctx->event_list,
1123 struct perf_event, event_entry); 1131 struct perf_event, event_entry);
1124 1132
@@ -1162,8 +1170,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1162 if (likely(!ctx || !cpuctx->task_ctx)) 1170 if (likely(!ctx || !cpuctx->task_ctx))
1163 return; 1171 return;
1164 1172
1165 update_context_time(ctx);
1166
1167 rcu_read_lock(); 1173 rcu_read_lock();
1168 parent = rcu_dereference(ctx->parent_ctx); 1174 parent = rcu_dereference(ctx->parent_ctx);
1169 next_ctx = next->perf_event_ctxp; 1175 next_ctx = next->perf_event_ctxp;
@@ -1516,7 +1522,6 @@ static void __perf_event_read(void *info)
1516 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1522 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517 struct perf_event *event = info; 1523 struct perf_event *event = info;
1518 struct perf_event_context *ctx = event->ctx; 1524 struct perf_event_context *ctx = event->ctx;
1519 unsigned long flags;
1520 1525
1521 /* 1526 /*
1522 * If this is a task context, we need to check whether it is 1527 * If this is a task context, we need to check whether it is
@@ -1528,12 +1533,12 @@ static void __perf_event_read(void *info)
1528 if (ctx->task && cpuctx->task_ctx != ctx) 1533 if (ctx->task && cpuctx->task_ctx != ctx)
1529 return; 1534 return;
1530 1535
1531 local_irq_save(flags); 1536 spin_lock(&ctx->lock);
1532 if (ctx->is_active) 1537 update_context_time(ctx);
1533 update_context_time(ctx);
1534 event->pmu->read(event);
1535 update_event_times(event); 1538 update_event_times(event);
1536 local_irq_restore(flags); 1539 spin_unlock(&ctx->lock);
1540
1541 event->pmu->read(event);
1537} 1542}
1538 1543
1539static u64 perf_event_read(struct perf_event *event) 1544static u64 perf_event_read(struct perf_event *event)
@@ -1546,7 +1551,13 @@ static u64 perf_event_read(struct perf_event *event)
1546 smp_call_function_single(event->oncpu, 1551 smp_call_function_single(event->oncpu,
1547 __perf_event_read, event, 1); 1552 __perf_event_read, event, 1);
1548 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1553 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1554 struct perf_event_context *ctx = event->ctx;
1555 unsigned long flags;
1556
1557 spin_lock_irqsave(&ctx->lock, flags);
1558 update_context_time(ctx);
1549 update_event_times(event); 1559 update_event_times(event);
1560 spin_unlock_irqrestore(&ctx->lock, flags);
1550 } 1561 }
1551 1562
1552 return atomic64_read(&event->count); 1563 return atomic64_read(&event->count);
@@ -1700,16 +1711,10 @@ static void free_event(struct perf_event *event)
1700 call_rcu(&event->rcu_head, free_event_rcu); 1711 call_rcu(&event->rcu_head, free_event_rcu);
1701} 1712}
1702 1713
1703/* 1714int perf_event_release_kernel(struct perf_event *event)
1704 * Called when the last reference to the file is gone.
1705 */
1706static int perf_release(struct inode *inode, struct file *file)
1707{ 1715{
1708 struct perf_event *event = file->private_data;
1709 struct perf_event_context *ctx = event->ctx; 1716 struct perf_event_context *ctx = event->ctx;
1710 1717
1711 file->private_data = NULL;
1712
1713 WARN_ON_ONCE(ctx->parent_ctx); 1718 WARN_ON_ONCE(ctx->parent_ctx);
1714 mutex_lock(&ctx->mutex); 1719 mutex_lock(&ctx->mutex);
1715 perf_event_remove_from_context(event); 1720 perf_event_remove_from_context(event);
@@ -1724,6 +1729,19 @@ static int perf_release(struct inode *inode, struct file *file)
1724 1729
1725 return 0; 1730 return 0;
1726} 1731}
1732EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1733
1734/*
1735 * Called when the last reference to the file is gone.
1736 */
1737static int perf_release(struct inode *inode, struct file *file)
1738{
1739 struct perf_event *event = file->private_data;
1740
1741 file->private_data = NULL;
1742
1743 return perf_event_release_kernel(event);
1744}
1727 1745
1728static int perf_event_read_size(struct perf_event *event) 1746static int perf_event_read_size(struct perf_event *event)
1729{ 1747{
@@ -1750,91 +1768,94 @@ static int perf_event_read_size(struct perf_event *event)
1750 return size; 1768 return size;
1751} 1769}
1752 1770
1753static u64 perf_event_read_value(struct perf_event *event) 1771u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1754{ 1772{
1755 struct perf_event *child; 1773 struct perf_event *child;
1756 u64 total = 0; 1774 u64 total = 0;
1757 1775
1776 *enabled = 0;
1777 *running = 0;
1778
1779 mutex_lock(&event->child_mutex);
1758 total += perf_event_read(event); 1780 total += perf_event_read(event);
1759 list_for_each_entry(child, &event->child_list, child_list) 1781 *enabled += event->total_time_enabled +
1782 atomic64_read(&event->child_total_time_enabled);
1783 *running += event->total_time_running +
1784 atomic64_read(&event->child_total_time_running);
1785
1786 list_for_each_entry(child, &event->child_list, child_list) {
1760 total += perf_event_read(child); 1787 total += perf_event_read(child);
1788 *enabled += child->total_time_enabled;
1789 *running += child->total_time_running;
1790 }
1791 mutex_unlock(&event->child_mutex);
1761 1792
1762 return total; 1793 return total;
1763} 1794}
1764 1795EXPORT_SYMBOL_GPL(perf_event_read_value);
1765static int perf_event_read_entry(struct perf_event *event,
1766 u64 read_format, char __user *buf)
1767{
1768 int n = 0, count = 0;
1769 u64 values[2];
1770
1771 values[n++] = perf_event_read_value(event);
1772 if (read_format & PERF_FORMAT_ID)
1773 values[n++] = primary_event_id(event);
1774
1775 count = n * sizeof(u64);
1776
1777 if (copy_to_user(buf, values, count))
1778 return -EFAULT;
1779
1780 return count;
1781}
1782 1796
1783static int perf_event_read_group(struct perf_event *event, 1797static int perf_event_read_group(struct perf_event *event,
1784 u64 read_format, char __user *buf) 1798 u64 read_format, char __user *buf)
1785{ 1799{
1786 struct perf_event *leader = event->group_leader, *sub; 1800 struct perf_event *leader = event->group_leader, *sub;
1787 int n = 0, size = 0, err = -EFAULT; 1801 int n = 0, size = 0, ret = -EFAULT;
1788 u64 values[3]; 1802 struct perf_event_context *ctx = leader->ctx;
1803 u64 values[5];
1804 u64 count, enabled, running;
1805
1806 mutex_lock(&ctx->mutex);
1807 count = perf_event_read_value(leader, &enabled, &running);
1789 1808
1790 values[n++] = 1 + leader->nr_siblings; 1809 values[n++] = 1 + leader->nr_siblings;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1810 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1792 values[n++] = leader->total_time_enabled + 1811 values[n++] = enabled;
1793 atomic64_read(&leader->child_total_time_enabled); 1812 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1794 } 1813 values[n++] = running;
1795 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1814 values[n++] = count;
1796 values[n++] = leader->total_time_running + 1815 if (read_format & PERF_FORMAT_ID)
1797 atomic64_read(&leader->child_total_time_running); 1816 values[n++] = primary_event_id(leader);
1798 }
1799 1817
1800 size = n * sizeof(u64); 1818 size = n * sizeof(u64);
1801 1819
1802 if (copy_to_user(buf, values, size)) 1820 if (copy_to_user(buf, values, size))
1803 return -EFAULT; 1821 goto unlock;
1804
1805 err = perf_event_read_entry(leader, read_format, buf + size);
1806 if (err < 0)
1807 return err;
1808 1822
1809 size += err; 1823 ret = size;
1810 1824
1811 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1825 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1812 err = perf_event_read_entry(sub, read_format, 1826 n = 0;
1813 buf + size);
1814 if (err < 0)
1815 return err;
1816 1827
1817 size += err; 1828 values[n++] = perf_event_read_value(sub, &enabled, &running);
1829 if (read_format & PERF_FORMAT_ID)
1830 values[n++] = primary_event_id(sub);
1831
1832 size = n * sizeof(u64);
1833
1834 if (copy_to_user(buf + ret, values, size)) {
1835 ret = -EFAULT;
1836 goto unlock;
1837 }
1838
1839 ret += size;
1818 } 1840 }
1841unlock:
1842 mutex_unlock(&ctx->mutex);
1819 1843
1820 return size; 1844 return ret;
1821} 1845}
1822 1846
1823static int perf_event_read_one(struct perf_event *event, 1847static int perf_event_read_one(struct perf_event *event,
1824 u64 read_format, char __user *buf) 1848 u64 read_format, char __user *buf)
1825{ 1849{
1850 u64 enabled, running;
1826 u64 values[4]; 1851 u64 values[4];
1827 int n = 0; 1852 int n = 0;
1828 1853
1829 values[n++] = perf_event_read_value(event); 1854 values[n++] = perf_event_read_value(event, &enabled, &running);
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1855 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1831 values[n++] = event->total_time_enabled + 1856 values[n++] = enabled;
1832 atomic64_read(&event->child_total_time_enabled); 1857 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1833 } 1858 values[n++] = running;
1834 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1835 values[n++] = event->total_time_running +
1836 atomic64_read(&event->child_total_time_running);
1837 }
1838 if (read_format & PERF_FORMAT_ID) 1859 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(event); 1860 values[n++] = primary_event_id(event);
1840 1861
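
For reference, the user-visible record the rewritten group path emits (reconstructed from perf_event_read_group() above; optional fields appear only when the matching read_format bit is set):

	struct {
		u64 nr;			/* 1 + leader->nr_siblings */
		u64 time_enabled;	/* if PERF_FORMAT_TOTAL_TIME_ENABLED */
		u64 time_running;	/* if PERF_FORMAT_TOTAL_TIME_RUNNING */
		u64 leader_value;
		u64 leader_id;		/* if PERF_FORMAT_ID */
		struct {
			u64 value;
			u64 id;		/* if PERF_FORMAT_ID */
		} sibling[];		/* nr - 1 entries */
	};
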
@@ -1865,12 +1886,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1865 return -ENOSPC; 1886 return -ENOSPC;
1866 1887
1867 WARN_ON_ONCE(event->ctx->parent_ctx); 1888 WARN_ON_ONCE(event->ctx->parent_ctx);
1868 mutex_lock(&event->child_mutex);
1869 if (read_format & PERF_FORMAT_GROUP) 1889 if (read_format & PERF_FORMAT_GROUP)
1870 ret = perf_event_read_group(event, read_format, buf); 1890 ret = perf_event_read_group(event, read_format, buf);
1871 else 1891 else
1872 ret = perf_event_read_one(event, read_format, buf); 1892 ret = perf_event_read_one(event, read_format, buf);
1873 mutex_unlock(&event->child_mutex);
1874 1893
1875 return ret; 1894 return ret;
1876} 1895}
@@ -2315,7 +2334,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2315 } 2334 }
2316 2335
2317 if (!data->watermark) 2336 if (!data->watermark)
2318 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2337 data->watermark = max_size / 2;
2319 2338
2320 2339
2321 rcu_assign_pointer(event->data, data); 2340 rcu_assign_pointer(event->data, data);
@@ -3245,15 +3264,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3245{ 3264{
3246 struct perf_event *event; 3265 struct perf_event *event;
3247 3266
3248 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3249 return;
3250
3251 rcu_read_lock();
3252 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3267 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3253 if (perf_event_task_match(event)) 3268 if (perf_event_task_match(event))
3254 perf_event_task_output(event, task_event); 3269 perf_event_task_output(event, task_event);
3255 } 3270 }
3256 rcu_read_unlock();
3257} 3271}
3258 3272
3259static void perf_event_task_event(struct perf_task_event *task_event) 3273static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3261,11 +3275,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3261 struct perf_cpu_context *cpuctx; 3275 struct perf_cpu_context *cpuctx;
3262 struct perf_event_context *ctx = task_event->task_ctx; 3276 struct perf_event_context *ctx = task_event->task_ctx;
3263 3277
3278 rcu_read_lock();
3264 cpuctx = &get_cpu_var(perf_cpu_context); 3279 cpuctx = &get_cpu_var(perf_cpu_context);
3265 perf_event_task_ctx(&cpuctx->ctx, task_event); 3280 perf_event_task_ctx(&cpuctx->ctx, task_event);
3266 put_cpu_var(perf_cpu_context); 3281 put_cpu_var(perf_cpu_context);
3267 3282
3268 rcu_read_lock();
3269 if (!ctx) 3283 if (!ctx)
3270 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3284 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3271 if (ctx) 3285 if (ctx)
@@ -3357,15 +3371,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3357{ 3371{
3358 struct perf_event *event; 3372 struct perf_event *event;
3359 3373
3360 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3361 return;
3362
3363 rcu_read_lock();
3364 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3365 if (perf_event_comm_match(event)) 3375 if (perf_event_comm_match(event))
3366 perf_event_comm_output(event, comm_event); 3376 perf_event_comm_output(event, comm_event);
3367 } 3377 }
3368 rcu_read_unlock();
3369} 3378}
3370 3379
3371static void perf_event_comm_event(struct perf_comm_event *comm_event) 3380static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3376,7 +3385,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3376 char comm[TASK_COMM_LEN]; 3385 char comm[TASK_COMM_LEN];
3377 3386
3378 memset(comm, 0, sizeof(comm)); 3387 memset(comm, 0, sizeof(comm));
3379 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3388 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3380 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3389 size = ALIGN(strlen(comm)+1, sizeof(u64));
3381 3390
3382 comm_event->comm = comm; 3391 comm_event->comm = comm;
@@ -3384,11 +3393,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3384 3393
3385 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3394 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3386 3395
3396 rcu_read_lock();
3387 cpuctx = &get_cpu_var(perf_cpu_context); 3397 cpuctx = &get_cpu_var(perf_cpu_context);
3388 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3398 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3389 put_cpu_var(perf_cpu_context); 3399 put_cpu_var(perf_cpu_context);
3390 3400
3391 rcu_read_lock();
3392 /* 3401 /*
3393 * doesn't really matter which of the child contexts the 3402 * doesn't really matter which of the child contexts the
3394 * event ends up in. 3403 * event ends up in.
@@ -3481,15 +3490,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3481{ 3490{
3482 struct perf_event *event; 3491 struct perf_event *event;
3483 3492
3484 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3485 return;
3486
3487 rcu_read_lock();
3488 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3493 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3489 if (perf_event_mmap_match(event, mmap_event)) 3494 if (perf_event_mmap_match(event, mmap_event))
3490 perf_event_mmap_output(event, mmap_event); 3495 perf_event_mmap_output(event, mmap_event);
3491 } 3496 }
3492 rcu_read_unlock();
3493} 3497}
3494 3498
3495static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3499static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3545,11 +3549,11 @@ got_name:
3545 3549
3546 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3550 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3547 3551
3552 rcu_read_lock();
3548 cpuctx = &get_cpu_var(perf_cpu_context); 3553 cpuctx = &get_cpu_var(perf_cpu_context);
3549 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3554 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3550 put_cpu_var(perf_cpu_context); 3555 put_cpu_var(perf_cpu_context);
3551 3556
3552 rcu_read_lock();
3553 /* 3557 /*
3554 * doesn't really matter which of the child contexts the 3558 * doesn't really matter which of the child contexts the
3555 * event ends up in. 3559 * event ends up in.
@@ -3688,7 +3692,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 perf_event_disable(event); 3692 perf_event_disable(event);
3689 } 3693 }
3690 3694
3691 perf_event_output(event, nmi, data, regs); 3695 if (event->overflow_handler)
3696 event->overflow_handler(event, nmi, data, regs);
3697 else
3698 perf_event_output(event, nmi, data, regs);
3699
3692 return ret; 3700 return ret;
3693} 3701}
3694 3702
@@ -3733,16 +3741,16 @@ again:
3733 return nr; 3741 return nr;
3734} 3742}
3735 3743
3736static void perf_swevent_overflow(struct perf_event *event, 3744static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3737 int nmi, struct perf_sample_data *data, 3745 int nmi, struct perf_sample_data *data,
3738 struct pt_regs *regs) 3746 struct pt_regs *regs)
3739{ 3747{
3740 struct hw_perf_event *hwc = &event->hw; 3748 struct hw_perf_event *hwc = &event->hw;
3741 int throttle = 0; 3749 int throttle = 0;
3742 u64 overflow;
3743 3750
3744 data->period = event->hw.last_period; 3751 data->period = event->hw.last_period;
3745 overflow = perf_swevent_set_period(event); 3752 if (!overflow)
3753 overflow = perf_swevent_set_period(event);
3746 3754
3747 if (hwc->interrupts == MAX_INTERRUPTS) 3755 if (hwc->interrupts == MAX_INTERRUPTS)
3748 return; 3756 return;
@@ -3775,14 +3783,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3775 3783
3776 atomic64_add(nr, &event->count); 3784 atomic64_add(nr, &event->count);
3777 3785
3786 if (!regs)
3787 return;
3788
3778 if (!hwc->sample_period) 3789 if (!hwc->sample_period)
3779 return; 3790 return;
3780 3791
3781 if (!regs) 3792 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3793 return perf_swevent_overflow(event, 1, nmi, data, regs);
3794
3795 if (atomic64_add_negative(nr, &hwc->period_left))
3782 return; 3796 return;
3783 3797
3784 if (!atomic64_add_negative(nr, &hwc->period_left)) 3798 perf_swevent_overflow(event, 0, nmi, data, regs);
3785 perf_swevent_overflow(event, nmi, data, regs);
3786} 3799}
3787 3800
3788static int perf_swevent_is_counting(struct perf_event *event) 3801static int perf_swevent_is_counting(struct perf_event *event)
@@ -3818,6 +3831,20 @@ static int perf_swevent_is_counting(struct perf_event *event)
3818static int perf_tp_event_match(struct perf_event *event, 3831static int perf_tp_event_match(struct perf_event *event,
3819 struct perf_sample_data *data); 3832 struct perf_sample_data *data);
3820 3833
3834static int perf_exclude_event(struct perf_event *event,
3835 struct pt_regs *regs)
3836{
3837 if (regs) {
3838 if (event->attr.exclude_user && user_mode(regs))
3839 return 1;
3840
3841 if (event->attr.exclude_kernel && !user_mode(regs))
3842 return 1;
3843 }
3844
3845 return 0;
3846}
3847
3821static int perf_swevent_match(struct perf_event *event, 3848static int perf_swevent_match(struct perf_event *event,
3822 enum perf_type_id type, 3849 enum perf_type_id type,
3823 u32 event_id, 3850 u32 event_id,
@@ -3829,16 +3856,12 @@ static int perf_swevent_match(struct perf_event *event,
3829 3856
3830 if (event->attr.type != type) 3857 if (event->attr.type != type)
3831 return 0; 3858 return 0;
3859
3832 if (event->attr.config != event_id) 3860 if (event->attr.config != event_id)
3833 return 0; 3861 return 0;
3834 3862
3835 if (regs) { 3863 if (perf_exclude_event(event, regs))
3836 if (event->attr.exclude_user && user_mode(regs)) 3864 return 0;
3837 return 0;
3838
3839 if (event->attr.exclude_kernel && !user_mode(regs))
3840 return 0;
3841 }
3842 3865
3843 if (event->attr.type == PERF_TYPE_TRACEPOINT && 3866 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3844 !perf_tp_event_match(event, data)) 3867 !perf_tp_event_match(event, data))
@@ -3855,49 +3878,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3855{ 3878{
3856 struct perf_event *event; 3879 struct perf_event *event;
3857 3880
3858 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3859 return;
3860
3861 rcu_read_lock();
3862 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3881 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3863 if (perf_swevent_match(event, type, event_id, data, regs)) 3882 if (perf_swevent_match(event, type, event_id, data, regs))
3864 perf_swevent_add(event, nr, nmi, data, regs); 3883 perf_swevent_add(event, nr, nmi, data, regs);
3865 } 3884 }
3866 rcu_read_unlock();
3867} 3885}
3868 3886
3869static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3887int perf_swevent_get_recursion_context(void)
3870{ 3888{
3889 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3890 int rctx;
3891
3871 if (in_nmi()) 3892 if (in_nmi())
3872 return &cpuctx->recursion[3]; 3893 rctx = 3;
3894 else if (in_irq())
3895 rctx = 2;
3896 else if (in_softirq())
3897 rctx = 1;
3898 else
3899 rctx = 0;
3900
3901 if (cpuctx->recursion[rctx]) {
3902 put_cpu_var(perf_cpu_context);
3903 return -1;
3904 }
3873 3905
3874 if (in_irq()) 3906 cpuctx->recursion[rctx]++;
3875 return &cpuctx->recursion[2]; 3907 barrier();
3876 3908
3877 if (in_softirq()) 3909 return rctx;
3878 return &cpuctx->recursion[1]; 3910}
3911EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3879 3912
3880 return &cpuctx->recursion[0]; 3913void perf_swevent_put_recursion_context(int rctx)
3914{
3915 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3916 barrier();
3917 cpuctx->recursion[rctx]--;
3918 put_cpu_var(perf_cpu_context);
3881} 3919}
3920EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3882 3921
3883static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3922static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3884 u64 nr, int nmi, 3923 u64 nr, int nmi,
3885 struct perf_sample_data *data, 3924 struct perf_sample_data *data,
3886 struct pt_regs *regs) 3925 struct pt_regs *regs)
3887{ 3926{
3888 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3927 struct perf_cpu_context *cpuctx;
3889 int *recursion = perf_swevent_recursion_context(cpuctx);
3890 struct perf_event_context *ctx; 3928 struct perf_event_context *ctx;
3891 3929
3892 if (*recursion) 3930 cpuctx = &__get_cpu_var(perf_cpu_context);
3893 goto out; 3931 rcu_read_lock();
3894
3895 (*recursion)++;
3896 barrier();
3897
3898 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3932 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3899 nr, nmi, data, regs); 3933 nr, nmi, data, regs);
3900 rcu_read_lock();
3901 /* 3934 /*
3902 * doesn't really matter which of the child contexts the 3935 * doesn't really matter which of the child contexts the
3903 * events end up in. 3936 * events end up in.
@@ -3906,23 +3939,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3906 if (ctx) 3939 if (ctx)
3907 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3940 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3908 rcu_read_unlock(); 3941 rcu_read_unlock();
3909
3910 barrier();
3911 (*recursion)--;
3912
3913out:
3914 put_cpu_var(perf_cpu_context);
3915} 3942}
3916 3943
3917void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3944void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3918 struct pt_regs *regs, u64 addr) 3945 struct pt_regs *regs, u64 addr)
3919{ 3946{
3920 struct perf_sample_data data = { 3947 struct perf_sample_data data;
3921 .addr = addr, 3948 int rctx;
3922 };
3923 3949
3924 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3950 rctx = perf_swevent_get_recursion_context();
3925 &data, regs); 3951 if (rctx < 0)
3952 return;
3953
3954 data.addr = addr;
3955 data.raw = NULL;
3956
3957 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3958
3959 perf_swevent_put_recursion_context(rctx);
3926} 3960}
3927 3961
3928static void perf_swevent_read(struct perf_event *event) 3962static void perf_swevent_read(struct perf_event *event)
@@ -4145,6 +4179,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4145 if (!regs) 4179 if (!regs)
4146 regs = task_pt_regs(current); 4180 regs = task_pt_regs(current);
4147 4181
4182 /* Trace events already protected against recursion */
4148 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4183 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4149 &data, regs); 4184 &data, regs);
4150} 4185}
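perf_swevent_get_recursion_context() and perf_swevent_put_recursion_context() replace the open-coded recursion counters: the getter picks one of the four per-context slots (task, softirq, irq, NMI), refuses re-entry in the same slot, and keeps the CPU pinned until the matching put. A sketch of the expected bracketing, assuming a caller that has already dealt with interrupts; everything between get and put is elided:

	int rctx;

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;	/* already inside a swevent at this context level */

	/* ... build the sample data and emit the event ... */

	perf_swevent_put_recursion_context(rctx);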
@@ -4231,6 +4266,57 @@ static void perf_event_free_filter(struct perf_event *event)
4231 4266
4232#endif /* CONFIG_EVENT_PROFILE */ 4267#endif /* CONFIG_EVENT_PROFILE */
4233 4268
4269#ifdef CONFIG_HAVE_HW_BREAKPOINT
4270static void bp_perf_event_destroy(struct perf_event *event)
4271{
4272 release_bp_slot(event);
4273}
4274
4275static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4276{
4277 int err;
4278 /*
4279 * The breakpoint is already filled if we haven't created the counter
4280 * through the perf syscall
4281 * FIXME: manage to get triggered to NULL if it comes from syscalls
4282 */
4283 if (!bp->callback)
4284 err = register_perf_hw_breakpoint(bp);
4285 else
4286 err = __register_perf_hw_breakpoint(bp);
4287 if (err)
4288 return ERR_PTR(err);
4289
4290 bp->destroy = bp_perf_event_destroy;
4291
4292 return &perf_ops_bp;
4293}
4294
4295void perf_bp_event(struct perf_event *bp, void *data)
4296{
4297 struct perf_sample_data sample;
4298 struct pt_regs *regs = data;
4299
4300 sample.addr = bp->attr.bp_addr;
4301
4302 if (!perf_exclude_event(bp, regs))
4303 perf_swevent_add(bp, 1, 1, &sample, regs);
4304}
4305#else
4306static void bp_perf_event_destroy(struct perf_event *event)
4307{
4308}
4309
4310static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4311{
4312 return NULL;
4313}
4314
4315void perf_bp_event(struct perf_event *bp, void *regs)
4316{
4317}
4318#endif
4319
4234atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4320atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4235 4321
4236static void sw_perf_event_destroy(struct perf_event *event) 4322static void sw_perf_event_destroy(struct perf_event *event)
@@ -4297,6 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4297 struct perf_event_context *ctx, 4383 struct perf_event_context *ctx,
4298 struct perf_event *group_leader, 4384 struct perf_event *group_leader,
4299 struct perf_event *parent_event, 4385 struct perf_event *parent_event,
4386 perf_callback_t callback,
4300 gfp_t gfpflags) 4387 gfp_t gfpflags)
4301{ 4388{
4302 const struct pmu *pmu; 4389 const struct pmu *pmu;
@@ -4339,6 +4426,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4339 4426
4340 event->state = PERF_EVENT_STATE_INACTIVE; 4427 event->state = PERF_EVENT_STATE_INACTIVE;
4341 4428
4429 if (!callback && parent_event)
4430 callback = parent_event->callback;
4431
4432 event->callback = callback;
4433
4342 if (attr->disabled) 4434 if (attr->disabled)
4343 event->state = PERF_EVENT_STATE_OFF; 4435 event->state = PERF_EVENT_STATE_OFF;
4344 4436
@@ -4373,6 +4465,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4373 pmu = tp_perf_event_init(event); 4465 pmu = tp_perf_event_init(event);
4374 break; 4466 break;
4375 4467
4468 case PERF_TYPE_BREAKPOINT:
4469 pmu = bp_perf_event_init(event);
4470 break;
4471
4472
4376 default: 4473 default:
4377 break; 4474 break;
4378 } 4475 }
@@ -4615,7 +4712,7 @@ SYSCALL_DEFINE5(perf_event_open,
4615 } 4712 }
4616 4713
4617 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4714 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4618 NULL, GFP_KERNEL); 4715 NULL, NULL, GFP_KERNEL);
4619 err = PTR_ERR(event); 4716 err = PTR_ERR(event);
4620 if (IS_ERR(event)) 4717 if (IS_ERR(event))
4621 goto err_put_context; 4718 goto err_put_context;
@@ -4663,6 +4760,58 @@ err_put_context:
4663 return err; 4760 return err;
4664} 4761}
4665 4762
4763/**
4764 * perf_event_create_kernel_counter
4765 *
4766 * @attr: attributes of the counter to create
4767 * @cpu: cpu to which the counter is bound
4768 * @pid: task to profile
4769 */
4770struct perf_event *
4771perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4772 pid_t pid, perf_callback_t callback)
4773{
4774 struct perf_event *event;
4775 struct perf_event_context *ctx;
4776 int err;
4777
4778 /*
4779 * Get the target context (task or percpu):
4780 */
4781
4782 ctx = find_get_context(pid, cpu);
4783 if (IS_ERR(ctx))
4784 return NULL;
4785
4786 event = perf_event_alloc(attr, cpu, ctx, NULL,
4787 NULL, callback, GFP_KERNEL);
4788 err = PTR_ERR(event);
4789 if (IS_ERR(event))
4790 goto err_put_context;
4791
4792 event->filp = NULL;
4793 WARN_ON_ONCE(ctx->parent_ctx);
4794 mutex_lock(&ctx->mutex);
4795 perf_install_in_context(ctx, event, cpu);
4796 ++ctx->generation;
4797 mutex_unlock(&ctx->mutex);
4798
4799 event->owner = current;
4800 get_task_struct(current);
4801 mutex_lock(&current->perf_event_mutex);
4802 list_add_tail(&event->owner_entry, &current->perf_event_list);
4803 mutex_unlock(&current->perf_event_mutex);
4804
4805 return event;
4806
4807err_put_context:
4808 if (err < 0)
4809 put_ctx(ctx);
4810
4811 return NULL;
4812}
4813EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4814
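A sketch of how an in-kernel user might call the new entry point; only the signature and the NULL-on-failure convention come from the code above, the attribute values and names are assumptions for illustration:

	static void my_callback(struct perf_event *event, void *data)
	{
		/* invoked on counter events, e.g. from perf_bp_event() */
	}

	struct perf_event_attr attr = {
		.type		= PERF_TYPE_BREAKPOINT,
		.size		= sizeof(attr),
		.sample_period	= 1,
	};
	struct perf_event *event;

	/* bind to cpu 0 rather than to a task (pid == -1) */
	event = perf_event_create_kernel_counter(&attr, 0, -1, my_callback);
	if (!event)
		return -EINVAL;	/* this version returns NULL on failure */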
4666/* 4815/*
4667 * inherit a event from parent task to child task: 4816 * inherit a event from parent task to child task:
4668 */ 4817 */
@@ -4688,7 +4837,7 @@ inherit_event(struct perf_event *parent_event,
4688 child_event = perf_event_alloc(&parent_event->attr, 4837 child_event = perf_event_alloc(&parent_event->attr,
4689 parent_event->cpu, child_ctx, 4838 parent_event->cpu, child_ctx,
4690 group_leader, parent_event, 4839 group_leader, parent_event,
4691 GFP_KERNEL); 4840 NULL, GFP_KERNEL);
4692 if (IS_ERR(child_event)) 4841 if (IS_ERR(child_event))
4693 return child_event; 4842 return child_event;
4694 get_ctx(child_ctx); 4843 get_ctx(child_ctx);
@@ -4706,6 +4855,8 @@ inherit_event(struct perf_event *parent_event,
4706 if (parent_event->attr.freq) 4855 if (parent_event->attr.freq)
4707 child_event->hw.sample_period = parent_event->hw.sample_period; 4856 child_event->hw.sample_period = parent_event->hw.sample_period;
4708 4857
4858 child_event->overflow_handler = parent_event->overflow_handler;
4859
4709 /* 4860 /*
4710 * Link it up in the child's context: 4861 * Link it up in the child's context:
4711 */ 4862 */
@@ -4795,7 +4946,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4795{ 4946{
4796 struct perf_event *parent_event; 4947 struct perf_event *parent_event;
4797 4948
4798 update_event_times(child_event);
4799 perf_event_remove_from_context(child_event); 4949 perf_event_remove_from_context(child_event);
4800 4950
4801 parent_event = child_event->parent; 4951 parent_event = child_event->parent;
@@ -4847,6 +4997,7 @@ void perf_event_exit_task(struct task_struct *child)
4847 * the events from it. 4997 * the events from it.
4848 */ 4998 */
4849 unclone_ctx(child_ctx); 4999 unclone_ctx(child_ctx);
5000 update_context_time(child_ctx);
4850 spin_unlock_irqrestore(&child_ctx->lock, flags); 5001 spin_unlock_irqrestore(&child_ctx->lock, flags);
4851 5002
4852 /* 5003 /*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f05671609a89..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol, i.e. one listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses on variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, all read
356 and write operations on the watched kernel variables can be
357 monitored.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index edc3a3cca1a1..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
57obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
58 59
59libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b4e4212e66d7..4da6ede74401 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -232,6 +234,7 @@ extern void __ftrace_bad_type(void);
232 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
233 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
234 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
235 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
236 } while (0) 239 } while (0)
237 240
@@ -387,6 +390,8 @@ int register_tracer(struct tracer *type);
387void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
388int is_tracing_stopped(void); 391int is_tracing_stopped(void);
389 392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
394
390extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
391 396
392#ifdef CONFIG_TRACER_MAX_TRACE 397#ifdef CONFIG_TRACER_MAX_TRACE
@@ -461,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
461 struct trace_array *tr); 466 struct trace_array *tr);
462extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
463 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
464#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
465 472
466extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e0d351b01f5a..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,31 +9,33 @@
9#include "trace.h" 9#include "trace.h"
10 10
11 11
12struct perf_trace_buf *perf_trace_buf; 12char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14 14
15struct perf_trace_buf *perf_trace_buf_nmi; 15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); 16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 17
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
19
18/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 21static int total_profile_count;
20 22
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
22{ 24{
23 struct perf_trace_buf *buf; 25 char *buf;
24 int ret = -ENOMEM; 26 int ret = -ENOMEM;
25 27
26 if (atomic_inc_return(&event->profile_count)) 28 if (atomic_inc_return(&event->profile_count))
27 return 0; 29 return 0;
28 30
29 if (!total_profile_count) { 31 if (!total_profile_count) {
30 buf = alloc_percpu(struct perf_trace_buf); 32 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 33 if (!buf)
32 goto fail_buf; 34 goto fail_buf;
33 35
34 rcu_assign_pointer(perf_trace_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
35 37
36 buf = alloc_percpu(struct perf_trace_buf); 38 buf = (char *)alloc_percpu(perf_trace_t);
37 if (!buf) 39 if (!buf)
38 goto fail_buf_nmi; 40 goto fail_buf_nmi;
39 41
@@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id)
79 81
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 82static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{ 83{
82 struct perf_trace_buf *buf, *nmi_buf; 84 char *buf, *nmi_buf;
83 85
84 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
85 return; 87 return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3696476f307d..79ce6a2bd74f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,11 +1208,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1208 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1208 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1209 struct ftrace_event_call *call = &tp->call; 1209 struct ftrace_event_call *call = &tp->call;
1210 struct kprobe_trace_entry *entry; 1210 struct kprobe_trace_entry *entry;
1211 struct perf_trace_buf *trace_buf;
1212 struct trace_entry *ent; 1211 struct trace_entry *ent;
1213 int size, __size, i, pc, __cpu; 1212 int size, __size, i, pc, __cpu;
1214 unsigned long irq_flags; 1213 unsigned long irq_flags;
1214 char *trace_buf;
1215 char *raw_data; 1215 char *raw_data;
1216 int rctx;
1216 1217
1217 pc = preempt_count(); 1218 pc = preempt_count();
1218 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1219 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1227,6 +1228,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1227 * This also protects the rcu read side 1228 * This also protects the rcu read side
1228 */ 1229 */
1229 local_irq_save(irq_flags); 1230 local_irq_save(irq_flags);
1231
1232 rctx = perf_swevent_get_recursion_context();
1233 if (rctx < 0)
1234 goto end_recursion;
1235
1230 __cpu = smp_processor_id(); 1236 __cpu = smp_processor_id();
1231 1237
1232 if (in_nmi()) 1238 if (in_nmi())
@@ -1237,18 +1243,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1237 if (!trace_buf) 1243 if (!trace_buf)
1238 goto end; 1244 goto end;
1239 1245
1240 trace_buf = per_cpu_ptr(trace_buf, __cpu); 1246 raw_data = per_cpu_ptr(trace_buf, __cpu);
1241
1242 if (trace_buf->recursion++)
1243 goto end_recursion;
1244
1245 /*
1246 * Make recursion update visible before entering perf_tp_event
1247 * so that we protect from perf recursions.
1248 */
1249 barrier();
1250
1251 raw_data = trace_buf->buf;
1252 1247
1253 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1248 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1254 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1249 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1263,9 +1258,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1263 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1258 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1264 perf_tp_event(call->id, entry->ip, 1, entry, size); 1259 perf_tp_event(call->id, entry->ip, 1, entry, size);
1265 1260
1266end_recursion:
1267 trace_buf->recursion--;
1268end: 1261end:
1262 perf_swevent_put_recursion_context(rctx);
1263end_recursion:
1269 local_irq_restore(irq_flags); 1264 local_irq_restore(irq_flags);
1270 1265
1271 return 0; 1266 return 0;
@@ -1278,11 +1273,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1278 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1273 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1279 struct ftrace_event_call *call = &tp->call; 1274 struct ftrace_event_call *call = &tp->call;
1280 struct kretprobe_trace_entry *entry; 1275 struct kretprobe_trace_entry *entry;
1281 struct perf_trace_buf *trace_buf;
1282 struct trace_entry *ent; 1276 struct trace_entry *ent;
1283 int size, __size, i, pc, __cpu; 1277 int size, __size, i, pc, __cpu;
1284 unsigned long irq_flags; 1278 unsigned long irq_flags;
1279 char *trace_buf;
1285 char *raw_data; 1280 char *raw_data;
1281 int rctx;
1286 1282
1287 pc = preempt_count(); 1283 pc = preempt_count();
1288 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1284 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1297,6 +1293,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1297 * This also protects the rcu read side 1293 * This also protects the rcu read side
1298 */ 1294 */
1299 local_irq_save(irq_flags); 1295 local_irq_save(irq_flags);
1296
1297 rctx = perf_swevent_get_recursion_context();
1298 if (rctx < 0)
1299 goto end_recursion;
1300
1300 __cpu = smp_processor_id(); 1301 __cpu = smp_processor_id();
1301 1302
1302 if (in_nmi()) 1303 if (in_nmi())
@@ -1307,18 +1308,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1307 if (!trace_buf) 1308 if (!trace_buf)
1308 goto end; 1309 goto end;
1309 1310
1310 trace_buf = per_cpu_ptr(trace_buf, __cpu); 1311 raw_data = per_cpu_ptr(trace_buf, __cpu);
1311
1312 if (trace_buf->recursion++)
1313 goto end_recursion;
1314
1315 /*
1316 * Make recursion update visible before entering perf_tp_event
1317 * so that we protect from perf recursions.
1318 */
1319 barrier();
1320
1321 raw_data = trace_buf->buf;
1322 1312
1323 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1313 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1324 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1314 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1334,9 +1324,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1334 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1324 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1335 perf_tp_event(call->id, entry->ret_ip, 1, entry, size); 1325 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1336 1326
1337end_recursion:
1338 trace_buf->recursion--;
1339end: 1327end:
1328 perf_swevent_put_recursion_context(rctx);
1329end_recursion:
1340 local_irq_restore(irq_flags); 1330 local_irq_restore(irq_flags);
1341 1331
1342 return 0; 1332 return 0;
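After this conversion both kprobe profile handlers share one shape, which the syscall handlers below repeat: disable interrupts, take a recursion slot, pick the regular or NMI per-cpu buffer, build the record in place, then unwind in reverse order. A condensed sketch of that pattern; the record layout and size computation are elided, and the identifiers follow the functions above:

	local_irq_save(irq_flags);

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		goto end_recursion;

	cpu = smp_processor_id();
	trace_buf = rcu_dereference(in_nmi() ? perf_trace_buf_nmi
					     : perf_trace_buf);
	if (!trace_buf)
		goto end;

	raw_data = per_cpu_ptr(trace_buf, cpu);
	/* ... fill raw_data with the trace entry ... */
	perf_tp_event(call->id, entry->ip, 1, entry, size);

end:
	perf_swevent_put_recursion_context(rctx);
end_recursion:
	local_irq_restore(irq_flags);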
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..11935b53a6cb
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,554 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, restrict the number of symbols traced simultaneously to the
37 * number of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 unsigned long ksym_addr;
46 int type;
47 int len;
48#ifdef CONFIG_PROFILE_KSYM_TRACER
49 unsigned long counter;
50#endif
51 struct hlist_node ksym_hlist;
52};
53
54static struct trace_array *ksym_trace_array;
55
56static unsigned int ksym_filter_entry_count;
57static unsigned int ksym_tracing_enabled;
58
59static HLIST_HEAD(ksym_filter_head);
60
61static DEFINE_MUTEX(ksym_tracer_mutex);
62
63#ifdef CONFIG_PROFILE_KSYM_TRACER
64
65#define MAX_UL_INT 0xffffffff
66
67void ksym_collect_stats(unsigned long hbp_hit_addr)
68{
69 struct hlist_node *node;
70 struct trace_ksym *entry;
71
72 rcu_read_lock();
73 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
74 if ((entry->ksym_addr == hbp_hit_addr) &&
75 (entry->counter <= MAX_UL_INT)) {
76 entry->counter++;
77 break;
78 }
79 }
80 rcu_read_unlock();
81}
82#endif /* CONFIG_PROFILE_KSYM_TRACER */
83
84void ksym_hbp_handler(struct perf_event *hbp, void *data)
85{
86 struct ring_buffer_event *event;
87 struct ksym_trace_entry *entry;
88 struct pt_regs *regs = data;
89 struct ring_buffer *buffer;
90 int pc;
91
92 if (!ksym_tracing_enabled)
93 return;
94
95 buffer = ksym_trace_array->buffer;
96
97 pc = preempt_count();
98
99 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
100 sizeof(*entry), 0, pc);
101 if (!event)
102 return;
103
104 entry = ring_buffer_event_data(event);
105 entry->ip = instruction_pointer(regs);
106 entry->type = hw_breakpoint_type(hbp);
107 entry->addr = hw_breakpoint_addr(hbp);
108 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
109
110#ifdef CONFIG_PROFILE_KSYM_TRACER
111 ksym_collect_stats(hw_breakpoint_addr(hbp));
112#endif /* CONFIG_PROFILE_KSYM_TRACER */
113
114 trace_buffer_unlock_commit(buffer, event, 0, pc);
115}
116
117/* Valid access types are represented as
118 *
119 * rw- : Set Read/Write Access Breakpoint
120 * -w- : Set Write Access Breakpoint
121 * --- : Clear Breakpoints
122 * --x : Set Execution Breakpoints (Not available yet)
123 *
124 */
125static int ksym_trace_get_access_type(char *str)
126{
127 int access = 0;
128
129 if (str[0] == 'r')
130 access |= HW_BREAKPOINT_R;
131
132 if (str[1] == 'w')
133 access |= HW_BREAKPOINT_W;
134
135 if (str[2] == 'x')
136 access |= HW_BREAKPOINT_X;
137
138 switch (access) {
139 case HW_BREAKPOINT_R:
140 case HW_BREAKPOINT_W:
141 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
142 return access;
143 default:
144 return -EINVAL;
145 }
146}
147
148/*
149 * There can be several possible malformed requests and we attempt to capture
150 * all of them. We enumerate some of the rules:
151 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter,
152 * i.e. multiple ':' symbols are disallowed. Possible uses are of the form
153 * <module>:<ksym_name>:<op>.
154 * 2. No delimiter symbol ':' in the input string
155 * 3. Spurious operator symbols or symbols not in their respective positions
156 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
157 * 5. Kernel symbol not a part of /proc/kallsyms
158 * 6. Duplicate requests
159 */
160static int parse_ksym_trace_str(char *input_string, char **ksymname,
161 unsigned long *addr)
162{
163 int ret;
164
165 *ksymname = strsep(&input_string, ":");
166 *addr = kallsyms_lookup_name(*ksymname);
167
168 /* Check for malformed request: (2), (1) and (5) */
169 if ((!input_string) ||
170 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
171 (*addr == 0))
172 return -EINVAL;
173
174 ret = ksym_trace_get_access_type(input_string);
175
176 return ret;
177}
178
179int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180{
181 struct trace_ksym *entry;
182 int ret = -ENOMEM;
183
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit (%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry)
193 return -ENOMEM;
194
195 entry->type = op;
196 entry->ksym_addr = addr;
197 entry->len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
201 entry->len, entry->type,
202 ksym_hbp_handler, true);
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 entry->ksym_hbp = NULL;
206 }
207
208 if (!entry->ksym_hbp) {
209 printk(KERN_INFO "ksym_tracer request failed. Try again"
210 " later!!\n");
211 goto err;
212 }
213
214 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
215 ksym_filter_entry_count++;
216
217 return 0;
218
219err:
220 kfree(entry);
221
222 return ret;
223}
224
225static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
226 size_t count, loff_t *ppos)
227{
228 struct trace_ksym *entry;
229 struct hlist_node *node;
230 struct trace_seq *s;
231 ssize_t cnt = 0;
232 int ret;
233
234 s = kmalloc(sizeof(*s), GFP_KERNEL);
235 if (!s)
236 return -ENOMEM;
237 trace_seq_init(s);
238
239 mutex_lock(&ksym_tracer_mutex);
240
241 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
242 ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
243 if (entry->type == HW_BREAKPOINT_R)
244 ret = trace_seq_puts(s, "r--\n");
245 else if (entry->type == HW_BREAKPOINT_W)
246 ret = trace_seq_puts(s, "-w-\n");
247 else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
248 ret = trace_seq_puts(s, "rw-\n");
249 WARN_ON_ONCE(!ret);
250 }
251
252 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
253
254 mutex_unlock(&ksym_tracer_mutex);
255
256 kfree(s);
257
258 return cnt;
259}
260
261static void __ksym_trace_reset(void)
262{
263 struct trace_ksym *entry;
264 struct hlist_node *node, *node1;
265
266 mutex_lock(&ksym_tracer_mutex);
267 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
268 ksym_hlist) {
269 unregister_wide_hw_breakpoint(entry->ksym_hbp);
270 ksym_filter_entry_count--;
271 hlist_del_rcu(&(entry->ksym_hlist));
272 synchronize_rcu();
273 kfree(entry);
274 }
275 mutex_unlock(&ksym_tracer_mutex);
276}
277
278static ssize_t ksym_trace_filter_write(struct file *file,
279 const char __user *buffer,
280 size_t count, loff_t *ppos)
281{
282 struct trace_ksym *entry;
283 struct hlist_node *node;
284 char *input_string, *ksymname = NULL;
285 unsigned long ksym_addr = 0;
286 int ret, op, changed = 0;
287
288 input_string = kzalloc(count + 1, GFP_KERNEL);
289 if (!input_string)
290 return -ENOMEM;
291
292 if (copy_from_user(input_string, buffer, count)) {
293 kfree(input_string);
294 return -EFAULT;
295 }
296 input_string[count] = '\0';
297
298 strstrip(input_string);
299
300 /*
301 * Clear all breakpoints if:
302 * 1: echo > ksym_trace_filter
303 * 2: echo 0 > ksym_trace_filter
304 * 3: echo "*:---" > ksym_trace_filter
305 */
306 if (!input_string[0] || !strcmp(input_string, "0") ||
307 !strcmp(input_string, "*:---")) {
308 __ksym_trace_reset();
309 kfree(input_string);
310 return count;
311 }
312
313 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
314 if (ret < 0) {
315 kfree(input_string);
316 return ret;
317 }
318
319 mutex_lock(&ksym_tracer_mutex);
320
321 ret = -EINVAL;
322 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
323 if (entry->ksym_addr == ksym_addr) {
324 /* Check for malformed request: (6) */
325 if (entry->type != op)
326 changed = 1;
327 else
328 goto out;
329 break;
330 }
331 }
332 if (changed) {
333 unregister_wide_hw_breakpoint(entry->ksym_hbp);
334 entry->type = op;
335 if (op > 0) {
336 entry->ksym_hbp =
337 register_wide_hw_breakpoint(entry->ksym_addr,
338 entry->len, entry->type,
339 ksym_hbp_handler, true);
340 if (IS_ERR(entry->ksym_hbp))
341 entry->ksym_hbp = NULL;
342 if (!entry->ksym_hbp)
343 goto out;
344 }
345 ksym_filter_entry_count--;
346 hlist_del_rcu(&(entry->ksym_hlist));
347 synchronize_rcu();
348 kfree(entry);
349 ret = 0;
350 goto out;
351 } else {
352 /* Check for malformed request: (4) */
353 if (op == 0)
354 goto out;
355 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
356 }
357out:
358 mutex_unlock(&ksym_tracer_mutex);
359
360 kfree(input_string);
361
362 if (!ret)
363 ret = count;
364 return ret;
365}
366
367static const struct file_operations ksym_tracing_fops = {
368 .open = tracing_open_generic,
369 .read = ksym_trace_filter_read,
370 .write = ksym_trace_filter_write,
371};
372
373static void ksym_trace_reset(struct trace_array *tr)
374{
375 ksym_tracing_enabled = 0;
376 __ksym_trace_reset();
377}
378
379static int ksym_trace_init(struct trace_array *tr)
380{
381 int cpu, ret = 0;
382
383 for_each_online_cpu(cpu)
384 tracing_reset(tr, cpu);
385 ksym_tracing_enabled = 1;
386 ksym_trace_array = tr;
387
388 return ret;
389}
390
391static void ksym_trace_print_header(struct seq_file *m)
392{
393 seq_puts(m,
394 "# TASK-PID CPU# Symbol "
395 "Type Function\n");
396 seq_puts(m,
397 "# | | | "
398 " | |\n");
399}
400
401static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
402{
403 struct trace_entry *entry = iter->ent;
404 struct trace_seq *s = &iter->seq;
405 struct ksym_trace_entry *field;
406 char str[KSYM_SYMBOL_LEN];
407 int ret;
408
409 if (entry->type != TRACE_KSYM)
410 return TRACE_TYPE_UNHANDLED;
411
412 trace_assign_type(field, entry);
413
414 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
415 entry->pid, iter->cpu, (char *)field->addr);
416 if (!ret)
417 return TRACE_TYPE_PARTIAL_LINE;
418
419 switch (field->type) {
420 case HW_BREAKPOINT_R:
421 ret = trace_seq_printf(s, " R ");
422 break;
423 case HW_BREAKPOINT_W:
424 ret = trace_seq_printf(s, " W ");
425 break;
426 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
427 ret = trace_seq_printf(s, " RW ");
428 break;
429 default:
430 return TRACE_TYPE_PARTIAL_LINE;
431 }
432
433 if (!ret)
434 return TRACE_TYPE_PARTIAL_LINE;
435
436 sprint_symbol(str, field->ip);
437 ret = trace_seq_printf(s, "%s\n", str);
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 return TRACE_TYPE_HANDLED;
442}
443
444struct tracer ksym_tracer __read_mostly =
445{
446 .name = "ksym_tracer",
447 .init = ksym_trace_init,
448 .reset = ksym_trace_reset,
449#ifdef CONFIG_FTRACE_SELFTEST
450 .selftest = trace_selftest_startup_ksym,
451#endif
452 .print_header = ksym_trace_print_header,
453 .print_line = ksym_trace_output
454};
455
456__init static int init_ksym_trace(void)
457{
458 struct dentry *d_tracer;
459 struct dentry *entry;
460
461 d_tracer = tracing_init_dentry();
462 ksym_filter_entry_count = 0;
463
464 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
465 NULL, &ksym_tracing_fops);
466 if (!entry)
467 pr_warning("Could not create debugfs "
468 "'ksym_trace_filter' file\n");
469
470 return register_tracer(&ksym_tracer);
471}
472device_initcall(init_ksym_trace);
473
474
475#ifdef CONFIG_PROFILE_KSYM_TRACER
476static int ksym_tracer_stat_headers(struct seq_file *m)
477{
478 seq_puts(m, " Access Type ");
479 seq_puts(m, " Symbol Counter\n");
480 seq_puts(m, " ----------- ");
481 seq_puts(m, " ------ -------\n");
482 return 0;
483}
484
485static int ksym_tracer_stat_show(struct seq_file *m, void *v)
486{
487 struct hlist_node *stat = v;
488 struct trace_ksym *entry;
489 int access_type = 0;
490 char fn_name[KSYM_NAME_LEN];
491
492 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
493
494 access_type = entry->type;
495
496 switch (access_type) {
497 case HW_BREAKPOINT_R:
498 seq_puts(m, " R ");
499 break;
500 case HW_BREAKPOINT_W:
501 seq_puts(m, " W ");
502 break;
503 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
504 seq_puts(m, " RW ");
505 break;
506 default:
507 seq_puts(m, " NA ");
508 }
509
510 if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
511 seq_printf(m, " %-36s", fn_name);
512 else
513 seq_printf(m, " %-36s", "<NA>");
514 seq_printf(m, " %15lu\n", entry->counter);
515
516 return 0;
517}
518
519static void *ksym_tracer_stat_start(struct tracer_stat *trace)
520{
521 return ksym_filter_head.first;
522}
523
524static void *
525ksym_tracer_stat_next(void *v, int idx)
526{
527 struct hlist_node *stat = v;
528
529 return stat->next;
530}
531
532static struct tracer_stat ksym_tracer_stats = {
533 .name = "ksym_tracer",
534 .stat_start = ksym_tracer_stat_start,
535 .stat_next = ksym_tracer_stat_next,
536 .stat_headers = ksym_tracer_stat_headers,
537 .stat_show = ksym_tracer_stat_show
538};
539
540__init static int ksym_tracer_stat_init(void)
541{
542 int ret;
543
544 ret = register_stat_tracer(&ksym_tracer_stats);
545 if (ret) {
546 printk(KERN_WARNING "Warning: could not register "
547 "ksym tracer stats\n");
548 return 1;
549 }
550
551 return 0;
552}
553fs_initcall(ksym_tracer_stat_init);
554#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed\n");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 51213b0aa81b..9189cbe86079 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,11 +477,12 @@ static int sys_prof_refcount_exit;
477static void prof_syscall_enter(struct pt_regs *regs, long id) 477static void prof_syscall_enter(struct pt_regs *regs, long id)
478{ 478{
479 struct syscall_metadata *sys_data; 479 struct syscall_metadata *sys_data;
480 struct perf_trace_buf *trace_buf;
481 struct syscall_trace_enter *rec; 480 struct syscall_trace_enter *rec;
482 unsigned long flags; 481 unsigned long flags;
482 char *trace_buf;
483 char *raw_data; 483 char *raw_data;
484 int syscall_nr; 484 int syscall_nr;
485 int rctx;
485 int size; 486 int size;
486 int cpu; 487 int cpu;
487 488
@@ -505,28 +506,18 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
505 /* Protect the per cpu buffer, begin the rcu read side */ 506 /* Protect the per cpu buffer, begin the rcu read side */
506 local_irq_save(flags); 507 local_irq_save(flags);
507 508
509 rctx = perf_swevent_get_recursion_context();
510 if (rctx < 0)
511 goto end_recursion;
512
508 cpu = smp_processor_id(); 513 cpu = smp_processor_id();
509 514
510 if (in_nmi()) 515 trace_buf = rcu_dereference(perf_trace_buf);
511 trace_buf = rcu_dereference(perf_trace_buf_nmi);
512 else
513 trace_buf = rcu_dereference(perf_trace_buf);
514 516
515 if (!trace_buf) 517 if (!trace_buf)
516 goto end; 518 goto end;
517 519
518 trace_buf = per_cpu_ptr(trace_buf, cpu); 520 raw_data = per_cpu_ptr(trace_buf, cpu);
519
520 if (trace_buf->recursion++)
521 goto end_recursion;
522
523 /*
524 * Make recursion update visible before entering perf_tp_event
525 * so that we protect from perf recursions.
526 */
527 barrier();
528
529 raw_data = trace_buf->buf;
530 521
531 /* zero the dead bytes from align to not leak stack to user */ 522 /* zero the dead bytes from align to not leak stack to user */
532 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 523 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -539,9 +530,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
539 (unsigned long *)&rec->args); 530 (unsigned long *)&rec->args);
540 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 531 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
541 532
542end_recursion:
543 trace_buf->recursion--;
544end: 533end:
534 perf_swevent_put_recursion_context(rctx);
535end_recursion:
545 local_irq_restore(flags); 536 local_irq_restore(flags);
546} 537}
547 538
@@ -588,10 +579,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
588{ 579{
589 struct syscall_metadata *sys_data; 580 struct syscall_metadata *sys_data;
590 struct syscall_trace_exit *rec; 581 struct syscall_trace_exit *rec;
591 struct perf_trace_buf *trace_buf;
592 unsigned long flags; 582 unsigned long flags;
593 int syscall_nr; 583 int syscall_nr;
584 char *trace_buf;
594 char *raw_data; 585 char *raw_data;
586 int rctx;
595 int size; 587 int size;
596 int cpu; 588 int cpu;
597 589
@@ -617,28 +609,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
617 609
618 /* Protect the per cpu buffer, begin the rcu read side */ 610 /* Protect the per cpu buffer, begin the rcu read side */
619 local_irq_save(flags); 611 local_irq_save(flags);
612
613 rctx = perf_swevent_get_recursion_context();
614 if (rctx < 0)
615 goto end_recursion;
616
620 cpu = smp_processor_id(); 617 cpu = smp_processor_id();
621 618
622 if (in_nmi()) 619 trace_buf = rcu_dereference(perf_trace_buf);
623 trace_buf = rcu_dereference(perf_trace_buf_nmi);
624 else
625 trace_buf = rcu_dereference(perf_trace_buf);
626 620
627 if (!trace_buf) 621 if (!trace_buf)
628 goto end; 622 goto end;
629 623
630 trace_buf = per_cpu_ptr(trace_buf, cpu); 624 raw_data = per_cpu_ptr(trace_buf, cpu);
631
632 if (trace_buf->recursion++)
633 goto end_recursion;
634
635 /*
636 * Make recursion update visible before entering perf_tp_event
637 * so that we protect from perf recursions.
638 */
639 barrier();
640
641 raw_data = trace_buf->buf;
642 625
643 /* zero the dead bytes from align to not leak stack to user */ 626 /* zero the dead bytes from align to not leak stack to user */
644 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 627 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -652,9 +635,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
652 635
653 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 636 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
654 637
655end_recursion:
656 trace_buf->recursion--;
657end: 638end:
639 perf_swevent_put_recursion_context(rctx);
640end_recursion:
658 local_irq_restore(flags); 641 local_irq_restore(flags);
659} 642}
660 643
diff --git a/samples/Kconfig b/samples/Kconfig
index b92bde3c6a89..e4be84ac3d38 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -40,5 +40,11 @@ config SAMPLE_KRETPROBES
40 default m 40 default m
41 depends on SAMPLE_KPROBES && KRETPROBES 41 depends on SAMPLE_KPROBES && KRETPROBES
42 42
43config SAMPLE_HW_BREAKPOINT
44 tristate "Build kernel hardware breakpoint examples -- loadable module only"
45 depends on HAVE_HW_BREAKPOINT && m
46 help
47 This builds kernel hardware breakpoint example modules.
48
43endif # SAMPLES 49endif # SAMPLES
44 50
diff --git a/samples/Makefile b/samples/Makefile
index 43343a03b1f4..0f15e6d77fd6 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,4 @@
1# Makefile for Linux samples code 1# Makefile for Linux samples code
2 2
3obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ 3obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \
4 hw_breakpoint/
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
new file mode 100644
index 000000000000..0f5c31c2fc47
--- /dev/null
+++ b/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
new file mode 100644
index 000000000000..95063818bcf4
--- /dev/null
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,90 @@
1/*
2 * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * usage: insmod data_breakpoint.ko ksym=<ksym_name>
19 *
20 * This file is a kernel module that places a breakpoint on the ksym_name
21 * kernel variable using a hardware breakpoint register. The corresponding
22 * handler, which prints a backtrace, is invoked every time a write operation
23 * is performed on that variable.
24 *
25 * Copyright (C) IBM Corporation, 2009
26 *
27 * Author: K.Prasad <prasad@linux.vnet.ibm.com>
28 */
29#include <linux/module.h> /* Needed by all modules */
30#include <linux/kernel.h> /* Needed for KERN_INFO */
31#include <linux/init.h> /* Needed for the macros */
32#include <linux/kallsyms.h>
33
34#include <linux/perf_event.h>
35#include <linux/hw_breakpoint.h>
36
37struct perf_event **sample_hbp;
38
39static char ksym_name[KSYM_NAME_LEN] = "pid_max";
40module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
41MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
42 " write operations on the kernel symbol");
43
44static void sample_hbp_handler(struct perf_event *temp, void *data)
45{
46 printk(KERN_INFO "%s value is changed\n", ksym_name);
47 dump_stack();
48 printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
49}
50
51static int __init hw_break_module_init(void)
52{
53 int ret;
54 unsigned long addr;
55
56 addr = kallsyms_lookup_name(ksym_name);
57
58 sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
59 HW_BREAKPOINT_W | HW_BREAKPOINT_R,
60 sample_hbp_handler, true);
61 if (IS_ERR(sample_hbp)) {
62 ret = PTR_ERR(sample_hbp);
63 goto fail;
64 } else if (!sample_hbp) {
65 ret = -EINVAL;
66 goto fail;
67 }
68
69 printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
70
71 return 0;
72
73fail:
74 printk(KERN_INFO "Breakpoint registration failed\n");
75
76 return ret;
77}
78
79static void __exit hw_break_module_exit(void)
80{
81 unregister_wide_hw_breakpoint(sample_hbp);
82 printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
83}
84
85module_init(hw_break_module_init);
86module_exit(hw_break_module_exit);
87
88MODULE_LICENSE("GPL");
89MODULE_AUTHOR("K.Prasad");
90MODULE_DESCRIPTION("ksym breakpoint");
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
new file mode 100644
index 000000000000..44b0ce35c28a
--- /dev/null
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -0,0 +1,44 @@
1perf-kmem(1)
2============
3
4NAME
5----
6perf-kmem - Tool to trace/measure kernel memory (slab) properties
7
8SYNOPSIS
9--------
10[verse]
11'perf kmem' {record} [<options>]
12
13DESCRIPTION
14-----------
15There are two variants of perf kmem:
16
17 'perf kmem record <command>' to record the kmem events
18 of an arbitrary workload.
19
20 'perf kmem' to report kernel memory statistics.
21
22OPTIONS
23-------
24-i <file>::
25--input=<file>::
26 Select the input file (default: perf.data)
27
28--stat=<caller|alloc>::
29 Select per callsite or per allocation statistics
30
31-s <key[,key2...]>::
32--sort=<key[,key2...]>::
33 Sort the output (default: frag,hit,bytes)
34
35-l <num>::
36--line=<num>::
37 Print only num lines
38
39--raw-ip::
40 Print raw ip instead of symbol
41
42SEE ALSO
43--------
44linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 0ff23de9e453..fc46c0b40f6e 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -26,11 +26,19 @@ OPTIONS
26 26
27-e:: 27-e::
28--event=:: 28--event=::
29 Select the PMU event. Selection can be a symbolic event name 29 Select the PMU event. Selection can be:
30 (use 'perf list' to list all events) or a raw PMU
31 event (eventsel+umask) in the form of rNNN where NNN is a
32 hexadecimal event descriptor.
33 30
31 - a symbolic event name (use 'perf list' to list all events)
32
33 - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
34 hexadecimal event descriptor.
35
36 - a hardware breakpoint event in the form of '\mem:addr[:access]'
37 where addr is the address in memory you want to break on.
38 Access is the memory access type (read, write, execute); it can
39 be passed as follows: '\mem:addr[:[r][w][x]]'.
40 If you want to profile read-write accesses at 0x1000, just set
41 'mem:0x1000:rw'.
34-a:: 42-a::
35 System-wide collection. 43 System-wide collection.
36 44
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 53e663a5fa2f..f1537a94a05f 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -2,6 +2,7 @@
2all:: 2all::
3 3
4# Define V=1 to have a more verbose compile. 4# Define V=1 to have a more verbose compile.
5# Define V=2 to have an even more verbose compile.
5# 6#
6# Define SNPRINTF_RETURNS_BOGUS if you are on a system which snprintf() 7# Define SNPRINTF_RETURNS_BOGUS if you are on a system which snprintf()
7# or vsnprintf() return -1 instead of number of characters which would 8# or vsnprintf() return -1 instead of number of characters which would
@@ -147,6 +148,8 @@ all::
147# broken, or spawning external process is slower than built-in grep perf has). 148# broken, or spawning external process is slower than built-in grep perf has).
148# 149#
149# Define LDFLAGS=-static to build a static binary. 150# Define LDFLAGS=-static to build a static binary.
151#
152# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
150 153
151PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE 154PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
152 @$(SHELL_PATH) util/PERF-VERSION-GEN 155 @$(SHELL_PATH) util/PERF-VERSION-GEN
@@ -159,22 +162,6 @@ uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
159uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not') 162uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
160uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') 163uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
161 164
162#
163# Add -m32 for cross-builds:
164#
165ifdef NO_64BIT
166 MBITS := -m32
167else
168 #
169 # If we're on a 64-bit kernel (except ia64), use -m64:
170 #
171 ifneq ($(uname_M),ia64)
172 ifneq ($(patsubst %64,%,$(uname_M)),$(uname_M))
173 MBITS := -m64
174 endif
175 endif
176endif
177
178# CFLAGS and LDFLAGS are for the users to override from the command line. 165# CFLAGS and LDFLAGS are for the users to override from the command line.
179 166
180# 167#
@@ -211,7 +198,7 @@ ifndef PERF_DEBUG
211 CFLAGS_OPTIMIZE = -O6 198 CFLAGS_OPTIMIZE = -O6
212endif 199endif
213 200
214CFLAGS = $(MBITS) -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) 201CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
215EXTLIBS = -lpthread -lrt -lelf -lm 202EXTLIBS = -lpthread -lrt -lelf -lm
216ALL_CFLAGS = $(CFLAGS) 203ALL_CFLAGS = $(CFLAGS)
217ALL_LDFLAGS = $(LDFLAGS) 204ALL_LDFLAGS = $(LDFLAGS)
@@ -263,7 +250,7 @@ PTHREAD_LIBS = -lpthread
263# explicitly what architecture to check for. Fix this up for yours.. 250# explicitly what architecture to check for. Fix this up for yours..
264SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ 251SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
265 252
266ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null >/dev/null 2>&1 && echo y"), y) 253ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y)
267 CFLAGS := $(CFLAGS) -fstack-protector-all 254 CFLAGS := $(CFLAGS) -fstack-protector-all
268endif 255endif
269 256
@@ -445,9 +432,15 @@ BUILTIN_OBJS += builtin-timechart.o
445BUILTIN_OBJS += builtin-top.o 432BUILTIN_OBJS += builtin-top.o
446BUILTIN_OBJS += builtin-trace.o 433BUILTIN_OBJS += builtin-trace.o
447BUILTIN_OBJS += builtin-probe.o 434BUILTIN_OBJS += builtin-probe.o
435BUILTIN_OBJS += builtin-kmem.o
448 436
449PERFLIBS = $(LIB_FILE) 437PERFLIBS = $(LIB_FILE)
450 438
439ifeq ($(V), 2)
440 QUIET_STDERR = ">/dev/null"
441else
442 QUIET_STDERR = ">/dev/null 2>&1"
443endif
451# 444#
452# Platform specific tweaks 445# Platform specific tweaks
453# 446#
@@ -475,19 +468,19 @@ ifeq ($(uname_S),Darwin)
475 PTHREAD_LIBS = 468 PTHREAD_LIBS =
476endif 469endif
477 470
478ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) 471ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
479ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) 472ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
480 msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]); 473 msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
481endif 474endif
482 475
483 ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) 476 ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
484 BASIC_CFLAGS += -DLIBELF_NO_MMAP 477 BASIC_CFLAGS += -DLIBELF_NO_MMAP
485 endif 478 endif
486else 479else
487 msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]); 480 msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
488endif 481endif
489 482
490ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) 483ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
491 msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231); 484 msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231);
492 BASIC_CFLAGS += -DNO_LIBDWARF 485 BASIC_CFLAGS += -DNO_LIBDWARF
493else 486else
@@ -499,25 +492,25 @@ endif
499ifdef NO_DEMANGLE 492ifdef NO_DEMANGLE
500 BASIC_CFLAGS += -DNO_DEMANGLE 493 BASIC_CFLAGS += -DNO_DEMANGLE
501else 494else
502 has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd > /dev/null 2>&1 && echo y") 495 has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y")
503 496
504 ifeq ($(has_bfd),y) 497 ifeq ($(has_bfd),y)
505 EXTLIBS += -lbfd 498 EXTLIBS += -lbfd
506 else 499 else
507 has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty > /dev/null 2>&1 && echo y") 500 has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty "$(QUIET_STDERR)" && echo y")
508 ifeq ($(has_bfd_iberty),y) 501 ifeq ($(has_bfd_iberty),y)
509 EXTLIBS += -lbfd -liberty 502 EXTLIBS += -lbfd -liberty
510 else 503 else
511 has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") 504 has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz "$(QUIET_STDERR)" && echo y")
512 ifeq ($(has_bfd_iberty_z),y) 505 ifeq ($(has_bfd_iberty_z),y)
513 EXTLIBS += -lbfd -liberty -lz 506 EXTLIBS += -lbfd -liberty -lz
514 else 507 else
515 has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty > /dev/null 2>&1 && echo y") 508 has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty "$(QUIET_STDERR)" && echo y")
516 ifeq ($(has_cplus_demangle),y) 509 ifeq ($(has_cplus_demangle),y)
517 EXTLIBS += -liberty 510 EXTLIBS += -liberty
518 BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE 511 BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
519 else 512 else
520 msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) 513 msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
521 BASIC_CFLAGS += -DNO_DEMANGLE 514 BASIC_CFLAGS += -DNO_DEMANGLE
522 endif 515 endif
523 endif 516 endif
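
All of the feature tests above follow one pattern: pipe a one-line C program into $(CC) and key CFLAGS/EXTLIBS off whether it compiles and links, with the new $(QUIET_STDERR) indirection suppressing the probe's stderr unless the build is run with V=2. As a rough standalone illustration (not part of the patch itself), the libbfd probe amounts to:

	/*
	 * Standalone sketch of the Makefile's libbfd probe; the real test
	 * pipes this into $(CC) and only checks whether the link succeeds.
	 * Try by hand:
	 *   cc probe.c -lbfd
	 * then fall back to -lbfd -liberty and -lbfd -liberty -lz, just as
	 * the Makefile walks through its EXTLIBS combinations.
	 */
	#include <bfd.h>

	int main(void)
	{
		bfd_demangle(0, 0, 0);	/* resolving this symbol is the whole test */
		return 0;
	}
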
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 77d50a6d6802..6b13a1ecf1e7 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -33,9 +33,11 @@ static int input;
33static int full_paths; 33static int full_paths;
34 34
35static int print_line; 35static int print_line;
36static bool use_modules;
36 37
37static unsigned long page_size; 38static unsigned long page_size;
38static unsigned long mmap_window = 32; 39static unsigned long mmap_window = 32;
40const char *vmlinux_name;
39 41
40struct sym_hist { 42struct sym_hist {
41 u64 sum; 43 u64 sum;
@@ -156,7 +158,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
156 158
157 if (event->header.misc & PERF_RECORD_MISC_KERNEL) { 159 if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
158 level = 'k'; 160 level = 'k';
159 sym = kernel_maps__find_symbol(ip, &map); 161 sym = kernel_maps__find_symbol(ip, &map, symbol_filter);
160 dump_printf(" ...... dso: %s\n", 162 dump_printf(" ...... dso: %s\n",
161 map ? map->dso->long_name : "<not found>"); 163 map ? map->dso->long_name : "<not found>");
162 } else if (event->header.misc & PERF_RECORD_MISC_USER) { 164 } else if (event->header.misc & PERF_RECORD_MISC_USER) {
@@ -636,9 +638,9 @@ static int __cmd_annotate(void)
636 exit(0); 638 exit(0);
637 } 639 }
638 640
639 if (load_kernel(symbol_filter) < 0) { 641 if (kernel_maps__init(vmlinux_name, true, use_modules) < 0) {
640 perror("failed to load kernel symbols"); 642 pr_err("failed to create kernel maps for symbol resolution\n");
641 return EXIT_FAILURE; 643 return -1;
642 } 644 }
643 645
644remap: 646remap:
@@ -742,7 +744,7 @@ static const struct option options[] = {
742 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 744 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
743 "dump raw trace in ASCII"), 745 "dump raw trace in ASCII"),
744 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"), 746 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
745 OPT_BOOLEAN('m', "modules", &modules, 747 OPT_BOOLEAN('m', "modules", &use_modules,
746 "load module symbols - WARNING: use only with -k and LIVE kernel"), 748 "load module symbols - WARNING: use only with -k and LIVE kernel"),
747 OPT_BOOLEAN('l', "print-line", &print_line, 749 OPT_BOOLEAN('l', "print-line", &print_line,
748 "print matching source lines (may be slow)"), 750 "print matching source lines (may be slow)"),
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 000000000000..173d6db42ecb
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,833 @@
1#include "builtin.h"
2#include "perf.h"
3
4#include "util/util.h"
5#include "util/cache.h"
6#include "util/symbol.h"
7#include "util/thread.h"
8#include "util/header.h"
9
10#include "util/parse-options.h"
11#include "util/trace-event.h"
12
13#include "util/debug.h"
14#include "util/data_map.h"
15
16#include <linux/rbtree.h>
17
18struct alloc_stat;
19typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
20
21static char const *input_name = "perf.data";
22
23static struct perf_header *header;
24static u64 sample_type;
25
26static int alloc_flag;
27static int caller_flag;
28
29static int alloc_lines = -1;
30static int caller_lines = -1;
31
32static bool raw_ip;
33
34static char default_sort_order[] = "frag,hit,bytes";
35
36static char *cwd;
37static int cwdlen;
38
39static int *cpunode_map;
40static int max_cpu_num;
41
42struct alloc_stat {
43 u64 call_site;
44 u64 ptr;
45 u64 bytes_req;
46 u64 bytes_alloc;
47 u32 hit;
48 u32 pingpong;
49
50 short alloc_cpu;
51
52 struct rb_node node;
53};
54
55static struct rb_root root_alloc_stat;
56static struct rb_root root_alloc_sorted;
57static struct rb_root root_caller_stat;
58static struct rb_root root_caller_sorted;
59
60static unsigned long total_requested, total_allocated;
61static unsigned long nr_allocs, nr_cross_allocs;
62
63struct raw_event_sample {
64 u32 size;
65 char data[0];
66};
67
68#define PATH_SYS_NODE "/sys/devices/system/node"
69
70static void init_cpunode_map(void)
71{
72 FILE *fp;
73 int i;
74
75 fp = fopen("/sys/devices/system/cpu/kernel_max", "r");
76 if (!fp) {
77 max_cpu_num = 4096;
78 return;
79 }
80
81 if (fscanf(fp, "%d", &max_cpu_num) < 1)
82 die("Failed to read 'kernel_max' from sysfs");
83 max_cpu_num++;
84
85 cpunode_map = calloc(max_cpu_num, sizeof(int));
86 if (!cpunode_map)
87 die("calloc");
88 for (i = 0; i < max_cpu_num; i++)
89 cpunode_map[i] = -1;
90 fclose(fp);
91}
92
93static void setup_cpunode_map(void)
94{
95 struct dirent *dent1, *dent2;
96 DIR *dir1, *dir2;
97 unsigned int cpu, mem;
98 char buf[PATH_MAX];
99
100 init_cpunode_map();
101
102 dir1 = opendir(PATH_SYS_NODE);
103 if (!dir1)
104 return;
105
106 while (true) {
107 dent1 = readdir(dir1);
108 if (!dent1)
109 break;
110
111 if (sscanf(dent1->d_name, "node%u", &mem) < 1)
112 continue;
113
114 snprintf(buf, PATH_MAX, "%s/%s", PATH_SYS_NODE, dent1->d_name);
115 dir2 = opendir(buf);
116 if (!dir2)
117 continue;
118 while (true) {
119 dent2 = readdir(dir2);
120 if (!dent2)
121 break;
122 if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
123 continue;
124 cpunode_map[cpu] = mem;
125 }
126 }
127}
128
129static int
130process_comm_event(event_t *event, unsigned long offset, unsigned long head)
131{
132 struct thread *thread = threads__findnew(event->comm.pid);
133
134 dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
135 (void *)(offset + head),
136 (void *)(long)(event->header.size),
137 event->comm.comm, event->comm.pid);
138
139 if (thread == NULL ||
140 thread__set_comm(thread, event->comm.comm)) {
141 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
142 return -1;
143 }
144
145 return 0;
146}
147
148static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
149 int bytes_req, int bytes_alloc, int cpu)
150{
151 struct rb_node **node = &root_alloc_stat.rb_node;
152 struct rb_node *parent = NULL;
153 struct alloc_stat *data = NULL;
154
155 while (*node) {
156 parent = *node;
157 data = rb_entry(*node, struct alloc_stat, node);
158
159 if (ptr > data->ptr)
160 node = &(*node)->rb_right;
161 else if (ptr < data->ptr)
162 node = &(*node)->rb_left;
163 else
164 break;
165 }
166
167 if (data && data->ptr == ptr) {
168 data->hit++;
169 data->bytes_req += bytes_req;
170 data->bytes_alloc += bytes_alloc;
171 } else {
172 data = malloc(sizeof(*data));
173 if (!data)
174 die("malloc");
175 data->ptr = ptr;
176 data->pingpong = 0;
177 data->hit = 1;
178 data->bytes_req = bytes_req;
179 data->bytes_alloc = bytes_alloc;
180
181 rb_link_node(&data->node, parent, node);
182 rb_insert_color(&data->node, &root_alloc_stat);
183 }
184 data->call_site = call_site;
185 data->alloc_cpu = cpu;
186}
187
188static void insert_caller_stat(unsigned long call_site,
189 int bytes_req, int bytes_alloc)
190{
191 struct rb_node **node = &root_caller_stat.rb_node;
192 struct rb_node *parent = NULL;
193 struct alloc_stat *data = NULL;
194
195 while (*node) {
196 parent = *node;
197 data = rb_entry(*node, struct alloc_stat, node);
198
199 if (call_site > data->call_site)
200 node = &(*node)->rb_right;
201 else if (call_site < data->call_site)
202 node = &(*node)->rb_left;
203 else
204 break;
205 }
206
207 if (data && data->call_site == call_site) {
208 data->hit++;
209 data->bytes_req += bytes_req;
210 data->bytes_alloc += bytes_alloc;
211 } else {
212 data = malloc(sizeof(*data));
213 if (!data)
214 die("malloc");
215 data->call_site = call_site;
216 data->pingpong = 0;
217 data->hit = 1;
218 data->bytes_req = bytes_req;
219 data->bytes_alloc = bytes_alloc;
220
221 rb_link_node(&data->node, parent, node);
222 rb_insert_color(&data->node, &root_caller_stat);
223 }
224}
225
226static void process_alloc_event(struct raw_event_sample *raw,
227 struct event *event,
228 int cpu,
229 u64 timestamp __used,
230 struct thread *thread __used,
231 int node)
232{
233 unsigned long call_site;
234 unsigned long ptr;
235 int bytes_req;
236 int bytes_alloc;
237 int node1, node2;
238
239 ptr = raw_field_value(event, "ptr", raw->data);
240 call_site = raw_field_value(event, "call_site", raw->data);
241 bytes_req = raw_field_value(event, "bytes_req", raw->data);
242 bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
243
244 insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu);
245 insert_caller_stat(call_site, bytes_req, bytes_alloc);
246
247 total_requested += bytes_req;
248 total_allocated += bytes_alloc;
249
250 if (node) {
251 node1 = cpunode_map[cpu];
252 node2 = raw_field_value(event, "node", raw->data);
253 if (node1 != node2)
254 nr_cross_allocs++;
255 }
256 nr_allocs++;
257}
258
259static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
260static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
261
262static struct alloc_stat *search_alloc_stat(unsigned long ptr,
263 unsigned long call_site,
264 struct rb_root *root,
265 sort_fn_t sort_fn)
266{
267 struct rb_node *node = root->rb_node;
268 struct alloc_stat key = { .ptr = ptr, .call_site = call_site };
269
270 while (node) {
271 struct alloc_stat *data;
272 int cmp;
273
274 data = rb_entry(node, struct alloc_stat, node);
275
276 cmp = sort_fn(&key, data);
277 if (cmp < 0)
278 node = node->rb_left;
279 else if (cmp > 0)
280 node = node->rb_right;
281 else
282 return data;
283 }
284 return NULL;
285}
286
287static void process_free_event(struct raw_event_sample *raw,
288 struct event *event,
289 int cpu,
290 u64 timestamp __used,
291 struct thread *thread __used)
292{
293 unsigned long ptr;
294 struct alloc_stat *s_alloc, *s_caller;
295
296 ptr = raw_field_value(event, "ptr", raw->data);
297
298 s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
299 if (!s_alloc)
300 return;
301
302 if (cpu != s_alloc->alloc_cpu) {
303 s_alloc->pingpong++;
304
305 s_caller = search_alloc_stat(0, s_alloc->call_site,
306 &root_caller_stat, callsite_cmp);
307 assert(s_caller);
308 s_caller->pingpong++;
309 }
310 s_alloc->alloc_cpu = -1;
311}
312
313static void
314process_raw_event(event_t *raw_event __used, void *more_data,
315 int cpu, u64 timestamp, struct thread *thread)
316{
317 struct raw_event_sample *raw = more_data;
318 struct event *event;
319 int type;
320
321 type = trace_parse_common_type(raw->data);
322 event = trace_find_event(type);
323
324 if (!strcmp(event->name, "kmalloc") ||
325 !strcmp(event->name, "kmem_cache_alloc")) {
326 process_alloc_event(raw, event, cpu, timestamp, thread, 0);
327 return;
328 }
329
330 if (!strcmp(event->name, "kmalloc_node") ||
331 !strcmp(event->name, "kmem_cache_alloc_node")) {
332 process_alloc_event(raw, event, cpu, timestamp, thread, 1);
333 return;
334 }
335
336 if (!strcmp(event->name, "kfree") ||
337 !strcmp(event->name, "kmem_cache_free")) {
338 process_free_event(raw, event, cpu, timestamp, thread);
339 return;
340 }
341}
342
343static int
344process_sample_event(event_t *event, unsigned long offset, unsigned long head)
345{
346 u64 ip = event->ip.ip;
347 u64 timestamp = -1;
348 u32 cpu = -1;
349 u64 period = 1;
350 void *more_data = event->ip.__more_data;
351 struct thread *thread = threads__findnew(event->ip.pid);
352
353 if (sample_type & PERF_SAMPLE_TIME) {
354 timestamp = *(u64 *)more_data;
355 more_data += sizeof(u64);
356 }
357
358 if (sample_type & PERF_SAMPLE_CPU) {
359 cpu = *(u32 *)more_data;
360 more_data += sizeof(u32);
361 more_data += sizeof(u32); /* reserved */
362 }
363
364 if (sample_type & PERF_SAMPLE_PERIOD) {
365 period = *(u64 *)more_data;
366 more_data += sizeof(u64);
367 }
368
369 dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
370 (void *)(offset + head),
371 (void *)(long)(event->header.size),
372 event->header.misc,
373 event->ip.pid, event->ip.tid,
374 (void *)(long)ip,
375 (long long)period);
376
377 if (thread == NULL) {
378 pr_debug("problem processing %d event, skipping it.\n",
379 event->header.type);
380 return -1;
381 }
382
383 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
384
385 process_raw_event(event, more_data, cpu, timestamp, thread);
386
387 return 0;
388}
389
390static int sample_type_check(u64 type)
391{
392 sample_type = type;
393
394 if (!(sample_type & PERF_SAMPLE_RAW)) {
395 fprintf(stderr,
396 "No trace sample to read. Did you call perf record "
397 "without -R?");
398 return -1;
399 }
400
401 return 0;
402}
403
404static struct perf_file_handler file_handler = {
405 .process_sample_event = process_sample_event,
406 .process_comm_event = process_comm_event,
407 .sample_type_check = sample_type_check,
408};
409
410static int read_events(void)
411{
412 register_idle_thread();
413 register_perf_file_handler(&file_handler);
414
415 return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0,
416 &cwdlen, &cwd);
417}
418
419static double fragmentation(unsigned long n_req, unsigned long n_alloc)
420{
421 if (n_alloc == 0)
422 return 0.0;
423 else
424 return 100.0 - (100.0 * n_req / n_alloc);
425}
426
427static void __print_result(struct rb_root *root, int n_lines, int is_caller)
428{
429 struct rb_node *next;
430
431 printf("%.102s\n", graph_dotted_line);
432 printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr");
433 printf(" Total_alloc/Per | Total_req/Per | Hit | Ping-pong | Frag\n");
434 printf("%.102s\n", graph_dotted_line);
435
436 next = rb_first(root);
437
438 while (next && n_lines--) {
439 struct alloc_stat *data = rb_entry(next, struct alloc_stat,
440 node);
441 struct symbol *sym = NULL;
442 char buf[BUFSIZ];
443 u64 addr;
444
445 if (is_caller) {
446 addr = data->call_site;
447 if (!raw_ip)
448 sym = kernel_maps__find_symbol(addr,
449 NULL, NULL);
450 } else
451 addr = data->ptr;
452
453 if (sym != NULL)
454 snprintf(buf, sizeof(buf), "%s+%Lx", sym->name,
455 addr - sym->start);
456 else
457 snprintf(buf, sizeof(buf), "%#Lx", addr);
458 printf(" %-34s |", buf);
459
460 printf(" %9llu/%-5lu | %9llu/%-5lu | %6lu | %8lu | %6.3f%%\n",
461 (unsigned long long)data->bytes_alloc,
462 (unsigned long)data->bytes_alloc / data->hit,
463 (unsigned long long)data->bytes_req,
464 (unsigned long)data->bytes_req / data->hit,
465 (unsigned long)data->hit,
466 (unsigned long)data->pingpong,
467 fragmentation(data->bytes_req, data->bytes_alloc));
468
469 next = rb_next(next);
470 }
471
472 if (n_lines == -1)
473 printf(" ... | ... | ... | ... | ... | ... \n");
474
475 printf("%.102s\n", graph_dotted_line);
476}
477
478static void print_summary(void)
479{
480 printf("\nSUMMARY\n=======\n");
481 printf("Total bytes requested: %lu\n", total_requested);
482 printf("Total bytes allocated: %lu\n", total_allocated);
483 printf("Total bytes wasted on internal fragmentation: %lu\n",
484 total_allocated - total_requested);
485 printf("Internal fragmentation: %f%%\n",
486 fragmentation(total_requested, total_allocated));
487 printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs);
488}
489
490static void print_result(void)
491{
492 if (caller_flag)
493 __print_result(&root_caller_sorted, caller_lines, 1);
494 if (alloc_flag)
495 __print_result(&root_alloc_sorted, alloc_lines, 0);
496 print_summary();
497}
498
499struct sort_dimension {
500 const char name[20];
501 sort_fn_t cmp;
502 struct list_head list;
503};
504
505static LIST_HEAD(caller_sort);
506static LIST_HEAD(alloc_sort);
507
508static void sort_insert(struct rb_root *root, struct alloc_stat *data,
509 struct list_head *sort_list)
510{
511 struct rb_node **new = &(root->rb_node);
512 struct rb_node *parent = NULL;
513 struct sort_dimension *sort;
514
515 while (*new) {
516 struct alloc_stat *this;
517 int cmp = 0;
518
519 this = rb_entry(*new, struct alloc_stat, node);
520 parent = *new;
521
522 list_for_each_entry(sort, sort_list, list) {
523 cmp = sort->cmp(data, this);
524 if (cmp)
525 break;
526 }
527
528 if (cmp > 0)
529 new = &((*new)->rb_left);
530 else
531 new = &((*new)->rb_right);
532 }
533
534 rb_link_node(&data->node, parent, new);
535 rb_insert_color(&data->node, root);
536}
537
538static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
539 struct list_head *sort_list)
540{
541 struct rb_node *node;
542 struct alloc_stat *data;
543
544 for (;;) {
545 node = rb_first(root);
546 if (!node)
547 break;
548
549 rb_erase(node, root);
550 data = rb_entry(node, struct alloc_stat, node);
551 sort_insert(root_sorted, data, sort_list);
552 }
553}
554
555static void sort_result(void)
556{
557 __sort_result(&root_alloc_stat, &root_alloc_sorted, &alloc_sort);
558 __sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort);
559}
560
561static int __cmd_kmem(void)
562{
563 setup_pager();
564 read_events();
565 sort_result();
566 print_result();
567
568 return 0;
569}
570
571static const char * const kmem_usage[] = {
572 "perf kmem [<options>] {record}",
573 NULL
574};
575
576static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
577{
578 if (l->ptr < r->ptr)
579 return -1;
580 else if (l->ptr > r->ptr)
581 return 1;
582 return 0;
583}
584
585static struct sort_dimension ptr_sort_dimension = {
586 .name = "ptr",
587 .cmp = ptr_cmp,
588};
589
590static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
591{
592 if (l->call_site < r->call_site)
593 return -1;
594 else if (l->call_site > r->call_site)
595 return 1;
596 return 0;
597}
598
599static struct sort_dimension callsite_sort_dimension = {
600 .name = "callsite",
601 .cmp = callsite_cmp,
602};
603
604static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
605{
606 if (l->hit < r->hit)
607 return -1;
608 else if (l->hit > r->hit)
609 return 1;
610 return 0;
611}
612
613static struct sort_dimension hit_sort_dimension = {
614 .name = "hit",
615 .cmp = hit_cmp,
616};
617
618static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
619{
620 if (l->bytes_alloc < r->bytes_alloc)
621 return -1;
622 else if (l->bytes_alloc > r->bytes_alloc)
623 return 1;
624 return 0;
625}
626
627static struct sort_dimension bytes_sort_dimension = {
628 .name = "bytes",
629 .cmp = bytes_cmp,
630};
631
632static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
633{
634 double x, y;
635
636 x = fragmentation(l->bytes_req, l->bytes_alloc);
637 y = fragmentation(r->bytes_req, r->bytes_alloc);
638
639 if (x < y)
640 return -1;
641 else if (x > y)
642 return 1;
643 return 0;
644}
645
646static struct sort_dimension frag_sort_dimension = {
647 .name = "frag",
648 .cmp = frag_cmp,
649};
650
651static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
652{
653 if (l->pingpong < r->pingpong)
654 return -1;
655 else if (l->pingpong > r->pingpong)
656 return 1;
657 return 0;
658}
659
660static struct sort_dimension pingpong_sort_dimension = {
661 .name = "pingpong",
662 .cmp = pingpong_cmp,
663};
664
665static struct sort_dimension *avail_sorts[] = {
666 &ptr_sort_dimension,
667 &callsite_sort_dimension,
668 &hit_sort_dimension,
669 &bytes_sort_dimension,
670 &frag_sort_dimension,
671 &pingpong_sort_dimension,
672};
673
674#define NUM_AVAIL_SORTS \
675 (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *))
676
677static int sort_dimension__add(const char *tok, struct list_head *list)
678{
679 struct sort_dimension *sort;
680 int i;
681
682 for (i = 0; i < NUM_AVAIL_SORTS; i++) {
683 if (!strcmp(avail_sorts[i]->name, tok)) {
684 sort = malloc(sizeof(*sort));
685 if (!sort)
686 die("malloc");
687 memcpy(sort, avail_sorts[i], sizeof(*sort));
688 list_add_tail(&sort->list, list);
689 return 0;
690 }
691 }
692
693 return -1;
694}
695
696static int setup_sorting(struct list_head *sort_list, const char *arg)
697{
698 char *tok;
699 char *str = strdup(arg);
700
701 if (!str)
702 die("strdup");
703
704 while (true) {
705 tok = strsep(&str, ",");
706 if (!tok)
707 break;
708 if (sort_dimension__add(tok, sort_list) < 0) {
709 error("Unknown --sort key: '%s'", tok);
710 return -1;
711 }
712 }
713
714 free(str);
715 return 0;
716}
717
718static int parse_sort_opt(const struct option *opt __used,
719 const char *arg, int unset __used)
720{
721 if (!arg)
722 return -1;
723
724 if (caller_flag > alloc_flag)
725 return setup_sorting(&caller_sort, arg);
726 else
727 return setup_sorting(&alloc_sort, arg);
728
729 return 0;
730}
731
732static int parse_stat_opt(const struct option *opt __used,
733 const char *arg, int unset __used)
734{
735 if (!arg)
736 return -1;
737
738 if (strcmp(arg, "alloc") == 0)
739 alloc_flag = (caller_flag + 1);
740 else if (strcmp(arg, "caller") == 0)
741 caller_flag = (alloc_flag + 1);
742 else
743 return -1;
744 return 0;
745}
746
747static int parse_line_opt(const struct option *opt __used,
748 const char *arg, int unset __used)
749{
750 int lines;
751
752 if (!arg)
753 return -1;
754
755 lines = strtoul(arg, NULL, 10);
756
757 if (caller_flag > alloc_flag)
758 caller_lines = lines;
759 else
760 alloc_lines = lines;
761
762 return 0;
763}
764
765static const struct option kmem_options[] = {
766 OPT_STRING('i', "input", &input_name, "file",
767 "input file name"),
768 OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
769 "stat selector, Pass 'alloc' or 'caller'.",
770 parse_stat_opt),
771 OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
772 "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
773 parse_sort_opt),
774 OPT_CALLBACK('l', "line", NULL, "num",
775 "show n lins",
776 parse_line_opt),
777 OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
778 OPT_END()
779};
780
781static const char *record_args[] = {
782 "record",
783 "-a",
784 "-R",
785 "-M",
786 "-f",
787 "-c", "1",
788 "-e", "kmem:kmalloc",
789 "-e", "kmem:kmalloc_node",
790 "-e", "kmem:kfree",
791 "-e", "kmem:kmem_cache_alloc",
792 "-e", "kmem:kmem_cache_alloc_node",
793 "-e", "kmem:kmem_cache_free",
794};
795
796static int __cmd_record(int argc, const char **argv)
797{
798 unsigned int rec_argc, i, j;
799 const char **rec_argv;
800
801 rec_argc = ARRAY_SIZE(record_args) + argc - 1;
802 rec_argv = calloc(rec_argc + 1, sizeof(char *));
803
804 for (i = 0; i < ARRAY_SIZE(record_args); i++)
805 rec_argv[i] = strdup(record_args[i]);
806
807 for (j = 1; j < (unsigned int)argc; j++, i++)
808 rec_argv[i] = argv[j];
809
810 return cmd_record(i, rec_argv, NULL);
811}
812
813int cmd_kmem(int argc, const char **argv, const char *prefix __used)
814{
815 symbol__init(0);
816
817 argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
818
819 if (argc && !strncmp(argv[0], "rec", 3))
820 return __cmd_record(argc, argv);
821 else if (argc)
822 usage_with_options(kmem_usage, kmem_options);
823
824 if (list_empty(&caller_sort))
825 setup_sorting(&caller_sort, default_sort_order);
826 if (list_empty(&alloc_sort))
827 setup_sorting(&alloc_sort, default_sort_order);
828
829 setup_cpunode_map();
830
831 return __cmd_kmem();
832}
833
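
sort_insert() above orders entries by chaining the comparators parsed from --sort: the first key that tells two entries apart wins, and later keys only break ties. A compact sketch of that comparison, reusing the alloc_stat and sort_fn_t types from this file but with a hypothetical plain array in place of the list_head walk (illustration only):

	/*
	 * Sketch of the multi-key comparison sort_insert() relies on: walk
	 * the comparators in --sort order (e.g. "frag,hit,bytes") and let
	 * the first one that distinguishes the two entries decide.
	 * cmps/ncmps are hypothetical stand-ins for the sort_dimension list.
	 */
	static int multi_key_cmp(struct alloc_stat *l, struct alloc_stat *r,
				 sort_fn_t *cmps, int ncmps)
	{
		int i, cmp;

		for (i = 0; i < ncmps; i++) {
			cmp = cmps[i](l, r);
			if (cmp)		/* first differing key wins */
				return cmp;
		}
		return 0;			/* equal on every key */
	}

For the frag key, fragmentation() reports the share of allocated bytes the caller never asked for: bytes_req=1000 against bytes_alloc=1280 gives 100 - 100*1000/1280 = 21.875%.
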
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 82260c56db3d..0e519c667e3a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -307,6 +307,12 @@ try_again:
307 printf("\n"); 307 printf("\n");
308 error("perfcounter syscall returned with %d (%s)\n", 308 error("perfcounter syscall returned with %d (%s)\n",
309 fd[nr_cpu][counter], strerror(err)); 309 fd[nr_cpu][counter], strerror(err));
310
311#if defined(__i386__) || defined(__x86_64__)
312 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
313 die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
314#endif
315
310 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 316 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
311 exit(-1); 317 exit(-1);
312 } 318 }
@@ -400,7 +406,7 @@ static int __cmd_record(int argc, const char **argv)
400 struct stat st; 406 struct stat st;
401 pid_t pid = 0; 407 pid_t pid = 0;
402 int flags; 408 int flags;
403 int ret; 409 int err;
404 unsigned long waking = 0; 410 unsigned long waking = 0;
405 411
406 page_size = sysconf(_SC_PAGE_SIZE); 412 page_size = sysconf(_SC_PAGE_SIZE);
@@ -434,16 +440,18 @@ static int __cmd_record(int argc, const char **argv)
434 exit(-1); 440 exit(-1);
435 } 441 }
436 442
437 if (!file_new) 443 header = perf_header__new();
438 header = perf_header__read(output);
439 else
440 header = perf_header__new();
441
442 if (header == NULL) { 444 if (header == NULL) {
443 pr_err("Not enough memory for reading perf file header\n"); 445 pr_err("Not enough memory for reading perf file header\n");
444 return -1; 446 return -1;
445 } 447 }
446 448
449 if (!file_new) {
450 err = perf_header__read(header, output);
451 if (err < 0)
452 return err;
453 }
454
447 if (raw_samples) { 455 if (raw_samples) {
448 perf_header__set_feat(header, HEADER_TRACE_INFO); 456 perf_header__set_feat(header, HEADER_TRACE_INFO);
449 } else { 457 } else {
@@ -472,8 +480,11 @@ static int __cmd_record(int argc, const char **argv)
472 } 480 }
473 } 481 }
474 482
475 if (file_new) 483 if (file_new) {
476 perf_header__write(header, output, false); 484 err = perf_header__write(header, output, false);
485 if (err < 0)
486 return err;
487 }
477 488
478 if (!system_wide) 489 if (!system_wide)
479 event__synthesize_thread(pid, process_synthesized_event); 490 event__synthesize_thread(pid, process_synthesized_event);
@@ -527,7 +538,7 @@ static int __cmd_record(int argc, const char **argv)
527 if (hits == samples) { 538 if (hits == samples) {
528 if (done) 539 if (done)
529 break; 540 break;
530 ret = poll(event_array, nr_poll, -1); 541 err = poll(event_array, nr_poll, -1);
531 waking++; 542 waking++;
532 } 543 }
533 544
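
builtin-record now splits header construction from header I/O: perf_header__new() always allocates, and perf_header__read()/perf_header__write() return negative error values that the caller propagates instead of die()ing inside the library. Folded into a helper (open_header() is a hypothetical name, not part of the patch), the pattern is:

	/*
	 * Minimal sketch of the new header lifecycle, assuming perf's
	 * util/header.h is available.  Error values follow the diff:
	 * NULL here means -ENOMEM or a failed read at the caller.
	 */
	static struct perf_header *open_header(int fd, bool existing_file)
	{
		struct perf_header *header = perf_header__new();

		if (header == NULL)
			return NULL;

		if (existing_file && perf_header__read(header, fd) < 0) {
			perf_header__delete(header);
			return NULL;
		}
		return header;
	}
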
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1a806d5f05cf..fe474b7f8ad0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -38,6 +38,7 @@ static char *dso_list_str, *comm_list_str, *sym_list_str,
38static struct strlist *dso_list, *comm_list, *sym_list; 38static struct strlist *dso_list, *comm_list, *sym_list;
39 39
40static int force; 40static int force;
41static bool use_modules;
41 42
42static int full_paths; 43static int full_paths;
43static int show_nr_samples; 44static int show_nr_samples;
@@ -51,6 +52,7 @@ static char *pretty_printing_style = default_pretty_printing_style;
51static int exclude_other = 1; 52static int exclude_other = 1;
52 53
53static char callchain_default_opt[] = "fractal,0.5"; 54static char callchain_default_opt[] = "fractal,0.5";
55const char *vmlinux_name;
54 56
55static char *cwd; 57static char *cwd;
56static int cwdlen; 58static int cwdlen;
@@ -448,7 +450,7 @@ got_map:
448 * trick of looking in the whole kernel symbol list. 450 * trick of looking in the whole kernel symbol list.
449 */ 451 */
450 if ((long long)ip < 0) 452 if ((long long)ip < 0)
451 return kernel_maps__find_symbol(ip, mapp); 453 return kernel_maps__find_symbol(ip, mapp, NULL);
452 } 454 }
453 dump_printf(" ...... dso: %s\n", 455 dump_printf(" ...... dso: %s\n",
454 map ? map->dso->long_name : "<not found>"); 456 map ? map->dso->long_name : "<not found>");
@@ -466,7 +468,7 @@ static int call__match(struct symbol *sym)
466 return 0; 468 return 0;
467} 469}
468 470
469static struct symbol **resolve_callchain(struct thread *thread, struct map *map, 471static struct symbol **resolve_callchain(struct thread *thread,
470 struct ip_callchain *chain, 472 struct ip_callchain *chain,
471 struct symbol **parent) 473 struct symbol **parent)
472{ 474{
@@ -495,10 +497,10 @@ static struct symbol **resolve_callchain(struct thread *thread, struct map *map,
495 case PERF_CONTEXT_HV: 497 case PERF_CONTEXT_HV:
496 break; 498 break;
497 case PERF_CONTEXT_KERNEL: 499 case PERF_CONTEXT_KERNEL:
498 sym = kernel_maps__find_symbol(ip, &map); 500 sym = kernel_maps__find_symbol(ip, NULL, NULL);
499 break; 501 break;
500 default: 502 default:
501 sym = resolve_symbol(thread, &map, &ip); 503 sym = resolve_symbol(thread, NULL, &ip);
502 break; 504 break;
503 } 505 }
504 506
@@ -528,7 +530,7 @@ hist_entry__add(struct thread *thread, struct map *map,
528 struct hist_entry *he; 530 struct hist_entry *he;
529 531
530 if ((sort__has_parent || callchain) && chain) 532 if ((sort__has_parent || callchain) && chain)
531 syms = resolve_callchain(thread, map, chain, &parent); 533 syms = resolve_callchain(thread, chain, &parent);
532 534
533 he = __hist_entry__add(thread, map, sym, parent, 535 he = __hist_entry__add(thread, map, sym, parent,
534 ip, count, level, &hit); 536 ip, count, level, &hit);
@@ -715,7 +717,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
715 717
716 if (cpumode == PERF_RECORD_MISC_KERNEL) { 718 if (cpumode == PERF_RECORD_MISC_KERNEL) {
717 level = 'k'; 719 level = 'k';
718 sym = kernel_maps__find_symbol(ip, &map); 720 sym = kernel_maps__find_symbol(ip, &map, NULL);
719 dump_printf(" ...... dso: %s\n", 721 dump_printf(" ...... dso: %s\n",
720 map ? map->dso->long_name : "<not found>"); 722 map ? map->dso->long_name : "<not found>");
721 } else if (cpumode == PERF_RECORD_MISC_USER) { 723 } else if (cpumode == PERF_RECORD_MISC_USER) {
@@ -924,8 +926,9 @@ static int __cmd_report(void)
924 926
925 register_perf_file_handler(&file_handler); 927 register_perf_file_handler(&file_handler);
926 928
927 ret = mmap_dispatch_perf_file(&header, input_name, force, full_paths, 929 ret = mmap_dispatch_perf_file(&header, input_name, vmlinux_name,
928 &cwdlen, &cwd); 930 !vmlinux_name, force,
931 full_paths, &cwdlen, &cwd);
929 if (ret) 932 if (ret)
930 return ret; 933 return ret;
931 934
@@ -1023,7 +1026,7 @@ static const struct option options[] = {
1023 "dump raw trace in ASCII"), 1026 "dump raw trace in ASCII"),
1024 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"), 1027 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
1025 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), 1028 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
1026 OPT_BOOLEAN('m', "modules", &modules, 1029 OPT_BOOLEAN('m', "modules", &use_modules,
1027 "load module symbols - WARNING: use only with -k and LIVE kernel"), 1030 "load module symbols - WARNING: use only with -k and LIVE kernel"),
1028 OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples, 1031 OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
1029 "Show a column with the number of samples"), 1032 "Show a column with the number of samples"),
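
The resolve_callchain() change can drop the map parameter because each chain entry carries its own context: values at or above PERF_CONTEXT_MAX are markers that switch resolution between kernel and user space for the addresses that follow, which is what the switch on PERF_CONTEXT_KERNEL above implements. A minimal sketch of the per-entry decision; chain_entry__resolve is a hypothetical helper, not code from the patch:

	/*
	 * One chain entry at a time: markers update *context and resolve
	 * to no symbol; real addresses are resolved according to the
	 * context most recently seen.
	 */
	static struct symbol *chain_entry__resolve(struct thread *thread,
						   u64 ip, u64 *context)
	{
		if (ip >= PERF_CONTEXT_MAX) {	/* marker, not a code address */
			*context = ip;
			return NULL;
		}
		if (*context == PERF_CONTEXT_KERNEL)
			return kernel_maps__find_symbol(ip, NULL, NULL);
		return resolve_symbol(thread, NULL, &ip);
	}
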
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index df44b756cecc..260f57a72ee0 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1718,7 +1718,8 @@ static int read_events(void)
1718 register_idle_thread(); 1718 register_idle_thread();
1719 register_perf_file_handler(&file_handler); 1719 register_perf_file_handler(&file_handler);
1720 1720
1721 return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd); 1721 return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0,
1722 &cwdlen, &cwd);
1722} 1723}
1723 1724
1724static void print_bad_events(void) 1725static void print_bad_events(void)
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 665877e4a944..dd4d82ac7aa4 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1093,7 +1093,7 @@ static void process_samples(void)
1093 1093
1094static int __cmd_timechart(void) 1094static int __cmd_timechart(void)
1095{ 1095{
1096 int ret, rc = EXIT_FAILURE; 1096 int err, rc = EXIT_FAILURE;
1097 unsigned long offset = 0; 1097 unsigned long offset = 0;
1098 unsigned long head, shift; 1098 unsigned long head, shift;
1099 struct stat statbuf; 1099 struct stat statbuf;
@@ -1111,8 +1111,8 @@ static int __cmd_timechart(void)
1111 exit(-1); 1111 exit(-1);
1112 } 1112 }
1113 1113
1114 ret = fstat(input, &statbuf); 1114 err = fstat(input, &statbuf);
1115 if (ret < 0) { 1115 if (err < 0) {
1116 perror("failed to stat file"); 1116 perror("failed to stat file");
1117 exit(-1); 1117 exit(-1);
1118 } 1118 }
@@ -1122,7 +1122,16 @@ static int __cmd_timechart(void)
1122 exit(0); 1122 exit(0);
1123 } 1123 }
1124 1124
1125 header = perf_header__read(input); 1125 header = perf_header__new();
1126 if (header == NULL)
1127 return -ENOMEM;
1128
1129 err = perf_header__read(header, input);
1130 if (err < 0) {
1131 perf_header__delete(header);
1132 return err;
1133 }
1134
1126 head = header->data_offset; 1135 head = header->data_offset;
1127 1136
1128 sample_type = perf_header__sample_type(header); 1137 sample_type = perf_header__sample_type(header);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 07b92c378ae2..6a5de90e9b83 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -79,13 +79,7 @@ static int dump_symtab = 0;
79static bool hide_kernel_symbols = false; 79static bool hide_kernel_symbols = false;
80static bool hide_user_symbols = false; 80static bool hide_user_symbols = false;
81static struct winsize winsize; 81static struct winsize winsize;
82static const char *graph_line = 82const char *vmlinux_name;
83 "_____________________________________________________________________"
84 "_____________________________________________________________________";
85static const char *graph_dotted_line =
86 "---------------------------------------------------------------------"
87 "---------------------------------------------------------------------"
88 "---------------------------------------------------------------------";
89 83
90/* 84/*
91 * Source 85 * Source
@@ -830,6 +824,8 @@ static void handle_keypress(int c)
830 case 'q': 824 case 'q':
831 case 'Q': 825 case 'Q':
832 printf("exiting.\n"); 826 printf("exiting.\n");
827 if (dump_symtab)
828 dsos__fprintf(stderr);
833 exit(0); 829 exit(0);
834 case 's': 830 case 's':
835 prompt_symbol(&sym_filter_entry, "Enter details symbol"); 831 prompt_symbol(&sym_filter_entry, "Enter details symbol");
@@ -946,17 +942,6 @@ static int symbol_filter(struct map *map, struct symbol *sym)
946 return 0; 942 return 0;
947} 943}
948 944
949static int parse_symbols(void)
950{
951 if (dsos__load_kernel(vmlinux_name, symbol_filter, 1) <= 0)
952 return -1;
953
954 if (dump_symtab)
955 dsos__fprintf(stderr);
956
957 return 0;
958}
959
960static void event__process_sample(const event_t *self, int counter) 945static void event__process_sample(const event_t *self, int counter)
961{ 946{
962 u64 ip = self->ip.ip; 947 u64 ip = self->ip.ip;
@@ -999,7 +984,7 @@ static void event__process_sample(const event_t *self, int counter)
999 if (hide_kernel_symbols) 984 if (hide_kernel_symbols)
1000 return; 985 return;
1001 986
1002 sym = kernel_maps__find_symbol(ip, &map); 987 sym = kernel_maps__find_symbol(ip, &map, symbol_filter);
1003 if (sym == NULL) 988 if (sym == NULL)
1004 return; 989 return;
1005 break; 990 break;
@@ -1326,7 +1311,7 @@ static const struct option options[] = {
1326 1311
1327int cmd_top(int argc, const char **argv, const char *prefix __used) 1312int cmd_top(int argc, const char **argv, const char *prefix __used)
1328{ 1313{
1329 int counter; 1314 int counter, err;
1330 1315
1331 page_size = sysconf(_SC_PAGE_SIZE); 1316 page_size = sysconf(_SC_PAGE_SIZE);
1332 1317
@@ -1350,10 +1335,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1350 if (delay_secs < 1) 1335 if (delay_secs < 1)
1351 delay_secs = 1; 1336 delay_secs = 1;
1352 1337
1353 parse_symbols(); 1338 err = kernel_maps__init(vmlinux_name, !vmlinux_name, true);
1339 if (err < 0)
1340 return err;
1354 parse_source(sym_filter_entry); 1341 parse_source(sym_filter_entry);
1355 1342
1356
1357 /* 1343 /*
1358 * User specified count overrides default frequency. 1344 * User specified count overrides default frequency.
1359 */ 1345 */
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d042d656c561..b71198e5dc14 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -131,7 +131,8 @@ static int __cmd_trace(void)
131 register_idle_thread(); 131 register_idle_thread();
132 register_perf_file_handler(&file_handler); 132 register_perf_file_handler(&file_handler);
133 133
134 return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd); 134 return mmap_dispatch_perf_file(&header, input_name, NULL, false,
135 0, 0, &cwdlen, &cwd);
135} 136}
136 137
137static const char * const annotate_usage[] = { 138static const char * const annotate_usage[] = {
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85091fe..a3d8bf65f26c 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix);
28extern int cmd_trace(int argc, const char **argv, const char *prefix); 28extern int cmd_trace(int argc, const char **argv, const char *prefix);
29extern int cmd_version(int argc, const char **argv, const char *prefix); 29extern int cmd_version(int argc, const char **argv, const char *prefix);
30extern int cmd_probe(int argc, const char **argv, const char *prefix); 30extern int cmd_probe(int argc, const char **argv, const char *prefix);
31extern int cmd_kmem(int argc, const char **argv, const char *prefix);
31 32
32#endif 33#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index d3a6e18e4a5e..02b09ea17a3e 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -14,3 +14,4 @@ perf-timechart mainporcelain common
14perf-top mainporcelain common 14perf-top mainporcelain common
15perf-trace mainporcelain common 15perf-trace mainporcelain common
16perf-probe mainporcelain common 16perf-probe mainporcelain common
17perf-kmem mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82acac7d9..cf64049bc9bd 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
285{ 285{
286 const char *cmd = argv[0]; 286 const char *cmd = argv[0];
287 static struct cmd_struct commands[] = { 287 static struct cmd_struct commands[] = {
288 { "help", cmd_help, 0 },
289 { "list", cmd_list, 0 },
290 { "buildid-list", cmd_buildid_list, 0 }, 288 { "buildid-list", cmd_buildid_list, 0 },
291 { "record", cmd_record, 0 }, 289 { "help", cmd_help, 0 },
292 { "report", cmd_report, 0 }, 290 { "list", cmd_list, 0 },
293 { "bench", cmd_bench, 0 }, 291 { "record", cmd_record, 0 },
294 { "stat", cmd_stat, 0 }, 292 { "report", cmd_report, 0 },
295 { "timechart", cmd_timechart, 0 }, 293 { "bench", cmd_bench, 0 },
296 { "top", cmd_top, 0 }, 294 { "stat", cmd_stat, 0 },
297 { "annotate", cmd_annotate, 0 }, 295 { "timechart", cmd_timechart, 0 },
298 { "version", cmd_version, 0 }, 296 { "top", cmd_top, 0 },
299 { "trace", cmd_trace, 0 }, 297 { "annotate", cmd_annotate, 0 },
300 { "sched", cmd_sched, 0 }, 298 { "version", cmd_version, 0 },
301 { "probe", cmd_probe, 0 }, 299 { "trace", cmd_trace, 0 },
300 { "sched", cmd_sched, 0 },
301 { "probe", cmd_probe, 0 },
302 { "kmem", cmd_kmem, 0 },
302 }; 303 };
303 unsigned int i; 304 unsigned int i;
304 static const char ext[] = STRIP_EXTENSION; 305 static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
index 0b791bd346bc..35073621e5de 100644
--- a/tools/perf/util/ctype.c
+++ b/tools/perf/util/ctype.c
@@ -29,3 +29,11 @@ unsigned char sane_ctype[256] = {
29 A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0, /* 112..127 */ 29 A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0, /* 112..127 */
30 /* Nothing in the 128.. range */ 30 /* Nothing in the 128.. range */
31}; 31};
32
33const char *graph_line =
34 "_____________________________________________________________________"
35 "_____________________________________________________________________";
36const char *graph_dotted_line =
37 "---------------------------------------------------------------------"
38 "---------------------------------------------------------------------"
39 "---------------------------------------------------------------------";
diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c
index 14cb8465eb08..f318d19b2562 100644
--- a/tools/perf/util/data_map.c
+++ b/tools/perf/util/data_map.c
@@ -101,12 +101,14 @@ out:
101 101
102int mmap_dispatch_perf_file(struct perf_header **pheader, 102int mmap_dispatch_perf_file(struct perf_header **pheader,
103 const char *input_name, 103 const char *input_name,
104 const char *vmlinux_name,
105 bool try_vmlinux_path,
104 int force, 106 int force,
105 int full_paths, 107 int full_paths,
106 int *cwdlen, 108 int *cwdlen,
107 char **cwd) 109 char **cwd)
108{ 110{
109 int ret, rc = EXIT_FAILURE; 111 int err;
110 struct perf_header *header; 112 struct perf_header *header;
111 unsigned long head, shift; 113 unsigned long head, shift;
112 unsigned long offset = 0; 114 unsigned long offset = 0;
@@ -118,56 +120,69 @@ int mmap_dispatch_perf_file(struct perf_header **pheader,
118 int input; 120 int input;
119 char *buf; 121 char *buf;
120 122
121 if (!curr_handler) 123 if (curr_handler == NULL) {
122 die("Forgot to register perf file handler"); 124 pr_debug("Forgot to register perf file handler\n");
125 return -EINVAL;
126 }
123 127
124 page_size = getpagesize(); 128 page_size = getpagesize();
125 129
126 input = open(input_name, O_RDONLY); 130 input = open(input_name, O_RDONLY);
127 if (input < 0) { 131 if (input < 0) {
128 fprintf(stderr, " failed to open file: %s", input_name); 132 pr_err("Failed to open file: %s", input_name);
129 if (!strcmp(input_name, "perf.data")) 133 if (!strcmp(input_name, "perf.data"))
130 fprintf(stderr, " (try 'perf record' first)"); 134 pr_err(" (try 'perf record' first)");
131 fprintf(stderr, "\n"); 135 pr_err("\n");
132 exit(-1); 136 return -errno;
133 } 137 }
134 138
135 ret = fstat(input, &input_stat); 139 if (fstat(input, &input_stat) < 0) {
136 if (ret < 0) { 140 pr_err("failed to stat file");
137 perror("failed to stat file"); 141 err = -errno;
138 exit(-1); 142 goto out_close;
139 } 143 }
140 144
145 err = -EACCES;
141 if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) { 146 if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
142 fprintf(stderr, "file: %s not owned by current user or root\n", 147 pr_err("file: %s not owned by current user or root\n",
143 input_name); 148 input_name);
144 exit(-1); 149 goto out_close;
145 } 150 }
146 151
147 if (!input_stat.st_size) { 152 if (input_stat.st_size == 0) {
148 fprintf(stderr, "zero-sized file, nothing to do!\n"); 153 pr_info("zero-sized file, nothing to do!\n");
149 exit(0); 154 goto done;
150 } 155 }
151 156
152 *pheader = perf_header__read(input); 157 err = -ENOMEM;
153 header = *pheader; 158 header = perf_header__new();
159 if (header == NULL)
160 goto out_close;
161
162 err = perf_header__read(header, input);
163 if (err < 0)
164 goto out_delete;
165 *pheader = header;
154 head = header->data_offset; 166 head = header->data_offset;
155 167
156 sample_type = perf_header__sample_type(header); 168 sample_type = perf_header__sample_type(header);
157 169
158 if (curr_handler->sample_type_check) 170 err = -EINVAL;
159 if (curr_handler->sample_type_check(sample_type) < 0) 171 if (curr_handler->sample_type_check &&
160 exit(-1); 172 curr_handler->sample_type_check(sample_type) < 0)
173 goto out_delete;
161 174
162 if (load_kernel(NULL) < 0) { 175 err = -ENOMEM;
163 perror("failed to load kernel symbols"); 176 if (kernel_maps__init(vmlinux_name, try_vmlinux_path, true) < 0) {
164 return EXIT_FAILURE; 177 pr_err("failed to setup the kernel maps to resolve symbols\n");
178 goto out_delete;
165 } 179 }
166 180
167 if (!full_paths) { 181 if (!full_paths) {
168 if (getcwd(__cwd, sizeof(__cwd)) == NULL) { 182 if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
169 perror("failed to get the current directory"); 183 pr_err("failed to get the current directory\n");
170 return EXIT_FAILURE; 184 err = -errno;
185 goto out_delete;
171 } 186 }
172 *cwd = __cwd; 187 *cwd = __cwd;
173 *cwdlen = strlen(*cwd); 188 *cwdlen = strlen(*cwd);
@@ -181,11 +196,12 @@ int mmap_dispatch_perf_file(struct perf_header **pheader,
181 head -= shift; 196 head -= shift;
182 197
183remap: 198remap:
184 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, 199 buf = mmap(NULL, page_size * mmap_window, PROT_READ,
185 MAP_SHARED, input, offset); 200 MAP_SHARED, input, offset);
186 if (buf == MAP_FAILED) { 201 if (buf == MAP_FAILED) {
187 perror("failed to mmap file"); 202 pr_err("failed to mmap file\n");
188 exit(-1); 203 err = -errno;
204 goto out_delete;
189 } 205 }
190 206
191more: 207more:
@@ -242,10 +258,12 @@ more:
242 goto more; 258 goto more;
243 259
244done: 260done:
245 rc = EXIT_SUCCESS; 261 err = 0;
262out_close:
246 close(input); 263 close(input);
247 264
248 return rc; 265 return err;
266out_delete:
267 perf_header__delete(header);
268 goto out_close;
249} 269}
250
251
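
data_map.c shows the error-handling convention this series converges on: library code returns negative errno values and unwinds through goto labels instead of calling exit(-1) deep in a helper. Reduced to its skeleton (dispatch() is a hypothetical stand-in for mmap_dispatch_perf_file(), compiled against perf's util/header.h plus fcntl.h/unistd.h/errno.h):

	/*
	 * Sketch of the out_close/out_delete unwind in the hunk above:
	 * every failure path sets err and jumps to the right label, so
	 * resources are released exactly once.
	 */
	int dispatch(const char *name, struct perf_header **pheader)
	{
		struct perf_header *header;
		int err, fd = open(name, O_RDONLY);

		if (fd < 0)
			return -errno;

		err = -ENOMEM;
		header = perf_header__new();
		if (header == NULL)
			goto out_close;

		err = perf_header__read(header, fd);
		if (err < 0)
			goto out_delete;

		*pheader = header;	/* caller owns the header on success */
		err = 0;
	out_close:
		close(fd);
		return err;
	out_delete:
		perf_header__delete(header);
		goto out_close;
	}
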
diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h
index ae036ecd7625..3f0d21b3819e 100644
--- a/tools/perf/util/data_map.h
+++ b/tools/perf/util/data_map.h
@@ -23,6 +23,8 @@ struct perf_file_handler {
23void register_perf_file_handler(struct perf_file_handler *handler); 23void register_perf_file_handler(struct perf_file_handler *handler);
24int mmap_dispatch_perf_file(struct perf_header **pheader, 24int mmap_dispatch_perf_file(struct perf_header **pheader,
25 const char *input_name, 25 const char *input_name,
26 const char *vmlinux_name,
27 bool try_vmlinux_path,
26 int force, 28 int force,
27 int full_paths, 29 int full_paths,
28 int *cwdlen, 30 int *cwdlen,
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 1f771ce3a957..f1e392612652 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -69,13 +69,6 @@ struct build_id_event {
69 char filename[]; 69 char filename[];
70}; 70};
71 71
72struct build_id_list {
73 struct build_id_event event;
74 struct list_head list;
75 const char *dso_name;
76 int len;
77};
78
79typedef union event_union { 72typedef union event_union {
80 struct perf_event_header header; 73 struct perf_event_header header;
81 struct ip_event ip; 74 struct ip_event ip;
@@ -122,10 +115,13 @@ typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
122void map__init(struct map *self, u64 start, u64 end, u64 pgoff, 115void map__init(struct map *self, u64 start, u64 end, u64 pgoff,
123 struct dso *dso); 116 struct dso *dso);
124struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen); 117struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen);
118void map__delete(struct map *self);
125struct map *map__clone(struct map *self); 119struct map *map__clone(struct map *self);
126int map__overlap(struct map *l, struct map *r); 120int map__overlap(struct map *l, struct map *r);
127size_t map__fprintf(struct map *self, FILE *fp); 121size_t map__fprintf(struct map *self, FILE *fp);
128struct symbol *map__find_symbol(struct map *self, u64 ip, symbol_filter_t filter); 122struct symbol *map__find_symbol(struct map *self, u64 ip, symbol_filter_t filter);
123void map__fixup_start(struct map *self);
124void map__fixup_end(struct map *self);
129 125
130int event__synthesize_thread(pid_t pid, int (*process)(event_t *event)); 126int event__synthesize_thread(pid_t pid, int (*process)(event_t *event));
131void event__synthesize_threads(int (*process)(event_t *event)); 127void event__synthesize_threads(int (*process)(event_t *event));
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b01a9537977f..1332f8ec04aa 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -78,16 +78,24 @@ struct perf_header *perf_header__new(void)
78 return self; 78 return self;
79} 79}
80 80
81void perf_header__delete(struct perf_header *self)
82{
83 int i;
84
85 for (i = 0; i < self->attrs; ++i)
86 perf_header_attr__delete(self->attr[i]);
87
88 free(self->attr);
89 free(self);
90}
91
81int perf_header__add_attr(struct perf_header *self, 92int perf_header__add_attr(struct perf_header *self,
82 struct perf_header_attr *attr) 93 struct perf_header_attr *attr)
83{ 94{
84 int pos = self->attrs;
85
86 if (self->frozen) 95 if (self->frozen)
87 return -1; 96 return -1;
88 97
89 self->attrs++; 98 if (self->attrs == self->size) {
90 if (self->attrs > self->size) {
91 int nsize = self->size * 2; 99 int nsize = self->size * 2;
92 struct perf_header_attr **nattr; 100 struct perf_header_attr **nattr;
93 101
@@ -98,7 +106,8 @@ int perf_header__add_attr(struct perf_header *self,
98 self->size = nsize; 106 self->size = nsize;
99 self->attr = nattr; 107 self->attr = nattr;
100 } 108 }
101 self->attr[pos] = attr; 109
110 self->attr[self->attrs++] = attr;
102 return 0; 111 return 0;
103} 112}
104 113
@@ -167,7 +176,7 @@ static int do_write(int fd, const void *buf, size_t size)
167 int ret = write(fd, buf, size); 176 int ret = write(fd, buf, size);
168 177
169 if (ret < 0) 178 if (ret < 0)
170 return -1; 179 return -errno;
171 180
172 size -= ret; 181 size -= ret;
173 buf += ret; 182 buf += ret;
@@ -176,43 +185,51 @@ static int do_write(int fd, const void *buf, size_t size)
176 return 0; 185 return 0;
177} 186}
178 187
179static int write_buildid_table(int fd, struct list_head *id_head) 188static int dsos__write_buildid_table(int fd)
180{ 189{
181 struct build_id_list *iter, *next; 190 struct dso *pos;
182 191
183 list_for_each_entry_safe(iter, next, id_head, list) { 192 list_for_each_entry(pos, &dsos, node) {
184 struct build_id_event *b = &iter->event; 193 int err;
185 194 struct build_id_event b;
186 if (do_write(fd, b, sizeof(*b)) < 0 || 195 size_t len;
187 do_write(fd, iter->dso_name, iter->len) < 0) 196
188 return -1; 197 if (!pos->has_build_id)
189 list_del(&iter->list); 198 continue;
190 free(iter); 199 len = pos->long_name_len + 1;
200 len = ALIGN(len, 64);
201 memset(&b, 0, sizeof(b));
202 memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
203 b.header.size = sizeof(b) + len;
204 err = do_write(fd, &b, sizeof(b));
205 if (err < 0)
206 return err;
207 err = do_write(fd, pos->long_name, len);
208 if (err < 0)
209 return err;
191 } 210 }
192 211
193 return 0; 212 return 0;
194} 213}
195 214
196static void 215static int perf_header__adds_write(struct perf_header *self, int fd)
197perf_header__adds_write(struct perf_header *self, int fd)
198{ 216{
199 LIST_HEAD(id_list);
200 int nr_sections; 217 int nr_sections;
201 struct perf_file_section *feat_sec; 218 struct perf_file_section *feat_sec;
202 int sec_size; 219 int sec_size;
203 u64 sec_start; 220 u64 sec_start;
204 int idx = 0; 221 int idx = 0, err;
205 222
206 if (fetch_build_id_table(&id_list)) 223 if (dsos__read_build_ids())
207 perf_header__set_feat(self, HEADER_BUILD_ID); 224 perf_header__set_feat(self, HEADER_BUILD_ID);
208 225
209 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); 226 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
210 if (!nr_sections) 227 if (!nr_sections)
211 return; 228 return 0;
212 229
213 feat_sec = calloc(sizeof(*feat_sec), nr_sections); 230 feat_sec = calloc(sizeof(*feat_sec), nr_sections);
214 if (!feat_sec) 231 if (feat_sec == NULL)
215 die("No memory"); 232 return -ENOMEM;
216 233
217 sec_size = sizeof(*feat_sec) * nr_sections; 234 sec_size = sizeof(*feat_sec) * nr_sections;
218 235
@@ -236,25 +253,37 @@ perf_header__adds_write(struct perf_header *self, int fd)
236 253
237 buildid_sec = &feat_sec[idx++]; 254 buildid_sec = &feat_sec[idx++];
238 255
256 /*
 257 * Read the kernel buildid and the list of loaded modules with
 258 * their build_ids:
259 */
260 kernel_maps__init(NULL, false, true);
261
239 /* Write build-ids */ 262 /* Write build-ids */
240 buildid_sec->offset = lseek(fd, 0, SEEK_CUR); 263 buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
241 if (write_buildid_table(fd, &id_list) < 0) 264 err = dsos__write_buildid_table(fd);
242 die("failed to write buildid table"); 265 if (err < 0) {
266 pr_debug("failed to write buildid table\n");
267 goto out_free;
268 }
243 buildid_sec->size = lseek(fd, 0, SEEK_CUR) - buildid_sec->offset; 269 buildid_sec->size = lseek(fd, 0, SEEK_CUR) - buildid_sec->offset;
244 } 270 }
245 271
246 lseek(fd, sec_start, SEEK_SET); 272 lseek(fd, sec_start, SEEK_SET);
247 if (do_write(fd, feat_sec, sec_size) < 0) 273 err = do_write(fd, feat_sec, sec_size);
248 die("failed to write feature section"); 274 if (err < 0)
275 pr_debug("failed to write feature section\n");
276out_free:
249 free(feat_sec); 277 free(feat_sec);
278 return err;
250} 279}
251 280
252void perf_header__write(struct perf_header *self, int fd, bool at_exit) 281int perf_header__write(struct perf_header *self, int fd, bool at_exit)
253{ 282{
254 struct perf_file_header f_header; 283 struct perf_file_header f_header;
255 struct perf_file_attr f_attr; 284 struct perf_file_attr f_attr;
256 struct perf_header_attr *attr; 285 struct perf_header_attr *attr;
257 int i; 286 int i, err;
258 287
259 lseek(fd, sizeof(f_header), SEEK_SET); 288 lseek(fd, sizeof(f_header), SEEK_SET);
260 289
@@ -263,8 +292,11 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit)
263 attr = self->attr[i]; 292 attr = self->attr[i];
264 293
265 attr->id_offset = lseek(fd, 0, SEEK_CUR); 294 attr->id_offset = lseek(fd, 0, SEEK_CUR);
266 if (do_write(fd, attr->id, attr->ids * sizeof(u64)) < 0) 295 err = do_write(fd, attr->id, attr->ids * sizeof(u64));
267 die("failed to write perf header"); 296 if (err < 0) {
297 pr_debug("failed to write perf header\n");
298 return err;
299 }
268 } 300 }
269 301
270 302
@@ -280,20 +312,30 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit)
280 .size = attr->ids * sizeof(u64), 312 .size = attr->ids * sizeof(u64),
281 } 313 }
282 }; 314 };
283 if (do_write(fd, &f_attr, sizeof(f_attr)) < 0) 315 err = do_write(fd, &f_attr, sizeof(f_attr));
284 die("failed to write perf header attribute"); 316 if (err < 0) {
317 pr_debug("failed to write perf header attribute\n");
318 return err;
319 }
285 } 320 }
286 321
287 self->event_offset = lseek(fd, 0, SEEK_CUR); 322 self->event_offset = lseek(fd, 0, SEEK_CUR);
288 self->event_size = event_count * sizeof(struct perf_trace_event_type); 323 self->event_size = event_count * sizeof(struct perf_trace_event_type);
289 if (events) 324 if (events) {
290 if (do_write(fd, events, self->event_size) < 0) 325 err = do_write(fd, events, self->event_size);
291 die("failed to write perf header events"); 326 if (err < 0) {
327 pr_debug("failed to write perf header events\n");
328 return err;
329 }
330 }
292 331
293 self->data_offset = lseek(fd, 0, SEEK_CUR); 332 self->data_offset = lseek(fd, 0, SEEK_CUR);
294 333
295 if (at_exit) 334 if (at_exit) {
296 perf_header__adds_write(self, fd); 335 err = perf_header__adds_write(self, fd);
336 if (err < 0)
337 return err;
338 }
297 339
298 f_header = (struct perf_file_header){ 340 f_header = (struct perf_file_header){
299 .magic = PERF_MAGIC, 341 .magic = PERF_MAGIC,
@@ -316,11 +358,15 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit)
316 memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features)); 358 memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features));
317 359
318 lseek(fd, 0, SEEK_SET); 360 lseek(fd, 0, SEEK_SET);
319 if (do_write(fd, &f_header, sizeof(f_header)) < 0) 361 err = do_write(fd, &f_header, sizeof(f_header));
320 die("failed to write perf header"); 362 if (err < 0) {
363 pr_debug("failed to write perf header\n");
364 return err;
365 }
321 lseek(fd, self->data_offset + self->data_size, SEEK_SET); 366 lseek(fd, self->data_offset + self->data_size, SEEK_SET);
322 367
323 self->frozen = 1; 368 self->frozen = 1;
369 return 0;
324} 370}
325 371
326static void do_read(int fd, void *buf, size_t size) 372static void do_read(int fd, void *buf, size_t size)
@@ -430,19 +476,17 @@ static int perf_file_section__process(struct perf_file_section *self,
430 return 0; 476 return 0;
431} 477}
432 478
433struct perf_header *perf_header__read(int fd) 479int perf_header__read(struct perf_header *self, int fd)
434{ 480{
435 struct perf_header *self = perf_header__new();
436 struct perf_file_header f_header; 481 struct perf_file_header f_header;
437 struct perf_file_attr f_attr; 482 struct perf_file_attr f_attr;
438 u64 f_id; 483 u64 f_id;
439 int nr_attrs, nr_ids, i, j; 484 int nr_attrs, nr_ids, i, j;
440 485
441 if (self == NULL) 486 if (perf_file_header__read(&f_header, self, fd) < 0) {
442 die("nomem"); 487 pr_debug("incompatible file format\n");
443 488 return -EINVAL;
444 if (perf_file_header__read(&f_header, self, fd) < 0) 489 }
445 die("incompatible file format");
446 490
447 nr_attrs = f_header.attrs.size / sizeof(f_attr); 491 nr_attrs = f_header.attrs.size / sizeof(f_attr);
448 lseek(fd, f_header.attrs.offset, SEEK_SET); 492 lseek(fd, f_header.attrs.offset, SEEK_SET);
@@ -456,7 +500,7 @@ struct perf_header *perf_header__read(int fd)
456 500
457 attr = perf_header_attr__new(&f_attr.attr); 501 attr = perf_header_attr__new(&f_attr.attr);
458 if (attr == NULL) 502 if (attr == NULL)
459 die("nomem"); 503 return -ENOMEM;
460 504
461 nr_ids = f_attr.ids.size / sizeof(u64); 505 nr_ids = f_attr.ids.size / sizeof(u64);
462 lseek(fd, f_attr.ids.offset, SEEK_SET); 506 lseek(fd, f_attr.ids.offset, SEEK_SET);
@@ -464,11 +508,15 @@ struct perf_header *perf_header__read(int fd)
464 for (j = 0; j < nr_ids; j++) { 508 for (j = 0; j < nr_ids; j++) {
465 do_read(fd, &f_id, sizeof(f_id)); 509 do_read(fd, &f_id, sizeof(f_id));
466 510
467 if (perf_header_attr__add_id(attr, f_id) < 0) 511 if (perf_header_attr__add_id(attr, f_id) < 0) {
468 die("nomem"); 512 perf_header_attr__delete(attr);
513 return -ENOMEM;
514 }
515 }
516 if (perf_header__add_attr(self, attr) < 0) {
517 perf_header_attr__delete(attr);
518 return -ENOMEM;
469 } 519 }
470 if (perf_header__add_attr(self, attr) < 0)
471 die("nomem");
472 520
473 lseek(fd, tmp, SEEK_SET); 521 lseek(fd, tmp, SEEK_SET);
474 } 522 }
@@ -476,8 +524,8 @@ struct perf_header *perf_header__read(int fd)
476 if (f_header.event_types.size) { 524 if (f_header.event_types.size) {
477 lseek(fd, f_header.event_types.offset, SEEK_SET); 525 lseek(fd, f_header.event_types.offset, SEEK_SET);
478 events = malloc(f_header.event_types.size); 526 events = malloc(f_header.event_types.size);
479 if (!events) 527 if (events == NULL)
480 die("nomem"); 528 return -ENOMEM;
481 do_read(fd, events, f_header.event_types.size); 529 do_read(fd, events, f_header.event_types.size);
482 event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type); 530 event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type);
483 } 531 }
@@ -487,8 +535,7 @@ struct perf_header *perf_header__read(int fd)
487 lseek(fd, self->data_offset, SEEK_SET); 535 lseek(fd, self->data_offset, SEEK_SET);
488 536
489 self->frozen = 1; 537 self->frozen = 1;
490 538 return 0;
491 return self;
492} 539}
493 540
494u64 perf_header__sample_type(struct perf_header *header) 541u64 perf_header__sample_type(struct perf_header *header)
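For reference, the error convention do_write() settles on above (retry short writes, return -errno on failure, 0 on success) can be sketched in isolation; this is a hand-written illustration of that loop, not a copy of the perf source:

#include <errno.h>
#include <unistd.h>

/* Write all `size` bytes, retrying on short writes; returns 0 or -errno. */
static int write_all(int fd, const void *buf, size_t size)
{
	while (size) {
		ssize_t ret = write(fd, buf, size);

		if (ret < 0)
			return -errno;

		size -= ret;
		buf += ret;	/* void * arithmetic: GCC extension, as used by perf itself */
	}
	return 0;
}

Callers such as perf_header__write() can then propagate the negative errno straight up instead of die()ing.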
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index f46a94e09eea..d1dbe2b79c42 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -55,8 +55,11 @@ struct perf_header {
55 DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); 55 DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
56}; 56};
57 57
58struct perf_header *perf_header__read(int fd); 58struct perf_header *perf_header__new(void);
59void perf_header__write(struct perf_header *self, int fd, bool at_exit); 59void perf_header__delete(struct perf_header *self);
60
61int perf_header__read(struct perf_header *self, int fd);
62int perf_header__write(struct perf_header *self, int fd, bool at_exit);
60 63
61int perf_header__add_attr(struct perf_header *self, 64int perf_header__add_attr(struct perf_header *self,
62 struct perf_header_attr *attr); 65 struct perf_header_attr *attr);
@@ -75,8 +78,6 @@ perf_header__find_attr(u64 id, struct perf_header *header);
75void perf_header__set_feat(struct perf_header *self, int feat); 78void perf_header__set_feat(struct perf_header *self, int feat);
76bool perf_header__has_feat(const struct perf_header *self, int feat); 79bool perf_header__has_feat(const struct perf_header *self, int feat);
77 80
78struct perf_header *perf_header__new(void);
79
80int perf_header__process_sections(struct perf_header *self, int fd, 81int perf_header__process_sections(struct perf_header *self, int fd,
81 int (*process)(struct perf_file_section *self, 82 int (*process)(struct perf_file_section *self,
82 int feat, int fd)); 83 int feat, int fd));
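Put together, the new header API gives callers a constructor/read/destructor triple instead of a die()ing factory. A minimal sketch of a caller, assuming perf's internal util/header.h and an already-open perf.data descriptor (open_header() is a hypothetical name):

#include <errno.h>
#include "util/header.h"	/* perf-internal, builds only inside tools/perf */

static int open_header(int input_fd)
{
	struct perf_header *h = perf_header__new();

	if (h == NULL)
		return -ENOMEM;

	if (perf_header__read(h, input_fd) < 0) {
		perf_header__delete(h);	/* destructor introduced above */
		return -EINVAL;
	}

	/* ... consume h->attrs, h->data_offset, ... */
	perf_header__delete(h);
	return 0;
}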
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index ace57c36d1d0..8d63116e9435 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -7,6 +7,8 @@
7#define CONFIG_GENERIC_FIND_FIRST_BIT 7#define CONFIG_GENERIC_FIND_FIRST_BIT
8#include "../../../../include/linux/bitops.h" 8#include "../../../../include/linux/bitops.h"
9 9
10#undef __KERNEL__
11
10static inline void set_bit(int nr, unsigned long *addr) 12static inline void set_bit(int nr, unsigned long *addr)
11{ 13{
12 addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); 14 addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 94ca95073c40..09412321a80d 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -75,6 +75,29 @@ out_delete:
75 return NULL; 75 return NULL;
76} 76}
77 77
78void map__delete(struct map *self)
79{
80 free(self);
81}
82
83void map__fixup_start(struct map *self)
84{
85 struct rb_node *nd = rb_first(&self->dso->syms);
86 if (nd != NULL) {
87 struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
88 self->start = sym->start;
89 }
90}
91
92void map__fixup_end(struct map *self)
93{
94 struct rb_node *nd = rb_last(&self->dso->syms);
95 if (nd != NULL) {
96 struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
97 self->end = sym->end;
98 }
99}
100
78#define DSO__DELETED "(deleted)" 101#define DSO__DELETED "(deleted)"
79 102
80struct symbol * 103struct symbol *
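The two fixup helpers shrink a map that was created with a placeholder range down to its first and last loaded symbols (dso->syms is an rb-tree sorted by address, so rb_first()/rb_last() are the extremes). The symbol.c change further down uses them exactly like this hypothetical fragment, after a successful symbol load:

	/* map->dso->syms is now populated */
	map__fixup_start(map);	/* start = lowest symbol start */
	map__fixup_end(map);	/* end = highest symbol end */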
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 0faf4f2bb5ca..070027469270 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,4 +1,4 @@
1 1#include "../../../include/linux/hw_breakpoint.h"
2#include "util.h" 2#include "util.h"
3#include "../perf.h" 3#include "../perf.h"
4#include "parse-options.h" 4#include "parse-options.h"
@@ -540,6 +540,81 @@ static enum event_result parse_tracepoint_event(const char **strp,
540 attr, strp); 540 attr, strp);
541} 541}
542 542
543static enum event_result
544parse_breakpoint_type(const char *type, const char **strp,
545 struct perf_event_attr *attr)
546{
547 int i;
548
549 for (i = 0; i < 3; i++) {
550 if (!type[i])
551 break;
552
553 switch (type[i]) {
554 case 'r':
555 attr->bp_type |= HW_BREAKPOINT_R;
556 break;
557 case 'w':
558 attr->bp_type |= HW_BREAKPOINT_W;
559 break;
560 case 'x':
561 attr->bp_type |= HW_BREAKPOINT_X;
562 break;
563 default:
564 return EVT_FAILED;
565 }
566 }
567 if (!attr->bp_type) /* Default */
568 attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
569
570 *strp = type + i;
571
572 return EVT_HANDLED;
573}
574
575static enum event_result
576parse_breakpoint_event(const char **strp, struct perf_event_attr *attr)
577{
578 const char *target;
579 const char *type;
580 char *endaddr;
581 u64 addr;
582 enum event_result err;
583
584 target = strchr(*strp, ':');
585 if (!target)
586 return EVT_FAILED;
587
588 if (strncmp(*strp, "mem", target - *strp) != 0)
589 return EVT_FAILED;
590
591 target++;
592
593 addr = strtoull(target, &endaddr, 0);
594 if (target == endaddr)
595 return EVT_FAILED;
596
597 attr->bp_addr = addr;
598 *strp = endaddr;
599
600 type = strchr(target, ':');
601
 602 /* If no type is defined, default to rw */
603 if (!type) {
604 attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
605 } else {
606 err = parse_breakpoint_type(++type, strp, attr);
607 if (err == EVT_FAILED)
608 return EVT_FAILED;
609 }
610
611 /* We should find a nice way to override the access type */
612 attr->bp_len = HW_BREAKPOINT_LEN_4;
613 attr->type = PERF_TYPE_BREAKPOINT;
614
615 return EVT_HANDLED;
616}
617
543static int check_events(const char *str, unsigned int i) 618static int check_events(const char *str, unsigned int i)
544{ 619{
545 int n; 620 int n;
@@ -673,6 +748,10 @@ parse_event_symbols(const char **str, struct perf_event_attr *attr)
673 if (ret != EVT_FAILED) 748 if (ret != EVT_FAILED)
674 goto modifier; 749 goto modifier;
675 750
751 ret = parse_breakpoint_event(str, attr);
752 if (ret != EVT_FAILED)
753 goto modifier;
754
676 fprintf(stderr, "invalid or unsupported event: '%s'\n", *str); 755 fprintf(stderr, "invalid or unsupported event: '%s'\n", *str);
677 fprintf(stderr, "Run 'perf list' for a list of valid events\n"); 756 fprintf(stderr, "Run 'perf list' for a list of valid events\n");
678 return EVT_FAILED; 757 return EVT_FAILED;
@@ -859,6 +938,9 @@ void print_events(void)
859 "rNNN"); 938 "rNNN");
860 printf("\n"); 939 printf("\n");
861 940
941 printf(" %-42s [hardware breakpoint]\n", "mem:<addr>[:access]");
942 printf("\n");
943
862 print_tracepoint_events(); 944 print_tracepoint_events();
863 945
864 exit(129); 946 exit(129);
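Given the parser above, a spec such as mem:0x1000:w (the address is an arbitrary example) should come out as the following attribute; this is hand-derived from the code, not taken from perf output:

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

/* what parse_breakpoint_event() builds for "mem:0x1000:w" */
struct perf_event_attr attr = {
	.type		= PERF_TYPE_BREAKPOINT,
	.bp_addr	= 0x1000,		/* strtoull(..., 0): hex, octal or decimal */
	.bp_type	= HW_BREAKPOINT_W,	/* omit ":w" and you get HW_BREAKPOINT_R | HW_BREAKPOINT_W */
	.bp_len		= HW_BREAKPOINT_LEN_4,	/* hardwired for now, per the comment above */
};

On the command line this would be, e.g., perf record -e mem:0x1000:w -a.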
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 5cc96c86861b..44d81d5ae8cf 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -9,8 +9,13 @@
9#include <libelf.h> 9#include <libelf.h>
10#include <gelf.h> 10#include <gelf.h>
11#include <elf.h> 11#include <elf.h>
12#include <limits.h>
12#include <sys/utsname.h> 13#include <sys/utsname.h>
13 14
15#ifndef NT_GNU_BUILD_ID
16#define NT_GNU_BUILD_ID 3
17#endif
18
14enum dso_origin { 19enum dso_origin {
15 DSO__ORIG_KERNEL = 0, 20 DSO__ORIG_KERNEL = 0,
16 DSO__ORIG_JAVA_JIT, 21 DSO__ORIG_JAVA_JIT,
@@ -26,7 +31,11 @@ static void dsos__add(struct dso *dso);
26static struct dso *dsos__find(const char *name); 31static struct dso *dsos__find(const char *name);
27static struct map *map__new2(u64 start, struct dso *dso); 32static struct map *map__new2(u64 start, struct dso *dso);
28static void kernel_maps__insert(struct map *map); 33static void kernel_maps__insert(struct map *map);
34static int dso__load_kernel_sym(struct dso *self, struct map *map,
35 symbol_filter_t filter);
29unsigned int symbol__priv_size; 36unsigned int symbol__priv_size;
37static int vmlinux_path__nr_entries;
38static char **vmlinux_path;
30 39
31static struct rb_root kernel_maps; 40static struct rb_root kernel_maps;
32 41
@@ -69,11 +78,11 @@ static void kernel_maps__fixup_end(void)
69 prev->end = curr->start - 1; 78 prev->end = curr->start - 1;
70 } 79 }
71 80
72 nd = rb_last(&curr->dso->syms); 81 /*
 73 if (nd) { 82 * We don't have the actual symbols yet, so guess
 74 struct symbol *sym = rb_entry(nd, struct symbol, rb_node); 83 * the last map's final address.
75 curr->end = sym->end; 84 */
76 } 85 curr->end = ~0UL;
77} 86}
78 87
79static struct symbol *symbol__new(u64 start, u64 len, const char *name) 88static struct symbol *symbol__new(u64 start, u64 len, const char *name)
@@ -111,6 +120,8 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
111 120
112static void dso__set_long_name(struct dso *self, char *name) 121static void dso__set_long_name(struct dso *self, char *name)
113{ 122{
123 if (name == NULL)
124 return;
114 self->long_name = name; 125 self->long_name = name;
115 self->long_name_len = strlen(name); 126 self->long_name_len = strlen(name);
116} 127}
@@ -323,7 +334,7 @@ out_failure:
323 * kernel range is broken in several maps, named [kernel].N, as we don't have 334 * kernel range is broken in several maps, named [kernel].N, as we don't have
324 * the original ELF section names vmlinux have. 335 * the original ELF section names vmlinux have.
325 */ 336 */
326static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules) 337static int kernel_maps__split_kallsyms(symbol_filter_t filter)
327{ 338{
328 struct map *map = kernel_map; 339 struct map *map = kernel_map;
329 struct symbol *pos; 340 struct symbol *pos;
@@ -339,9 +350,6 @@ static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules)
339 350
340 module = strchr(pos->name, '\t'); 351 module = strchr(pos->name, '\t');
341 if (module) { 352 if (module) {
342 if (!use_modules)
343 goto delete_symbol;
344
345 *module++ = '\0'; 353 *module++ = '\0';
346 354
347 if (strcmp(map->dso->name, module)) { 355 if (strcmp(map->dso->name, module)) {
@@ -381,7 +389,6 @@ static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules)
381 } 389 }
382 390
383 if (filter && filter(map, pos)) { 391 if (filter && filter(map, pos)) {
384delete_symbol:
385 rb_erase(&pos->rb_node, &kernel_map->dso->syms); 392 rb_erase(&pos->rb_node, &kernel_map->dso->syms);
386 symbol__delete(pos); 393 symbol__delete(pos);
387 } else { 394 } else {
@@ -397,17 +404,18 @@ delete_symbol:
397} 404}
398 405
399 406
400static int kernel_maps__load_kallsyms(symbol_filter_t filter, int use_modules) 407static int kernel_maps__load_kallsyms(symbol_filter_t filter)
401{ 408{
402 if (kernel_maps__load_all_kallsyms()) 409 if (kernel_maps__load_all_kallsyms())
403 return -1; 410 return -1;
404 411
405 dso__fixup_sym_end(kernel_map->dso); 412 dso__fixup_sym_end(kernel_map->dso);
413 kernel_map->dso->origin = DSO__ORIG_KERNEL;
406 414
407 return kernel_maps__split_kallsyms(filter, use_modules); 415 return kernel_maps__split_kallsyms(filter);
408} 416}
409 417
410static size_t kernel_maps__fprintf(FILE *fp) 418size_t kernel_maps__fprintf(FILE *fp)
411{ 419{
412 size_t printed = fprintf(fp, "Kernel maps:\n"); 420 size_t printed = fprintf(fp, "Kernel maps:\n");
413 struct rb_node *nd; 421 struct rb_node *nd;
@@ -883,47 +891,40 @@ out_close:
883 return err; 891 return err;
884} 892}
885 893
886bool fetch_build_id_table(struct list_head *head) 894static bool dso__build_id_equal(const struct dso *self, u8 *build_id)
887{ 895{
888 bool have_buildid = false; 896 return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0;
889 struct dso *pos; 897}
890
891 list_for_each_entry(pos, &dsos, node) {
892 struct build_id_list *new;
893 struct build_id_event b;
894 size_t len;
895
896 if (filename__read_build_id(pos->long_name,
897 &b.build_id,
898 sizeof(b.build_id)) < 0)
899 continue;
900 have_buildid = true;
901 memset(&b.header, 0, sizeof(b.header));
902 len = pos->long_name_len + 1;
903 len = ALIGN(len, 64);
904 b.header.size = sizeof(b) + len;
905
906 new = malloc(sizeof(*new));
907 if (!new)
908 die("No memory\n");
909 898
910 memcpy(&new->event, &b, sizeof(b)); 899bool dsos__read_build_ids(void)
911 new->dso_name = pos->long_name; 900{
912 new->len = len; 901 bool have_build_id = false;
902 struct dso *pos;
913 903
914 list_add_tail(&new->list, head); 904 list_for_each_entry(pos, &dsos, node)
915 } 905 if (filename__read_build_id(pos->long_name, pos->build_id,
906 sizeof(pos->build_id)) > 0) {
907 have_build_id = true;
908 pos->has_build_id = true;
909 }
916 910
917 return have_buildid; 911 return have_build_id;
918} 912}
919 913
914/*
915 * Align offset to 4 bytes as needed for note name and descriptor data.
916 */
917#define NOTE_ALIGN(n) (((n) + 3) & -4U)
918
920int filename__read_build_id(const char *filename, void *bf, size_t size) 919int filename__read_build_id(const char *filename, void *bf, size_t size)
921{ 920{
922 int fd, err = -1; 921 int fd, err = -1;
923 GElf_Ehdr ehdr; 922 GElf_Ehdr ehdr;
924 GElf_Shdr shdr; 923 GElf_Shdr shdr;
925 Elf_Data *build_id_data; 924 Elf_Data *data;
926 Elf_Scn *sec; 925 Elf_Scn *sec;
926 Elf_Kind ek;
927 void *ptr;
927 Elf *elf; 928 Elf *elf;
928 929
929 if (size < BUILD_ID_SIZE) 930 if (size < BUILD_ID_SIZE)
@@ -939,6 +940,10 @@ int filename__read_build_id(const char *filename, void *bf, size_t size)
939 goto out_close; 940 goto out_close;
940 } 941 }
941 942
943 ek = elf_kind(elf);
944 if (ek != ELF_K_ELF)
945 goto out_elf_end;
946
942 if (gelf_getehdr(elf, &ehdr) == NULL) { 947 if (gelf_getehdr(elf, &ehdr) == NULL) {
943 pr_err("%s: cannot get elf header.\n", __func__); 948 pr_err("%s: cannot get elf header.\n", __func__);
944 goto out_elf_end; 949 goto out_elf_end;
@@ -946,14 +951,37 @@ int filename__read_build_id(const char *filename, void *bf, size_t size)
946 951
947 sec = elf_section_by_name(elf, &ehdr, &shdr, 952 sec = elf_section_by_name(elf, &ehdr, &shdr,
948 ".note.gnu.build-id", NULL); 953 ".note.gnu.build-id", NULL);
949 if (sec == NULL) 954 if (sec == NULL) {
950 goto out_elf_end; 955 sec = elf_section_by_name(elf, &ehdr, &shdr,
956 ".notes", NULL);
957 if (sec == NULL)
958 goto out_elf_end;
959 }
951 960
952 build_id_data = elf_getdata(sec, NULL); 961 data = elf_getdata(sec, NULL);
953 if (build_id_data == NULL) 962 if (data == NULL)
954 goto out_elf_end; 963 goto out_elf_end;
955 memcpy(bf, build_id_data->d_buf + 16, BUILD_ID_SIZE); 964
956 err = BUILD_ID_SIZE; 965 ptr = data->d_buf;
966 while (ptr < (data->d_buf + data->d_size)) {
967 GElf_Nhdr *nhdr = ptr;
968 int namesz = NOTE_ALIGN(nhdr->n_namesz),
969 descsz = NOTE_ALIGN(nhdr->n_descsz);
970 const char *name;
971
972 ptr += sizeof(*nhdr);
973 name = ptr;
974 ptr += namesz;
975 if (nhdr->n_type == NT_GNU_BUILD_ID &&
976 nhdr->n_namesz == sizeof("GNU")) {
977 if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
978 memcpy(bf, ptr, BUILD_ID_SIZE);
979 err = BUILD_ID_SIZE;
980 break;
981 }
982 }
983 ptr += descsz;
984 }
957out_elf_end: 985out_elf_end:
958 elf_end(elf); 986 elf_end(elf);
959out_close: 987out_close:
@@ -962,23 +990,48 @@ out:
962 return err; 990 return err;
963} 991}
964 992
965static char *dso__read_build_id(struct dso *self) 993int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
966{ 994{
967 int len; 995 int fd, err = -1;
968 char *build_id = NULL;
969 unsigned char rawbf[BUILD_ID_SIZE];
970 996
971 len = filename__read_build_id(self->long_name, rawbf, sizeof(rawbf)); 997 if (size < BUILD_ID_SIZE)
972 if (len < 0)
973 goto out; 998 goto out;
974 999
975 build_id = malloc(len * 2 + 1); 1000 fd = open(filename, O_RDONLY);
976 if (build_id == NULL) 1001 if (fd < 0)
977 goto out; 1002 goto out;
978 1003
979 build_id__sprintf(rawbf, len, build_id); 1004 while (1) {
1005 char bf[BUFSIZ];
1006 GElf_Nhdr nhdr;
1007 int namesz, descsz;
1008
1009 if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr))
1010 break;
1011
1012 namesz = NOTE_ALIGN(nhdr.n_namesz);
1013 descsz = NOTE_ALIGN(nhdr.n_descsz);
1014 if (nhdr.n_type == NT_GNU_BUILD_ID &&
1015 nhdr.n_namesz == sizeof("GNU")) {
1016 if (read(fd, bf, namesz) != namesz)
1017 break;
1018 if (memcmp(bf, "GNU", sizeof("GNU")) == 0) {
1019 if (read(fd, build_id,
1020 BUILD_ID_SIZE) == BUILD_ID_SIZE) {
1021 err = 0;
1022 break;
1023 }
1024 } else if (read(fd, bf, descsz) != descsz)
1025 break;
1026 } else {
1027 int n = namesz + descsz;
1028 if (read(fd, bf, n) != n)
1029 break;
1030 }
1031 }
1032 close(fd);
980out: 1033out:
981 return build_id; 1034 return err;
982} 1035}
983 1036
984char dso__symtab_origin(const struct dso *self) 1037char dso__symtab_origin(const struct dso *self)
@@ -1001,12 +1054,17 @@ char dso__symtab_origin(const struct dso *self)
1001int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) 1054int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
1002{ 1055{
1003 int size = PATH_MAX; 1056 int size = PATH_MAX;
1004 char *name = malloc(size), *build_id = NULL; 1057 char *name;
1058 u8 build_id[BUILD_ID_SIZE];
1005 int ret = -1; 1059 int ret = -1;
1006 int fd; 1060 int fd;
1007 1061
1008 self->loaded = 1; 1062 self->loaded = 1;
1009 1063
1064 if (self->kernel)
1065 return dso__load_kernel_sym(self, map, filter);
1066
1067 name = malloc(size);
1010 if (!name) 1068 if (!name)
1011 return -1; 1069 return -1;
1012 1070
@@ -1023,8 +1081,6 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
1023 1081
1024more: 1082more:
1025 do { 1083 do {
1026 int berr = 0;
1027
1028 self->origin++; 1084 self->origin++;
1029 switch (self->origin) { 1085 switch (self->origin) {
1030 case DSO__ORIG_FEDORA: 1086 case DSO__ORIG_FEDORA:
@@ -1036,12 +1092,18 @@ more:
1036 self->long_name); 1092 self->long_name);
1037 break; 1093 break;
1038 case DSO__ORIG_BUILDID: 1094 case DSO__ORIG_BUILDID:
1039 build_id = dso__read_build_id(self); 1095 if (filename__read_build_id(self->long_name, build_id,
1040 if (build_id != NULL) { 1096 sizeof(build_id))) {
1097 char build_id_hex[BUILD_ID_SIZE * 2 + 1];
1098
1099 build_id__sprintf(build_id, sizeof(build_id),
1100 build_id_hex);
1041 snprintf(name, size, 1101 snprintf(name, size,
1042 "/usr/lib/debug/.build-id/%.2s/%s.debug", 1102 "/usr/lib/debug/.build-id/%.2s/%s.debug",
1043 build_id, build_id + 2); 1103 build_id_hex, build_id_hex + 2);
1044 goto compare_build_id; 1104 if (self->has_build_id)
1105 goto compare_build_id;
1106 break;
1045 } 1107 }
1046 self->origin++; 1108 self->origin++;
1047 /* Fall thru */ 1109 /* Fall thru */
@@ -1054,18 +1116,11 @@ more:
1054 } 1116 }
1055 1117
1056 if (self->has_build_id) { 1118 if (self->has_build_id) {
1057 bool match; 1119 if (filename__read_build_id(name, build_id,
1058 build_id = malloc(BUILD_ID_SIZE); 1120 sizeof(build_id)) < 0)
1059 if (build_id == NULL)
1060 goto more; 1121 goto more;
1061 berr = filename__read_build_id(name, build_id,
1062 BUILD_ID_SIZE);
1063compare_build_id: 1122compare_build_id:
1064 match = berr > 0 && memcmp(build_id, self->build_id, 1123 if (!dso__build_id_equal(self, build_id))
1065 sizeof(self->build_id)) == 0;
1066 free(build_id);
1067 build_id = NULL;
1068 if (!match)
1069 goto more; 1124 goto more;
1070 } 1125 }
1071 1126
@@ -1100,7 +1155,8 @@ static void kernel_maps__insert(struct map *map)
1100 maps__insert(&kernel_maps, map); 1155 maps__insert(&kernel_maps, map);
1101} 1156}
1102 1157
1103struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp) 1158struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp,
1159 symbol_filter_t filter)
1104{ 1160{
1105 struct map *map = maps__find(&kernel_maps, ip); 1161 struct map *map = maps__find(&kernel_maps, ip);
1106 1162
@@ -1109,7 +1165,7 @@ struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp)
1109 1165
1110 if (map) { 1166 if (map) {
1111 ip = map->map_ip(map, ip); 1167 ip = map->map_ip(map, ip);
1112 return map->dso->find_symbol(map->dso, ip); 1168 return map__find_symbol(map, ip, filter);
1113 } 1169 }
1114 1170
1115 return NULL; 1171 return NULL;
@@ -1129,32 +1185,13 @@ struct map *kernel_maps__find_by_dso_name(const char *name)
1129 return NULL; 1185 return NULL;
1130} 1186}
1131 1187
1132static int dso__load_module_sym(struct dso *self, struct map *map, 1188static int dsos__set_modules_path_dir(char *dirname)
1133 symbol_filter_t filter)
1134{
1135 int err = 0, fd = open(self->long_name, O_RDONLY);
1136
1137 self->loaded = 1;
1138
1139 if (fd < 0) {
1140 pr_err("%s: cannot open %s\n", __func__, self->long_name);
1141 return err;
1142 }
1143
1144 err = dso__load_sym(self, map, self->long_name, fd, filter, 0, 1);
1145 close(fd);
1146
1147 return err;
1148}
1149
1150static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter)
1151{ 1189{
1152 struct dirent *dent; 1190 struct dirent *dent;
1153 int nr_symbols = 0, err;
1154 DIR *dir = opendir(dirname); 1191 DIR *dir = opendir(dirname);
1155 1192
1156 if (!dir) { 1193 if (!dir) {
1157 pr_err("%s: cannot open %s dir\n", __func__, dirname); 1194 pr_debug("%s: cannot open %s dir\n", __func__, dirname);
1158 return -1; 1195 return -1;
1159 } 1196 }
1160 1197
@@ -1168,14 +1205,12 @@ static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter)
1168 1205
1169 snprintf(path, sizeof(path), "%s/%s", 1206 snprintf(path, sizeof(path), "%s/%s",
1170 dirname, dent->d_name); 1207 dirname, dent->d_name);
1171 err = dsos__load_modules_sym_dir(path, filter); 1208 if (dsos__set_modules_path_dir(path) < 0)
1172 if (err < 0)
1173 goto failure; 1209 goto failure;
1174 } else { 1210 } else {
1175 char *dot = strrchr(dent->d_name, '.'), 1211 char *dot = strrchr(dent->d_name, '.'),
1176 dso_name[PATH_MAX]; 1212 dso_name[PATH_MAX];
1177 struct map *map; 1213 struct map *map;
1178 struct rb_node *last;
1179 char *long_name; 1214 char *long_name;
1180 1215
1181 if (dot == NULL || strcmp(dot, ".ko")) 1216 if (dot == NULL || strcmp(dot, ".ko"))
@@ -1195,36 +1230,16 @@ static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter)
1195 if (long_name == NULL) 1230 if (long_name == NULL)
1196 goto failure; 1231 goto failure;
1197 dso__set_long_name(map->dso, long_name); 1232 dso__set_long_name(map->dso, long_name);
1198 dso__set_basename(map->dso);
1199
1200 err = dso__load_module_sym(map->dso, map, filter);
1201 if (err < 0)
1202 goto failure;
1203 last = rb_last(&map->dso->syms);
1204 if (last) {
1205 struct symbol *sym;
1206 /*
1207 * We do this here as well, even having the
1208 * symbol size found in the symtab because
1209 * misannotated ASM symbols may have the size
1210 * set to zero.
1211 */
1212 dso__fixup_sym_end(map->dso);
1213
1214 sym = rb_entry(last, struct symbol, rb_node);
1215 map->end = map->start + sym->end;
1216 }
1217 } 1233 }
1218 nr_symbols += err;
1219 } 1234 }
1220 1235
1221 return nr_symbols; 1236 return 0;
1222failure: 1237failure:
1223 closedir(dir); 1238 closedir(dir);
1224 return -1; 1239 return -1;
1225} 1240}
1226 1241
1227static int dsos__load_modules_sym(symbol_filter_t filter) 1242static int dsos__set_modules_path(void)
1228{ 1243{
1229 struct utsname uts; 1244 struct utsname uts;
1230 char modules_path[PATH_MAX]; 1245 char modules_path[PATH_MAX];
@@ -1235,7 +1250,7 @@ static int dsos__load_modules_sym(symbol_filter_t filter)
1235 snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel", 1250 snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel",
1236 uts.release); 1251 uts.release);
1237 1252
1238 return dsos__load_modules_sym_dir(modules_path, filter); 1253 return dsos__set_modules_path_dir(modules_path);
1239} 1254}
1240 1255
1241/* 1256/*
@@ -1257,7 +1272,7 @@ static struct map *map__new2(u64 start, struct dso *dso)
1257 return self; 1272 return self;
1258} 1273}
1259 1274
1260static int dsos__load_modules(void) 1275static int kernel_maps__create_module_maps(void)
1261{ 1276{
1262 char *line = NULL; 1277 char *line = NULL;
1263 size_t n; 1278 size_t n;
@@ -1307,6 +1322,12 @@ static int dsos__load_modules(void)
1307 goto out_delete_line; 1322 goto out_delete_line;
1308 } 1323 }
1309 1324
1325 snprintf(name, sizeof(name),
1326 "/sys/module/%s/notes/.note.gnu.build-id", line);
1327 if (sysfs__read_build_id(name, dso->build_id,
1328 sizeof(dso->build_id)) == 0)
1329 dso->has_build_id = true;
1330
1310 dso->origin = DSO__ORIG_KMODULE; 1331 dso->origin = DSO__ORIG_KMODULE;
1311 kernel_maps__insert(map); 1332 kernel_maps__insert(map);
1312 dsos__add(dso); 1333 dsos__add(dso);
@@ -1315,7 +1336,7 @@ static int dsos__load_modules(void)
1315 free(line); 1336 free(line);
1316 fclose(file); 1337 fclose(file);
1317 1338
1318 return 0; 1339 return dsos__set_modules_path();
1319 1340
1320out_delete_line: 1341out_delete_line:
1321 free(line); 1342 free(line);
@@ -1326,13 +1347,37 @@ out_failure:
1326static int dso__load_vmlinux(struct dso *self, struct map *map, 1347static int dso__load_vmlinux(struct dso *self, struct map *map,
1327 const char *vmlinux, symbol_filter_t filter) 1348 const char *vmlinux, symbol_filter_t filter)
1328{ 1349{
1329 int err, fd = open(vmlinux, O_RDONLY); 1350 int err = -1, fd;
1330 1351
1331 self->loaded = 1; 1352 if (self->has_build_id) {
1353 u8 build_id[BUILD_ID_SIZE];
1354
1355 if (filename__read_build_id(vmlinux, build_id,
1356 sizeof(build_id)) < 0) {
1357 pr_debug("No build_id in %s, ignoring it\n", vmlinux);
1358 return -1;
1359 }
1360 if (!dso__build_id_equal(self, build_id)) {
1361 char expected_build_id[BUILD_ID_SIZE * 2 + 1],
1362 vmlinux_build_id[BUILD_ID_SIZE * 2 + 1];
1363
1364 build_id__sprintf(self->build_id,
1365 sizeof(self->build_id),
1366 expected_build_id);
1367 build_id__sprintf(build_id, sizeof(build_id),
1368 vmlinux_build_id);
1369 pr_debug("build_id in %s is %s while expected is %s, "
1370 "ignoring it\n", vmlinux, vmlinux_build_id,
1371 expected_build_id);
1372 return -1;
1373 }
1374 }
1332 1375
1376 fd = open(vmlinux, O_RDONLY);
1333 if (fd < 0) 1377 if (fd < 0)
1334 return -1; 1378 return -1;
1335 1379
1380 self->loaded = 1;
1336 err = dso__load_sym(self, map, self->long_name, fd, filter, 1, 0); 1381 err = dso__load_sym(self, map, self->long_name, fd, filter, 1, 0);
1337 1382
1338 close(fd); 1383 close(fd);
@@ -1340,78 +1385,55 @@ static int dso__load_vmlinux(struct dso *self, struct map *map,
1340 return err; 1385 return err;
1341} 1386}
1342 1387
1343int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter, 1388static int dso__load_kernel_sym(struct dso *self, struct map *map,
1344 int use_modules) 1389 symbol_filter_t filter)
1345{ 1390{
1346 int err = -1; 1391 int err;
1347 struct dso *dso = dso__new(vmlinux); 1392 bool is_kallsyms;
1348 1393
1349 if (dso == NULL) 1394 if (vmlinux_path != NULL) {
1350 return -1; 1395 int i;
1351 1396 pr_debug("Looking at the vmlinux_path (%d entries long)\n",
1352 dso->short_name = "[kernel]"; 1397 vmlinux_path__nr_entries);
1353 kernel_map = map__new2(0, dso); 1398 for (i = 0; i < vmlinux_path__nr_entries; ++i) {
1354 if (kernel_map == NULL) 1399 err = dso__load_vmlinux(self, map, vmlinux_path[i],
1355 goto out_delete_dso; 1400 filter);
1356 1401 if (err > 0) {
1357 kernel_map->map_ip = kernel_map->unmap_ip = identity__map_ip; 1402 pr_debug("Using %s for symbols\n",
1358 1403 vmlinux_path[i]);
1359 if (use_modules && dsos__load_modules() < 0) { 1404 dso__set_long_name(self,
1360 pr_warning("Failed to load list of modules in use! " 1405 strdup(vmlinux_path[i]));
1361 "Continuing...\n"); 1406 goto out_fixup;
1362 use_modules = 0; 1407 }
1363 }
1364
1365 if (vmlinux) {
1366 err = dso__load_vmlinux(dso, kernel_map, vmlinux, filter);
1367 if (err > 0 && use_modules) {
1368 int syms = dsos__load_modules_sym(filter);
1369
1370 if (syms < 0)
1371 pr_warning("Failed to read module symbols!"
1372 " Continuing...\n");
1373 else
1374 err += syms;
1375 } 1408 }
1376 } 1409 }
1377 1410
1378 if (err <= 0) 1411 is_kallsyms = self->long_name[0] == '[';
1379 err = kernel_maps__load_kallsyms(filter, use_modules); 1412 if (is_kallsyms)
1413 goto do_kallsyms;
1414
1415 err = dso__load_vmlinux(self, map, self->long_name, filter);
1416 if (err <= 0) {
1417 pr_info("The file %s cannot be used, "
1418 "trying to use /proc/kallsyms...", self->long_name);
1419 sleep(2);
1420do_kallsyms:
1421 err = kernel_maps__load_kallsyms(filter);
1422 if (err > 0 && !is_kallsyms)
1423 dso__set_long_name(self, strdup("[kernel.kallsyms]"));
1424 }
1380 1425
1381 if (err > 0) { 1426 if (err > 0) {
1382 struct rb_node *node = rb_first(&dso->syms); 1427out_fixup:
1383 struct symbol *sym = rb_entry(node, struct symbol, rb_node); 1428 map__fixup_start(map);
1384 1429 map__fixup_end(map);
1385 kernel_map->start = sym->start;
1386 node = rb_last(&dso->syms);
1387 sym = rb_entry(node, struct symbol, rb_node);
1388 kernel_map->end = sym->end;
1389
1390 dso->origin = DSO__ORIG_KERNEL;
1391 kernel_maps__insert(kernel_map);
1392 /*
1393 * Now that we have all sorted out, just set the ->end of all
1394 * maps:
1395 */
1396 kernel_maps__fixup_end();
1397 dsos__add(dso);
1398
1399 if (verbose)
1400 kernel_maps__fprintf(stderr);
1401 } 1430 }
1402 1431
1403 return err; 1432 return err;
1404
1405out_delete_dso:
1406 dso__delete(dso);
1407 return -1;
1408} 1433}
1409 1434
1410LIST_HEAD(dsos); 1435LIST_HEAD(dsos);
1411struct dso *vdso; 1436struct dso *vdso;
1412
1413const char *vmlinux_name = "vmlinux";
1414int modules;
1415 1437
1416static void dsos__add(struct dso *dso) 1438static void dsos__add(struct dso *dso)
1417{ 1439{
@@ -1463,18 +1485,117 @@ size_t dsos__fprintf_buildid(FILE *fp)
1463 return ret; 1485 return ret;
1464} 1486}
1465 1487
1466int load_kernel(symbol_filter_t filter) 1488static int kernel_maps__create_kernel_map(const char *vmlinux_name)
1467{ 1489{
1468 if (dsos__load_kernel(vmlinux_name, filter, modules) <= 0) 1490 struct dso *kernel = dso__new(vmlinux_name ?: "[kernel.kallsyms]");
1491
1492 if (kernel == NULL)
1469 return -1; 1493 return -1;
1470 1494
1495 kernel_map = map__new2(0, kernel);
1496 if (kernel_map == NULL)
1497 goto out_delete_kernel_dso;
1498
1499 kernel_map->map_ip = kernel_map->unmap_ip = identity__map_ip;
1500 kernel->short_name = "[kernel]";
1501 kernel->kernel = 1;
1502
1471 vdso = dso__new("[vdso]"); 1503 vdso = dso__new("[vdso]");
1472 if (!vdso) 1504 if (vdso == NULL)
1473 return -1; 1505 goto out_delete_kernel_map;
1506
1507 if (sysfs__read_build_id("/sys/kernel/notes", kernel->build_id,
1508 sizeof(kernel->build_id)) == 0)
1509 kernel->has_build_id = true;
1474 1510
1511 kernel_maps__insert(kernel_map);
1512 dsos__add(kernel);
1475 dsos__add(vdso); 1513 dsos__add(vdso);
1476 1514
1477 return 0; 1515 return 0;
1516
1517out_delete_kernel_map:
1518 map__delete(kernel_map);
1519 kernel_map = NULL;
1520out_delete_kernel_dso:
1521 dso__delete(kernel);
1522 return -1;
1523}
1524
1525static void vmlinux_path__exit(void)
1526{
1527 while (--vmlinux_path__nr_entries >= 0) {
1528 free(vmlinux_path[vmlinux_path__nr_entries]);
1529 vmlinux_path[vmlinux_path__nr_entries] = NULL;
1530 }
1531
1532 free(vmlinux_path);
1533 vmlinux_path = NULL;
1534}
1535
1536static int vmlinux_path__init(void)
1537{
1538 struct utsname uts;
1539 char bf[PATH_MAX];
1540
1541 if (uname(&uts) < 0)
1542 return -1;
1543
1544 vmlinux_path = malloc(sizeof(char *) * 5);
1545 if (vmlinux_path == NULL)
1546 return -1;
1547
1548 vmlinux_path[vmlinux_path__nr_entries] = strdup("vmlinux");
1549 if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
1550 goto out_fail;
1551 ++vmlinux_path__nr_entries;
1552 vmlinux_path[vmlinux_path__nr_entries] = strdup("/boot/vmlinux");
1553 if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
1554 goto out_fail;
1555 ++vmlinux_path__nr_entries;
1556 snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
1557 vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
1558 if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
1559 goto out_fail;
1560 ++vmlinux_path__nr_entries;
1561 snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release);
1562 vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
1563 if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
1564 goto out_fail;
1565 ++vmlinux_path__nr_entries;
1566 snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux",
1567 uts.release);
1568 vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
1569 if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
1570 goto out_fail;
1571 ++vmlinux_path__nr_entries;
1572
1573 return 0;
1574
1575out_fail:
1576 vmlinux_path__exit();
1577 return -1;
1578}
1579
1580int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path,
1581 bool use_modules)
1582{
1583 if (try_vmlinux_path && vmlinux_path__init() < 0)
1584 return -1;
1585
1586 if (kernel_maps__create_kernel_map(vmlinux_name) < 0) {
1587 vmlinux_path__exit();
1588 return -1;
1589 }
1590
1591 if (use_modules && kernel_maps__create_module_maps() < 0)
1592 pr_debug("Failed to load list of modules in use, "
1593 "continuing...\n");
1594 /*
 1595 * Now that we have all the maps created, just set their ->end:
1596 */
1597 kernel_maps__fixup_end();
1598 return 0;
1478} 1599}
1479 1600
1480void symbol__init(unsigned int priv_size) 1601void symbol__init(unsigned int priv_size)
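Both new build-id readers lean on the SysV note layout: a GElf_Nhdr followed by a name padded to 4 bytes, then a descriptor padded to 4 bytes, where a GNU build-id note has name "GNU\0" (n_namesz == 4), n_type == NT_GNU_BUILD_ID and a 20-byte SHA-1 descriptor. The alignment macro can be sanity-checked standalone (this snippet just exercises the macro; it is not part of the patch):

#include <stdio.h>

#define NOTE_ALIGN(n) (((n) + 3) & -4U)	/* same definition as above */

int main(void)
{
	printf("%u %u %u\n",
	       NOTE_ALIGN(4u),	/* "GNU\0" name: stays 4 */
	       NOTE_ALIGN(20u),	/* SHA-1 descriptor: stays 20 */
	       NOTE_ALIGN(5u));	/* odd size rounds up to 8 */
	return 0;
}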
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 5ad1019607dd..8c4d026e067a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -64,6 +64,7 @@ struct dso {
64 u8 slen_calculated:1; 64 u8 slen_calculated:1;
65 u8 loaded:1; 65 u8 loaded:1;
66 u8 has_build_id:1; 66 u8 has_build_id:1;
67 u8 kernel:1;
67 unsigned char origin; 68 unsigned char origin;
68 u8 build_id[BUILD_ID_SIZE]; 69 u8 build_id[BUILD_ID_SIZE];
69 u16 long_name_len; 70 u16 long_name_len;
@@ -77,7 +78,6 @@ void dso__delete(struct dso *self);
77 78
78struct symbol *dso__find_symbol(struct dso *self, u64 ip); 79struct symbol *dso__find_symbol(struct dso *self, u64 ip);
79 80
80int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter, int modules);
81struct dso *dsos__findnew(const char *name); 81struct dso *dsos__findnew(const char *name);
82int dso__load(struct dso *self, struct map *map, symbol_filter_t filter); 82int dso__load(struct dso *self, struct map *map, symbol_filter_t filter);
83void dsos__fprintf(FILE *fp); 83void dsos__fprintf(FILE *fp);
@@ -89,16 +89,17 @@ char dso__symtab_origin(const struct dso *self);
89void dso__set_build_id(struct dso *self, void *build_id); 89void dso__set_build_id(struct dso *self, void *build_id);
90 90
91int filename__read_build_id(const char *filename, void *bf, size_t size); 91int filename__read_build_id(const char *filename, void *bf, size_t size);
92bool fetch_build_id_table(struct list_head *head); 92int sysfs__read_build_id(const char *filename, void *bf, size_t size);
93bool dsos__read_build_ids(void);
93int build_id__sprintf(u8 *self, int len, char *bf); 94int build_id__sprintf(u8 *self, int len, char *bf);
94 95
95int load_kernel(symbol_filter_t filter); 96int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path,
97 bool use_modules);
98size_t kernel_maps__fprintf(FILE *fp);
96 99
97void symbol__init(unsigned int priv_size); 100void symbol__init(unsigned int priv_size);
98 101
99extern struct list_head dsos; 102extern struct list_head dsos;
100extern struct map *kernel_map; 103extern struct map *kernel_map;
101extern struct dso *vdso; 104extern struct dso *vdso;
102extern const char *vmlinux_name;
103extern int modules;
104#endif /* __PERF_SYMBOL */ 105#endif /* __PERF_SYMBOL */
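The replacement for load_kernel() takes its policy as arguments rather than from the globals removed here. Two call styles appear in, or follow from, this patch (vmlinux_name below stands for a user-supplied -k/--vmlinux value and is illustrative):

	/* record-time build-id collection (see the header.c hunk):
	 * no vmlinux, no path search, but do enumerate modules */
	kernel_maps__init(NULL, false, true);

	/* a report-style tool would instead search the standard paths */
	if (kernel_maps__init(vmlinux_name, true, true) < 0)
		return -1;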
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 53addd77ce8f..e4b8d437725a 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -26,7 +26,8 @@ size_t threads__fprintf(FILE *fp);
26void maps__insert(struct rb_root *maps, struct map *map); 26void maps__insert(struct rb_root *maps, struct map *map);
27struct map *maps__find(struct rb_root *maps, u64 ip); 27struct map *maps__find(struct rb_root *maps, u64 ip);
28 28
29struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp); 29struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp,
30 symbol_filter_t filter);
30struct map *kernel_maps__find_by_dso_name(const char *name); 31struct map *kernel_maps__find_by_dso_name(const char *name);
31 32
32static inline struct map *thread__find_map(struct thread *self, u64 ip) 33static inline struct map *thread__find_map(struct thread *self, u64 ip)
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 831052d4b4fb..cace35595530 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -33,11 +33,11 @@
33#include <ctype.h> 33#include <ctype.h>
34#include <errno.h> 34#include <errno.h>
35#include <stdbool.h> 35#include <stdbool.h>
36#include <linux/kernel.h>
36 37
37#include "../perf.h" 38#include "../perf.h"
38#include "trace-event.h" 39#include "trace-event.h"
39 40
40
41#define VERSION "0.5" 41#define VERSION "0.5"
42 42
43#define _STR(x) #x 43#define _STR(x) #x
@@ -483,23 +483,31 @@ static struct tracepoint_path *
483get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) 483get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
484{ 484{
485 struct tracepoint_path path, *ppath = &path; 485 struct tracepoint_path path, *ppath = &path;
486 int i; 486 int i, nr_tracepoints = 0;
487 487
488 for (i = 0; i < nb_events; i++) { 488 for (i = 0; i < nb_events; i++) {
489 if (pattrs[i].type != PERF_TYPE_TRACEPOINT) 489 if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
490 continue; 490 continue;
491 ++nr_tracepoints;
491 ppath->next = tracepoint_id_to_path(pattrs[i].config); 492 ppath->next = tracepoint_id_to_path(pattrs[i].config);
492 if (!ppath->next) 493 if (!ppath->next)
493 die("%s\n", "No memory to alloc tracepoints list"); 494 die("%s\n", "No memory to alloc tracepoints list");
494 ppath = ppath->next; 495 ppath = ppath->next;
495 } 496 }
496 497
497 return path.next; 498 return nr_tracepoints > 0 ? path.next : NULL;
498} 499}
499void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) 500
501int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
500{ 502{
501 char buf[BUFSIZ]; 503 char buf[BUFSIZ];
502 struct tracepoint_path *tps; 504 struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events);
505
506 /*
507 * What? No tracepoints? No sense writing anything here, bail out.
508 */
509 if (tps == NULL)
510 return -1;
503 511
504 output_fd = fd; 512 output_fd = fd;
505 513
@@ -528,11 +536,11 @@ void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
528 page_size = getpagesize(); 536 page_size = getpagesize();
529 write_or_die(&page_size, 4); 537 write_or_die(&page_size, 4);
530 538
531 tps = get_tracepoints_path(pattrs, nb_events);
532
533 read_header_files(); 539 read_header_files();
534 read_ftrace_files(tps); 540 read_ftrace_files(tps);
535 read_event_files(tps); 541 read_event_files(tps);
536 read_proc_kallsyms(); 542 read_proc_kallsyms();
537 read_ftrace_printk(); 543 read_ftrace_printk();
544
545 return 0;
538} 546}
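Since get_tracepoints_path() now reports an empty selection, read_tracing_data() can refuse to emit the tracing-data blob when no tracepoint events were asked for, and callers get to skip the section entirely. A hypothetical call site:

	/* only record tracing metadata if a tracepoint event is present */
	if (read_tracing_data(output_fd, attrs, nr_events) < 0)
		pr_debug("no tracepoints, skipping tracing data section\n");

(output_fd, attrs and nr_events are placeholders for the recorder's state.)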
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 44292e06cca4..342dfdd43f87 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -471,11 +471,11 @@ void trace_report(int fd)
471 471
472 read_or_die(buf, 3); 472 read_or_die(buf, 3);
473 if (memcmp(buf, test, 3) != 0) 473 if (memcmp(buf, test, 3) != 0)
474 die("not an trace data file"); 474 die("no trace data in the file");
475 475
476 read_or_die(buf, 7); 476 read_or_die(buf, 7);
477 if (memcmp(buf, "tracing", 7) != 0) 477 if (memcmp(buf, "tracing", 7) != 0)
478 die("not a trace file (missing tracing)"); 478 die("not a trace file (missing 'tracing' tag)");
479 479
480 version = read_string(); 480 version = read_string();
481 if (show_version) 481 if (show_version)
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index f6637c2fa1fe..dd51c6872a15 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -248,7 +248,7 @@ unsigned long long
248raw_field_value(struct event *event, const char *name, void *data); 248raw_field_value(struct event *event, const char *name, void *data);
249void *raw_field_ptr(struct event *event, const char *name, void *data); 249void *raw_field_ptr(struct event *event, const char *name, void *data);
250 250
251void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events); 251int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
252 252
253/* taken from kernel/trace/trace.h */ 253/* taken from kernel/trace/trace.h */
254enum trace_flag_type { 254enum trace_flag_type {
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index f2203a0946bc..e1c623e0c99e 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -84,6 +84,9 @@
84#include <iconv.h> 84#include <iconv.h>
85#endif 85#endif
86 86
87extern const char *graph_line;
88extern const char *graph_dotted_line;
89
87/* On most systems <limits.h> would have given us this, but 90/* On most systems <limits.h> would have given us this, but
88 * not on some systems (e.g. GNU/Hurd). 91 * not on some systems (e.g. GNU/Hurd).
89 */ 92 */