aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/trace/uprobetracer.txt113
-rw-r--r--arch/Kconfig17
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/uprobes.h57
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/uprobes.c674
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/uprobes.h165
-rw-r--r--kernel/events/Makefile3
-rw-r--r--kernel/events/uprobes.c1667
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/trace/Kconfig20
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/trace.h5
-rw-r--r--kernel/trace/trace_kprobe.c899
-rw-r--r--kernel/trace/trace_probe.c839
-rw-r--r--kernel/trace/trace_probe.h161
-rw-r--r--kernel/trace/trace_uprobe.c788
-rw-r--r--mm/memory.c3
-rw-r--r--mm/mmap.c33
24 files changed, 4601 insertions, 878 deletions
diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt
new file mode 100644
index 000000000000..24ce6823a09e
--- /dev/null
+++ b/Documentation/trace/uprobetracer.txt
@@ -0,0 +1,113 @@
1 Uprobe-tracer: Uprobe-based Event Tracing
2 =========================================
3 Documentation written by Srikar Dronamraju
4
5Overview
6--------
7Uprobe based trace events are similar to kprobe based trace events.
8To enable this feature, build your kernel with CONFIG_UPROBE_EVENT=y.
9
10Similar to the kprobe-event tracer, this doesn't need to be activated via
11current_tracer. Instead of that, add probe points via
12/sys/kernel/debug/tracing/uprobe_events, and enable it via
13/sys/kernel/debug/tracing/events/uprobes/<EVENT>/enabled.
14
15However, unlike the kprobe-event tracer, the uprobe event interface expects the
16user to calculate the offset of the probepoint in the object.
17
18Synopsis of uprobe_tracer
19-------------------------
20 p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] : Set a probe
21
22 GRP : Group name. If omitted, use "uprobes" for it.
23 EVENT : Event name. If omitted, the event name is generated
24 based on SYMBOL+offs.
25 PATH : path to an executable or a library.
26 SYMBOL[+offs] : Symbol+offset where the probe is inserted.
27
28 FETCHARGS : Arguments. Each probe can have up to 128 args.
29 %REG : Fetch register REG
30
31Event Profiling
32---------------
33 You can check the total number of probe hits and probe miss-hits via
34/sys/kernel/debug/tracing/uprobe_profile.
35 The first column is event name, the second is the number of probe hits,
36the third is the number of probe miss-hits.
37
38Usage examples
39--------------
40To add a probe as a new event, write a new definition to uprobe_events
41as below.
42
43 echo 'p: /bin/bash:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events
44
45 This sets a uprobe at an offset of 0x4245c0 in the executable /bin/bash
46
47 echo > /sys/kernel/debug/tracing/uprobe_events
48
49 This clears all probe points.
50
51The following example shows how to dump the instruction pointer and the %ax
52register at the probed text address. Here we are trying to probe
53function zfree in /bin/zsh
54
55 # cd /sys/kernel/debug/tracing/
56 # cat /proc/`pgrep zsh`/maps | grep /bin/zsh | grep r-xp
57 00400000-0048a000 r-xp 00000000 08:03 130904 /bin/zsh
58 # objdump -T /bin/zsh | grep -w zfree
59 0000000000446420 g DF .text 0000000000000012 Base zfree
60
610x46420 is the offset of zfree in object /bin/zsh that is loaded at
620x00400000. Hence the command to probe would be :
63
64 # echo 'p /bin/zsh:0x46420 %ip %ax' > uprobe_events
65
66Please note: User has to explicitly calculate the offset of the probepoint
67in the object. We can see the events that are registered by looking at the
68uprobe_events file.
69
70 # cat uprobe_events
71 p:uprobes/p_zsh_0x46420 /bin/zsh:0x00046420 arg1=%ip arg2=%ax
72
73The format of events can be seen by viewing the file events/uprobes/p_zsh_0x46420/format
74
75 # cat events/uprobes/p_zsh_0x46420/format
76 name: p_zsh_0x46420
77 ID: 922
78 format:
79 field:unsigned short common_type; offset:0; size:2; signed:0;
80 field:unsigned char common_flags; offset:2; size:1; signed:0;
81 field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
82 field:int common_pid; offset:4; size:4; signed:1;
83 field:int common_padding; offset:8; size:4; signed:1;
84
85 field:unsigned long __probe_ip; offset:12; size:4; signed:0;
86 field:u32 arg1; offset:16; size:4; signed:0;
87 field:u32 arg2; offset:20; size:4; signed:0;
88
89 print fmt: "(%lx) arg1=%lx arg2=%lx", REC->__probe_ip, REC->arg1, REC->arg2
90
91Right after definition, each event is disabled by default. For tracing these
92events, you need to enable them by:
93
94 # echo 1 > events/uprobes/enable
95
96Let's disable the event after sleeping for some time.
97 # sleep 20
98 # echo 0 > events/uprobes/enable
99
100And you can see the traced information via /sys/kernel/debug/tracing/trace.
101
102 # cat trace
103 # tracer: nop
104 #
105 # TASK-PID CPU# TIMESTAMP FUNCTION
106 # | | | | |
107 zsh-24842 [006] 258544.995456: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
108 zsh-24842 [007] 258545.000270: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
109 zsh-24842 [002] 258545.043929: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
110 zsh-24842 [004] 258547.046129: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
111
112Each line shows us probes were triggered for a pid 24842 with ip being
1130x446421 and contents of ax register being 79.
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..2880abf8a269 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -76,6 +76,23 @@ config OPTPROBES
76 depends on KPROBES && HAVE_OPTPROBES 76 depends on KPROBES && HAVE_OPTPROBES
77 depends on !PREEMPT 77 depends on !PREEMPT
78 78
79config UPROBES
80 bool "Transparent user-space probes (EXPERIMENTAL)"
81 depends on UPROBE_EVENT && PERF_EVENTS
82 default n
83 help
84 Uprobes is the user-space counterpart to kprobes: they
85 enable instrumentation applications (such as 'perf probe')
86 to establish unintrusive probes in user-space binaries and
87 libraries, by executing handler functions when the probes
88 are hit by user-space applications.
89
90 ( These probes come in the form of single-byte breakpoints,
91 managed by the kernel and kept transparent to the probed
92 application. )
93
94 If in doubt, say "N".
95
79config HAVE_EFFICIENT_UNALIGNED_ACCESS 96config HAVE_EFFICIENT_UNALIGNED_ACCESS
80 bool 97 bool
81 help 98 help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1324139612e1..8443c50fbbf6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -83,7 +83,7 @@ config X86
83 select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC 83 select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
84 84
85config INSTRUCTION_DECODER 85config INSTRUCTION_DECODER
86 def_bool (KPROBES || PERF_EVENTS) 86 def_bool (KPROBES || PERF_EVENTS || UPROBES)
87 87
88config OUTPUT_FORMAT 88config OUTPUT_FORMAT
89 string 89 string
@@ -242,6 +242,9 @@ config ARCH_CPU_PROBE_RELEASE
242 def_bool y 242 def_bool y
243 depends on HOTPLUG_CPU 243 depends on HOTPLUG_CPU
244 244
245config ARCH_SUPPORTS_UPROBES
246 def_bool y
247
245source "init/Kconfig" 248source "init/Kconfig"
246source "kernel/Kconfig.freezer" 249source "kernel/Kconfig.freezer"
247 250
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6df8ccd715..0710c11305d4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -85,6 +85,7 @@ struct thread_info {
85#define TIF_SECCOMP 8 /* secure computing */ 85#define TIF_SECCOMP 8 /* secure computing */
86#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 86#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
87#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ 87#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
88#define TIF_UPROBE 12 /* breakpointed or singlestepping */
88#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 89#define TIF_NOTSC 16 /* TSC is not accessible in userland */
89#define TIF_IA32 17 /* IA32 compatibility process */ 90#define TIF_IA32 17 /* IA32 compatibility process */
90#define TIF_FORK 18 /* ret_from_fork */ 91#define TIF_FORK 18 /* ret_from_fork */
@@ -109,6 +110,7 @@ struct thread_info {
109#define _TIF_SECCOMP (1 << TIF_SECCOMP) 110#define _TIF_SECCOMP (1 << TIF_SECCOMP)
110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 111#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) 112#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
113#define _TIF_UPROBE (1 << TIF_UPROBE)
112#define _TIF_NOTSC (1 << TIF_NOTSC) 114#define _TIF_NOTSC (1 << TIF_NOTSC)
113#define _TIF_IA32 (1 << TIF_IA32) 115#define _TIF_IA32 (1 << TIF_IA32)
114#define _TIF_FORK (1 << TIF_FORK) 116#define _TIF_FORK (1 << TIF_FORK)
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 000000000000..1e9bed14f7ae
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,57 @@
1#ifndef _ASM_UPROBES_H
2#define _ASM_UPROBES_H
3/*
4 * User-space Probes (UProbes) for x86
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2008-2011
21 * Authors:
22 * Srikar Dronamraju
23 * Jim Keniston
24 */
25
26#include <linux/notifier.h>
27
28typedef u8 uprobe_opcode_t;
29
30#define MAX_UINSN_BYTES 16
31#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */
32
33#define UPROBE_SWBP_INSN 0xcc
34#define UPROBE_SWBP_INSN_SIZE 1
35
/* x86-specific per-probepoint data, analysed by arch_uprobe_analyze_insn(). */
36struct arch_uprobe {
37 u16 fixups; /* UPROBE_FIX_* flags to apply after the out-of-line step */
38 u8 insn[MAX_UINSN_BYTES]; /* instruction bytes that are decoded (and rewritten in place if rip-relative) */
39#ifdef CONFIG_X86_64
40 unsigned long rip_rela_target_address; /* insn length + displacement; 0 if not rip-relative */
41#endif
42};
43
/* x86-specific per-task state kept across an out-of-line single-step. */
44struct arch_uprobe_task {
45 unsigned long saved_trap_nr; /* thread.trap_nr as saved by arch_uprobe_pre_xol() */
46#ifdef CONFIG_X86_64
47 unsigned long saved_scratch_register; /* original ax/cx value while it holds the rip-relative target */
48#endif
49};
50
51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm);
52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
55extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
56extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
57#endif /* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 532d2e090e6f..d23d83577d6b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
101 101
102obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 102obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
103obj-$(CONFIG_OF) += devicetree.o 103obj-$(CONFIG_OF) += devicetree.o
104obj-$(CONFIG_UPROBES) += uprobes.o
104 105
105### 106###
106# 64 bit specific files 107# 64 bit specific files
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 115eac431483..041af2fd088d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -18,6 +18,7 @@
18#include <linux/personality.h> 18#include <linux/personality.h>
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/user-return-notifier.h> 20#include <linux/user-return-notifier.h>
21#include <linux/uprobes.h>
21 22
22#include <asm/processor.h> 23#include <asm/processor.h>
23#include <asm/ucontext.h> 24#include <asm/ucontext.h>
@@ -824,6 +825,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
824 mce_notify_process(); 825 mce_notify_process();
825#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 826#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
826 827
828 if (thread_info_flags & _TIF_UPROBE) {
829 clear_thread_flag(TIF_UPROBE);
830 uprobe_notify_resume(regs);
831 }
832
827 /* deal with pending signal delivery */ 833 /* deal with pending signal delivery */
828 if (thread_info_flags & _TIF_SIGPENDING) 834 if (thread_info_flags & _TIF_SIGPENDING)
829 do_signal(regs); 835 do_signal(regs);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..dc4e910a7d96
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,674 @@
1/*
2 * User-space Probes (UProbes) for x86
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2011
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 */
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/ptrace.h>
26#include <linux/uprobes.h>
27#include <linux/uaccess.h>
28
29#include <linux/kdebug.h>
30#include <asm/processor.h>
31#include <asm/insn.h>
32
33/* Post-execution fixups. */
34
35/* No fixup needed */
36#define UPROBE_FIX_NONE 0x0
37
38/* Adjust IP back to vicinity of actual insn */
39#define UPROBE_FIX_IP 0x1
40
41/* Adjust the return address of a call insn */
42#define UPROBE_FIX_CALL 0x2
43
44#define UPROBE_FIX_RIP_AX 0x8000
45#define UPROBE_FIX_RIP_CX 0x4000
46
47#define UPROBE_TRAP_NR UINT_MAX
48
49/* Adaptations for mhiramat x86 decoder v14. */
50#define OPCODE1(insn) ((insn)->opcode.bytes[0])
51#define OPCODE2(insn) ((insn)->opcode.bytes[1])
52#define OPCODE3(insn) ((insn)->opcode.bytes[2])
53#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
54
55#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
56 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
57 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
58 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
59 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
60 << (row % 32))
61
62/*
63 * Good-instruction tables for 32-bit apps. This is non-const and volatile
64 * to keep gcc from statically optimizing it out, as variable_test_bit makes
65 * some versions of gcc think only *(unsigned long*) is used.
66 */
67static volatile u32 good_insns_32[256 / 32] = {
68 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
69 /* ---------------------------------------------- */
70 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
71 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
72 W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
73 W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
74 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
75 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
76 W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
77 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
78 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
79 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
80 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
81 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
82 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
83 W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
84 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
85 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
86 /* ---------------------------------------------- */
87 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
88};
89
90/* Using this for both 64-bit and 32-bit apps */
91static volatile u32 good_2byte_insns[256 / 32] = {
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 /* ---------------------------------------------- */
94 W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
95 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
96 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
97 W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
98 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
99 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
100 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
101 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
102 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
103 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
104 W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
105 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
106 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
107 W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
108 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
109 W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
110 /* ---------------------------------------------- */
111 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
112};
113
114#ifdef CONFIG_X86_64
115/* Good-instruction tables for 64-bit apps */
116static volatile u32 good_insns_64[256 / 32] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ---------------------------------------------- */
119 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
120 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
121 W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
122 W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
123 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
124 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
125 W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
126 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
127 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
128 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
129 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
130 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
131 W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
132 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
133 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
134 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
135 /* ---------------------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137};
138#endif
139#undef W
140
141/*
142 * opcodes we'll probably never support:
143 *
144 * 6c-6d, e4-e5, ec-ed - in
145 * 6e-6f, e6-e7, ee-ef - out
146 * cc, cd - int3, int
147 * cf - iret
148 * d6 - illegal instruction
149 * f1 - int1/icebp
150 * f4 - hlt
151 * fa, fb - cli, sti
152 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
153 *
154 * invalid opcodes in 64-bit mode:
155 *
156 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
157 * 63 - we support this opcode in x86_64 but not in i386.
158 *
159 * opcodes we may need to refine support for:
160 *
161 * 0f - 2-byte instructions: For many of these instructions, the validity
162 * depends on the prefix and/or the reg field. On such instructions, we
163 * just consider the opcode combination valid if it corresponds to any
164 * valid instruction.
165 *
166 * 8f - Group 1 - only reg = 0 is OK
167 * c6-c7 - Group 11 - only reg = 0 is OK
168 * d9-df - fpu insns with some illegal encodings
169 * f2, f3 - repnz, repz prefixes. These are also the first byte for
170 * certain floating-point instructions, such as addsd.
171 *
172 * fe - Group 4 - only reg = 0 or 1 is OK
173 * ff - Group 5 - only reg = 0-6 is OK
174 *
175 * others -- Do we need to support these?
176 *
177 * 0f - (floating-point?) prefetch instructions
178 * 07, 17, 1f - pop es, pop ss, pop ds
179 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
180 * but 64 and 65 (fs: and gs:) seem to be used, so we support them
181 * 67 - addr16 prefix
182 * ce - into
183 * f0 - lock prefix
184 */
185
186/*
187 * TODO:
188 * - Where necessary, examine the modrm byte and allow only valid instructions
189 * in the different Groups and fpu instructions.
190 */
191
/*
 * Return true if any decoded prefix byte of @insn is one that uprobes
 * rejects: an es:, cs:, ss: or ds: segment override, or the lock prefix.
 */
192static bool is_prefix_bad(struct insn *insn)
193{
194 int i;
195
196 for (i = 0; i < insn->prefixes.nbytes; i++) {
197 switch (insn->prefixes.bytes[i]) {
198 case 0x26: /* INAT_PFX_ES */
199 case 0x2E: /* INAT_PFX_CS */
200 case 0x36: /* INAT_PFX_SS */
201 case 0x3E: /* INAT_PFX_DS */
202 case 0xF0: /* INAT_PFX_LOCK */
203 return true;
204 }
205 }
206 return false;
207}
208
/*
 * Decode auprobe->insn into @insn and check it against the 32-bit
 * good-instruction bitmaps.
 * Returns 0 if the instruction may be probed, -ENOTSUPP if it carries a
 * rejected prefix or its opcode is not marked good.
 */
209static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
210{
211 insn_init(insn, auprobe->insn, false); /* presumably false == not 64-bit decoding; confirm against insn_init() */
212
213 /* Skip good instruction prefixes; reject "bad" ones. */
214 insn_get_opcode(insn);
215 if (is_prefix_bad(insn))
216 return -ENOTSUPP;
217
218 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) /* first opcode byte is marked good */
219 return 0;
220
221 if (insn->opcode.nbytes == 2) { /* two-byte opcode: look up the second byte instead */
222 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
223 return 0;
224 }
225
226 return -ENOTSUPP;
227}
228
229/*
230 * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
231 * annotate arch_uprobe->fixups accordingly. To start with,
232 * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
233 */
234static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
235{
236 bool fix_ip = true, fix_call = false; /* defaults */
237 int reg;
238
239 insn_get_opcode(insn); /* should be a nop: the validate_insn_*() helpers already decoded the opcode */
240
241 switch (OPCODE1(insn)) {
242 case 0xc3: /* ret/lret */
243 case 0xcb:
244 case 0xc2:
245 case 0xca:
246 /* ip is correct */
247 fix_ip = false;
248 break;
249 case 0xe8: /* call relative - Fix return addr */
250 fix_call = true;
251 break;
252 case 0x9a: /* call absolute - Fix return addr, not ip */
253 fix_call = true;
254 fix_ip = false;
255 break;
256 case 0xff: /* Group 5: the modrm reg field selects the operation */
257 insn_get_modrm(insn);
258 reg = MODRM_REG(insn);
259 if (reg == 2 || reg == 3) {
260 /* call or lcall, indirect */
261 /* Fix return addr; ip is correct. */
262 fix_call = true;
263 fix_ip = false;
264 } else if (reg == 4 || reg == 5) {
265 /* jmp or ljmp, indirect */
266 /* ip is correct. */
267 fix_ip = false;
268 }
269 break;
270 case 0xea: /* jmp absolute -- ip is correct */
271 fix_ip = false;
272 break;
273 default: /* everything else keeps the defaults: fix the ip only */
274 break;
275 }
276 if (fix_ip) /* OR-in, so rip-related flags set earlier are preserved */
277 auprobe->fixups |= UPROBE_FIX_IP;
278 if (fix_call)
279 auprobe->fixups |= UPROBE_FIX_CALL;
280}
281
282#ifdef CONFIG_X86_64
283/*
284 * If arch_uprobe->insn doesn't use rip-relative addressing, return
285 * immediately. Otherwise, rewrite the instruction so that it accesses
286 * its memory operand indirectly through a scratch register. Set
287 * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
288 * accordingly. (The contents of the scratch register will be saved
289 * before we single-step the modified instruction, and restored
290 * afterward.)
291 *
292 * We do this because a rip-relative instruction can access only a
293 * relatively small area (+/- 2 GB from the instruction), and the XOL
294 * area typically lies beyond that area. At least for instructions
295 * that store to memory, we can't execute the original instruction
296 * and "fix things up" later, because the misdirected store could be
297 * disastrous.
298 *
299 * Some useful facts about rip-relative instructions:
300 *
301 * - There's always a modrm byte.
302 * - There's never a SIB byte.
303 * - The displacement is always 4 bytes.
304 */
305static void
306handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
307{
308 u8 *cursor;
309 u8 reg;
310
311 if (mm->context.ia32_compat) /* nothing to rewrite for an ia32_compat address space */
312 return;
313
314 auprobe->rip_rela_target_address = 0x0;
315 if (!insn_rip_relative(insn))
316 return;
317
318 /*
319 * insn_rip_relative() would have decoded rex_prefix, modrm.
320 * Clear REX.b bit (extension of MODRM.rm field):
321 * we want to encode rax/rcx, not r8/r9.
322 */
323 if (insn->rex_prefix.nbytes) {
324 cursor = auprobe->insn + insn_offset_rex_prefix(insn);
325 *cursor &= 0xfe; /* Clearing REX.B bit */
326 }
327
328 /*
329 * Point cursor at the modrm byte. The next 4 bytes are the
330 * displacement. Beyond the displacement, for some instructions,
331 * is the immediate operand.
332 */
333 cursor = auprobe->insn + insn_offset_modrm(insn);
334 insn_get_length(insn);
335
336 /*
337 * Convert from rip-relative addressing to indirect addressing
338 * via a scratch register. Change the r/m field from 0x5 (%rip)
339 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
340 */
341 reg = MODRM_REG(insn);
342 if (reg == 0) {
343 /*
344 * The register operand (if any) is either the A register
345 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
346 * REX prefix) %r8. In any case, we know the C register
347 * is NOT the register operand, so we use %rcx (register
348 * #1) for the scratch register.
349 */
350 auprobe->fixups = UPROBE_FIX_RIP_CX;
351 /* Change modrm from 00 000 101 to 00 000 001. */
352 *cursor = 0x1;
353 } else {
354 /* Use %rax (register #0) for the scratch register. */
355 auprobe->fixups = UPROBE_FIX_RIP_AX;
356 /* Change modrm from 00 xxx 101 to 00 xxx 000 */
357 *cursor = (reg << 3);
358 }
359
360 /*
 * Store the target as an offset from the probed address: insn length +
 * (signed) displacement. pre_xol_rip_insn() adds the probe vaddr to it.
 */
361 auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
362
363 /* Displacement field is gone; slide immediate field (if any) over. */
364 if (insn->immediate.nbytes) {
365 cursor++;
366 memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
367 }
368 return;
369}
370
/*
 * Decode auprobe->insn into @insn and check it against the 64-bit
 * good-instruction bitmaps.
 * Returns 0 if the instruction may be probed, -ENOTSUPP if it carries a
 * rejected prefix or its opcode is not marked good.
 */
371static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
372{
373 insn_init(insn, auprobe->insn, true); /* presumably true == 64-bit decoding; confirm against insn_init() */
374
375 /* Skip good instruction prefixes; reject "bad" ones. */
376 insn_get_opcode(insn);
377 if (is_prefix_bad(insn))
378 return -ENOTSUPP;
379
380 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) /* first opcode byte is marked good */
381 return 0;
382
383 if (insn->opcode.nbytes == 2) { /* two-byte opcode: look up the second byte instead */
384 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
385 return 0;
386 }
387 return -ENOTSUPP;
388}
389
/*
 * Validate the probed instruction with the 32-bit tables when
 * mm->context.ia32_compat is set, and with the 64-bit tables otherwise.
 * Returns whatever the selected validator returns (0 or -ENOTSUPP).
 */
390static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
391{
392 if (mm->context.ia32_compat)
393 return validate_insn_32bits(auprobe, insn);
394 return validate_insn_64bits(auprobe, insn);
395}
396#else /* 32-bit: */
397static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
398{
399 /* No RIP-relative addressing on 32-bit, so this build has nothing to rewrite */
400}
401
/* 32-bit build: only the 32-bit validator is used; @mm is not consulted. */
402static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
403{
404 return validate_insn_32bits(auprobe, insn);
405}
406#endif /* CONFIG_X86_64 */
407
408/**
409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
410 * @auprobe: the probepoint information.
411 * @mm: the probed address space.
412 * Return 0 on success or a -ve number on error.
413 */
414int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
415{
416 int ret;
417 struct insn insn;
418
419 auprobe->fixups = 0;
420 ret = validate_insn_bits(auprobe, mm, &insn); /* also initialises and decodes insn */
421 if (ret != 0)
422 return ret;
423
424 handle_riprel_insn(auprobe, mm, &insn); /* may set the rip-related fixups first ... */
425 prepare_fixups(auprobe, &insn); /* ... which this then ORs further flags into */
426
427 return 0;
428}
429
430#ifdef CONFIG_X86_64
431/*
432 * If we're emulating a rip-relative instruction, save the contents
433 * of the scratch register and store the target address in that register.
 * The scratch register (ax or cx) is selected by the UPROBE_FIX_RIP_* flag
 * in auprobe->fixups; if neither flag is set, nothing is changed.
434 */
435static void
436pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
437 struct arch_uprobe_task *autask)
438{
439 if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
440 autask->saved_scratch_register = regs->ax;
441 regs->ax = current->utask->vaddr; /* start from the task's utask->vaddr ... */
442 regs->ax += auprobe->rip_rela_target_address; /* ... plus insn length + displacement */
443 } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
444 autask->saved_scratch_register = regs->cx;
445 regs->cx = current->utask->vaddr;
446 regs->cx += auprobe->rip_rela_target_address;
447 }
448}
449#else
450static void
451pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
452 struct arch_uprobe_task *autask)
453{
454 /* No RIP-relative addressing on 32-bit, so no scratch register needs saving */
455}
456#endif
457
458/*
459 * arch_uprobe_pre_xol - prepare to execute out of line.
460 * @auprobe: the probepoint information.
461 * @regs: reflects the saved user state of current task.
 *
 * Always returns 0.
462 */
463int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
464{
465 struct arch_uprobe_task *autask;
466
467 autask = &current->utask->autask;
468 autask->saved_trap_nr = current->thread.trap_nr; /* restored by arch_uprobe_post_xol() */
469 current->thread.trap_nr = UPROBE_TRAP_NR; /* marker checked by arch_uprobe_xol_was_trapped() */
470 regs->ip = current->utask->xol_vaddr; /* redirect the ip to utask->xol_vaddr */
471 pre_xol_rip_insn(auprobe, regs, autask);
472
473 return 0;
474}
475
476/*
477 * This function is called by arch_uprobe_post_xol() to adjust the return
478 * address pushed by a call instruction executed out of line.
 *
 * @sp: user address holding the return address.
 * @correction: signed amount added to the return address.
 *
 * Returns 0 on success, or -EFAULT if the user memory at @sp cannot be
 * read or written.
479 */
480static int adjust_ret_addr(unsigned long sp, long correction)
481{
482 int rasize, ncopied;
483 long ra = 0; /* zeroed so a 4-byte read leaves the remaining bytes clear */
484
485 if (is_ia32_task()) /* return address is 4 bytes for ia32 tasks, 8 otherwise */
486 rasize = 4;
487 else
488 rasize = 8;
489
490 ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
491 if (unlikely(ncopied)) /* non-zero result is treated as a failed copy */
492 return -EFAULT;
493
494 ra += correction;
495 ncopied = copy_to_user((void __user *)sp, &ra, rasize);
496 if (unlikely(ncopied))
497 return -EFAULT;
498
499 return 0;
500}
501
502#ifdef CONFIG_X86_64
/* Return true if either rip-relative fixup flag (AX or CX scratch) is set in auprobe->fixups. */
503static bool is_riprel_insn(struct arch_uprobe *auprobe)
504{
505 return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
506}
507
/*
 * Undo pre_xol_rip_insn(): for a rip-relative probe, restore the scratch
 * register (ax or cx) from the task's saved copy and, if @correction is
 * non-NULL, add 4 to it. Does nothing for other instructions.
 */
508static void
509handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
510{
511 if (is_riprel_insn(auprobe)) {
512 struct arch_uprobe_task *autask;
513
514 autask = &current->utask->autask;
515 if (auprobe->fixups & UPROBE_FIX_RIP_AX)
516 regs->ax = autask->saved_scratch_register;
517 else
518 regs->cx = autask->saved_scratch_register;
519
520 /*
521 * The original instruction includes a displacement, and so
522 * is 4 bytes longer than what we've just single-stepped.
523 * Fall through to handle stuff like "jmpq *...(%rip)" and
524 * "callq *...(%rip)".
525 */
526 if (correction)
527 *correction += 4;
528 }
529}
530#else
531static void
532handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
533{
534 /* No RIP-relative addressing on 32-bit, so registers and *correction are left untouched */
535}
536#endif
537
538/*
539 * If xol insn itself traps and generates a signal (say,
540 * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
541 * instruction jumps back to its own address. It is assumed that anything
542 * like do_page_fault/do_trap/etc sets thread.trap_nr != UPROBE_TRAP_NR.
543 *
544 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
545 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
546 * UPROBE_TRAP_NR (UINT_MAX) set by arch_uprobe_pre_xol().
547 */
548bool arch_uprobe_xol_was_trapped(struct task_struct *t)
549{
550 if (t->thread.trap_nr != UPROBE_TRAP_NR) /* marker was overwritten since arch_uprobe_pre_xol() */
551 return true;
552
553 return false;
554}
555
556/*
557 * Called after single-stepping. To avoid the SMP problems that can
558 * occur when we temporarily put back the original opcode to
559 * single-step, we single-stepped a copy of the instruction.
560 *
561 * This function prepares to resume execution after the single-step.
562 * We have to fix things up as follows:
563 *
564 * Typically, the new ip is relative to the copied instruction. We need
565 * to make it relative to the original instruction (FIX_IP). Exceptions
566 * are return instructions and absolute or indirect jump or call instructions.
567 *
568 * If the single-stepped instruction was a call, the return address that
569 * is atop the stack is the address following the copied instruction. We
570 * need to make it the address following the original instruction (FIX_CALL).
571 *
572 * If the original instruction was a rip-relative instruction such as
573 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
574 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
575 * We need to restore the contents of the scratch register and adjust
576 * the ip, keeping in mind that the instruction we executed is 4 bytes
577 * shorter than the original instruction (since we squeezed out the offset
578 * field). (FIX_RIP_AX or FIX_RIP_CX)
579 */
580int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
581{
582 struct uprobe_task *utask;
583 long correction;
584 int result = 0;
585
586 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
587
588 utask = current->utask;
589 current->thread.trap_nr = utask->autask.saved_trap_nr;
590 correction = (long)(utask->vaddr - utask->xol_vaddr);
591 handle_riprel_post_xol(auprobe, regs, &correction);
592 if (auprobe->fixups & UPROBE_FIX_IP)
593 regs->ip += correction;
594
595 if (auprobe->fixups & UPROBE_FIX_CALL)
596 result = adjust_ret_addr(regs->sp, correction);
597
598 return result;
599}
600
601/* callback routine for handling exceptions. */
602int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
603{
604 struct die_args *args = data;
605 struct pt_regs *regs = args->regs;
606 int ret = NOTIFY_DONE;
607
608 /* We are only interested in userspace traps */
609 if (regs && !user_mode_vm(regs))
610 return NOTIFY_DONE;
611
612 switch (val) {
613 case DIE_INT3:
614 if (uprobe_pre_sstep_notifier(regs))
615 ret = NOTIFY_STOP;
616
617 break;
618
619 case DIE_DEBUG:
620 if (uprobe_post_sstep_notifier(regs))
621 ret = NOTIFY_STOP;
622
623 default:
624 break;
625 }
626
627 return ret;
628}
629
630/*
631 * This function gets called when XOL instruction either gets trapped or
632 * the thread has a fatal signal, so reset the instruction pointer to its
633 * probed address.
634 */
635void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
636{
637 struct uprobe_task *utask = current->utask;
638
639 current->thread.trap_nr = utask->autask.saved_trap_nr;
640 handle_riprel_post_xol(auprobe, regs, NULL);
641 instruction_pointer_set(regs, utask->vaddr);
642}
643
644/*
645 * Skip these instructions as per the currently known x86 ISA.
646 * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
647 */
648bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
649{
650 int i;
651
652 for (i = 0; i < MAX_UINSN_BYTES; i++) {
653 if ((auprobe->insn[i] == 0x66))
654 continue;
655
656 if (auprobe->insn[i] == 0x90)
657 return true;
658
659 if (i == (MAX_UINSN_BYTES - 1))
660 break;
661
662 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f))
663 return true;
664
665 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19))
666 return true;
667
668 if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0))
669 return true;
670
671 break;
672 }
673 return false;
674}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc3062b3767..26574c726121 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/page-debug-flags.h> 14#include <linux/page-debug-flags.h>
15#include <linux/uprobes.h>
15#include <asm/page.h> 16#include <asm/page.h>
16#include <asm/mmu.h> 17#include <asm/mmu.h>
17 18
@@ -388,6 +389,7 @@ struct mm_struct {
388#ifdef CONFIG_CPUMASK_OFFSTACK 389#ifdef CONFIG_CPUMASK_OFFSTACK
389 struct cpumask cpumask_allocation; 390 struct cpumask cpumask_allocation;
390#endif 391#endif
392 struct uprobes_state uprobes_state;
391}; 393};
392 394
393static inline void mm_init_cpumask(struct mm_struct *mm) 395static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81a173c0897d..cff94cda34b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1617,6 +1617,10 @@ struct task_struct {
1617#ifdef CONFIG_HAVE_HW_BREAKPOINT 1617#ifdef CONFIG_HAVE_HW_BREAKPOINT
1618 atomic_t ptrace_bp_refcnt; 1618 atomic_t ptrace_bp_refcnt;
1619#endif 1619#endif
1620#ifdef CONFIG_UPROBES
1621 struct uprobe_task *utask;
1622 int uprobe_srcu_id;
1623#endif
1620}; 1624};
1621 1625
1622/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1626/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
new file mode 100644
index 000000000000..efe4b3308c74
--- /dev/null
+++ b/include/linux/uprobes.h
@@ -0,0 +1,165 @@
1#ifndef _LINUX_UPROBES_H
2#define _LINUX_UPROBES_H
3/*
4 * User-space Probes (UProbes)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2008-2012
21 * Authors:
22 * Srikar Dronamraju
23 * Jim Keniston
24 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
25 */
26
27#include <linux/errno.h>
28#include <linux/rbtree.h>
29
30struct vm_area_struct;
31struct mm_struct;
32struct inode;
33
34#ifdef CONFIG_ARCH_SUPPORTS_UPROBES
35# include <asm/uprobes.h>
36#endif
37
38/* flags that denote/change uprobes behaviour */
39
40/* Have a copy of original instruction */
41#define UPROBE_COPY_INSN 0x1
42
43/* Don't run handlers while the first register / last unregister is in progress */
44#define UPROBE_RUN_HANDLER 0x2
45/* Can skip singlestep */
46#define UPROBE_SKIP_SSTEP 0x4
47
48struct uprobe_consumer {
49 int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
50 /*
51 * filter is optional; If a filter exists, handler is run
52 * if and only if filter returns true.
53 */
54 bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
55
56 struct uprobe_consumer *next;
57};
58
59#ifdef CONFIG_UPROBES
/*
 * Per-task uprobe state, stored in uprobe_task::state.
 * NOTE(review): the transitions between these values are driven by code
 * outside this header; the names suggest breakpoint hit -> single-step ->
 * acknowledged/trapped -- confirm against kernel/events/uprobes.c.
 */
enum uprobe_task_state {
	UTASK_RUNNING,
	UTASK_BP_HIT,
	UTASK_SSTEP,
	UTASK_SSTEP_ACK,
	UTASK_SSTEP_TRAPPED,
};
67
68/*
69 * uprobe_task: Metadata of a task while it singlesteps.
70 */
71struct uprobe_task {
72 enum uprobe_task_state state;
73 struct arch_uprobe_task autask;
74
75 struct uprobe *active_uprobe;
76
77 unsigned long xol_vaddr;
78 unsigned long vaddr;
79};
80
81/*
82 * On a breakpoint hit, thread contests for a slot. It frees the
83 * slot after singlestep. Currently a fixed number of slots are
84 * allocated.
85 */
86struct xol_area {
87 wait_queue_head_t wq; /* if all slots are busy */
88 atomic_t slot_count; /* number of in-use slots */
89 unsigned long *bitmap; /* 0 = free slot */
90 struct page *page;
91
92 /*
93 * We keep the vma's vm_start rather than a pointer to the vma
94 * itself. The probed process or a naughty kernel module could make
95 * the vma go away, and we must handle that reasonably gracefully.
96 */
97 unsigned long vaddr; /* Page(s) of instruction slots */
98};
99
100struct uprobes_state {
101 struct xol_area *xol_area;
102 atomic_t count;
103};
104extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
105extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify);
106extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
107extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
108extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
109extern int uprobe_mmap(struct vm_area_struct *vma);
110extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
111extern void uprobe_free_utask(struct task_struct *t);
112extern void uprobe_copy_process(struct task_struct *t);
113extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
114extern int uprobe_post_sstep_notifier(struct pt_regs *regs);
115extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
116extern void uprobe_notify_resume(struct pt_regs *regs);
117extern bool uprobe_deny_signal(void);
118extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
119extern void uprobe_clear_state(struct mm_struct *mm);
120extern void uprobe_reset_state(struct mm_struct *mm);
121#else /* !CONFIG_UPROBES */
122struct uprobes_state {
123};
124static inline int
125uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
126{
127 return -ENOSYS;
128}
129static inline void
130uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
131{
132}
133static inline int uprobe_mmap(struct vm_area_struct *vma)
134{
135 return 0;
136}
137static inline void
138uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
139{
140}
141static inline void uprobe_notify_resume(struct pt_regs *regs)
142{
143}
144static inline bool uprobe_deny_signal(void)
145{
146 return false;
147}
148static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
149{
150 return 0;
151}
152static inline void uprobe_free_utask(struct task_struct *t)
153{
154}
155static inline void uprobe_copy_process(struct task_struct *t)
156{
157}
158static inline void uprobe_clear_state(struct mm_struct *mm)
159{
160}
161static inline void uprobe_reset_state(struct mm_struct *mm)
162{
163}
164#endif /* !CONFIG_UPROBES */
165#endif /* _LINUX_UPROBES_H */
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf4..103f5d147b2f 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o callchain.o
6
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 7obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
8obj-$(CONFIG_UPROBES) += uprobes.o
9
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 000000000000..985be4d80fe8
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1667 @@
1/*
2 * User-space Probes (UProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2012
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 */
24
25#include <linux/kernel.h>
26#include <linux/highmem.h>
27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h>
29#include <linux/sched.h>
30#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */
35
36#include <linux/uprobes.h>
37
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT;
43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45
46#define UPROBES_HASH_SZ 13
47
48/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50
51#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
52
53/* serialize uprobe->pending_list */
54static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
55#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
56
57/*
58 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
59 * events active at this time. Probably a fine grained per inode count is
60 * better?
61 */
62static atomic_t uprobe_events = ATOMIC_INIT(0);
63
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
67 * vm_area_struct wasnt recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref;
78 struct rw_semaphore consumer_rwsem;
79 struct list_head pending_list;
80 struct uprobe_consumer *consumers;
81 struct inode *inode; /* Also hold a ref to inode */
82 loff_t offset;
83 int flags;
84 struct arch_uprobe arch;
85};
86
87/*
88 * valid_vma: Verify if the specified vma is an executable vma
89 * Relax restrictions while unregistering: vm_flags might have
90 * changed after breakpoint was inserted.
91 * - is_register: indicates if we are in register context.
92 * - Return 1 if the specified virtual address is in an
93 * executable vma.
94 */
95static bool valid_vma(struct vm_area_struct *vma, bool is_register)
96{
97 if (!vma->vm_file)
98 return false;
99
100 if (!is_register)
101 return true;
102
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
104 return true;
105
106 return false;
107}
108
109static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
110{
111 loff_t vaddr;
112
113 vaddr = vma->vm_start + offset;
114 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
115
116 return vaddr;
117}
118
119/**
120 * __replace_page - replace page in vma by new page.
121 * based on replace_page in mm/ksm.c
122 *
123 * @vma: vma that holds the pte pointing to page
124 * @page: the cowed page we are replacing by kpage
125 * @kpage: the modified page we replace page by
126 *
127 * Returns 0 on success, -EFAULT on failure.
128 */
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
130{
131 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl;
137 unsigned long addr;
138 int err = -EFAULT;
139
140 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT)
142 goto out;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151
152 pmd = pmd_offset(pud, addr);
153 if (!pmd_present(*pmd))
154 goto out;
155
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
157 if (!ptep)
158 goto out;
159
160 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr);
162
163 if (!PageAnon(page)) {
164 dec_mm_counter(mm, MM_FILEPAGES);
165 inc_mm_counter(mm, MM_ANONPAGES);
166 }
167
168 flush_cache_page(vma, addr, pte_pfn(*ptep));
169 ptep_clear_flush(vma, addr, ptep);
170 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
171
172 page_remove_rmap(page);
173 if (!page_mapped(page))
174 try_to_free_swap(page);
175 put_page(page);
176 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178
179out:
180 return err;
181}
182
183/**
184 * is_swbp_insn - check if instruction is breakpoint instruction.
185 * @insn: instruction to be checked.
186 * Default implementation of is_swbp_insn
187 * Returns true if @insn is a breakpoint instruction.
188 */
189bool __weak is_swbp_insn(uprobe_opcode_t *insn)
190{
191 return *insn == UPROBE_SWBP_INSN;
192}
193
194/*
195 * NOTE:
196 * Expect the breakpoint instruction to be the smallest size instruction for
197 * the architecture. If an arch has variable length instruction and the
198 * breakpoint instruction is not of the smallest length instruction
199 * supported by that architecture then we need to modify read_opcode /
200 * write_opcode accordingly. This would never be a problem for archs that
201 * have fixed length instructions.
202 */
203
204/*
205 * write_opcode - write the opcode at a given virtual address.
206 * @auprobe: arch breakpointing information.
207 * @mm: the probed process address space.
208 * @vaddr: the virtual address to store the opcode.
209 * @opcode: opcode to be written at @vaddr.
210 *
211 * Called with mm->mmap_sem held (for read and with a reference to
212 * mm).
213 *
214 * For mm @mm, write the opcode at @vaddr.
215 * Return 0 (success) or a negative errno.
216 */
217static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
218 unsigned long vaddr, uprobe_opcode_t opcode)
219{
220 struct page *old_page, *new_page;
221 struct address_space *mapping;
222 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma;
224 struct uprobe *uprobe;
225 loff_t addr;
226 int ret;
227
228 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0)
231 return ret;
232
233 ret = -EINVAL;
234
235 /*
236 * We are interested in text pages only. Our pages of interest
237 * should be mapped for read and execute only. We desist from
238 * adding probes in write mapped pages since the breakpoints
239 * might end up in the file copy.
240 */
241 if (!valid_vma(vma, is_swbp_insn(&opcode)))
242 goto put_out;
243
244 uprobe = container_of(auprobe, struct uprobe, arch);
245 mapping = uprobe->inode->i_mapping;
246 if (mapping != vma->vm_file->f_mapping)
247 goto put_out;
248
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page)
256 goto put_out;
257
258 __SetPageUptodate(new_page);
259
260 /*
261 * lock page will serialize against do_wp_page()'s
262 * PageAnon() handling
263 */
264 lock_page(old_page);
265 /* copy the page now that we've got it stable */
266 vaddr_old = kmap_atomic(old_page);
267 vaddr_new = kmap_atomic(new_page);
268
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275
276 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old);
278
279 ret = anon_vma_prepare(vma);
280 if (ret)
281 goto unlock_out;
282
283 lock_page(new_page);
284 ret = __replace_page(vma, old_page, new_page);
285 unlock_page(new_page);
286
287unlock_out:
288 unlock_page(old_page);
289 page_cache_release(new_page);
290
291put_out:
292 put_page(old_page);
293
294 return ret;
295}
296
297/**
298 * read_opcode - read the opcode at a given virtual address.
299 * @mm: the probed process address space.
300 * @vaddr: the virtual address to read the opcode.
301 * @opcode: location to store the read opcode.
302 *
303 * Called with mm->mmap_sem held (for read and with a reference to
304 * mm.
305 *
306 * For mm @mm, read the opcode at @vaddr and store it in @opcode.
307 * Return 0 (success) or a negative errno.
308 */
309static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
310{
311 struct page *page;
312 void *vaddr_new;
313 int ret;
314
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
316 if (ret <= 0)
317 return ret;
318
319 lock_page(page);
320 vaddr_new = kmap_atomic(page);
321 vaddr &= ~PAGE_MASK;
322 memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
323 kunmap_atomic(vaddr_new);
324 unlock_page(page);
325
326 put_page(page);
327
328 return 0;
329}
330
331static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
332{
333 uprobe_opcode_t opcode;
334 int result;
335
336 result = read_opcode(mm, vaddr, &opcode);
337 if (result)
338 return result;
339
340 if (is_swbp_insn(&opcode))
341 return 1;
342
343 return 0;
344}
345
346/**
347 * set_swbp - store breakpoint at a given address.
348 * @auprobe: arch specific probepoint information.
349 * @mm: the probed process address space.
350 * @vaddr: the virtual address to insert the opcode.
351 *
352 * For mm @mm, store the breakpoint instruction at @vaddr.
353 * Return 0 (success) or a negative errno.
354 */
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{
357 int result;
358
359 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1)
361 return -EEXIST;
362
363 if (result)
364 return result;
365
366 return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
367}
368
369/**
370 * set_orig_insn - Restore the original instruction.
371 * @mm: the probed process address space.
372 * @auprobe: arch specific probepoint information.
373 * @vaddr: the virtual address to insert the opcode.
374 * @verify: if true, verify existance of breakpoint instruction.
375 *
376 * For mm @mm, restore the original opcode (opcode) at @vaddr.
377 * Return 0 (success) or a negative errno.
378 */
379int __weak
380set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
381{
382 if (verify) {
383 int result;
384
385 result = is_swbp_at_addr(mm, vaddr);
386 if (!result)
387 return -EINVAL;
388
389 if (result != 1)
390 return result;
391 }
392 return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
393}
394
395static int match_uprobe(struct uprobe *l, struct uprobe *r)
396{
397 if (l->inode < r->inode)
398 return -1;
399
400 if (l->inode > r->inode)
401 return 1;
402
403 if (l->offset < r->offset)
404 return -1;
405
406 if (l->offset > r->offset)
407 return 1;
408
409 return 0;
410}
411
412static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
413{
414 struct uprobe u = { .inode = inode, .offset = offset };
415 struct rb_node *n = uprobes_tree.rb_node;
416 struct uprobe *uprobe;
417 int match;
418
419 while (n) {
420 uprobe = rb_entry(n, struct uprobe, rb_node);
421 match = match_uprobe(&u, uprobe);
422 if (!match) {
423 atomic_inc(&uprobe->ref);
424 return uprobe;
425 }
426
427 if (match < 0)
428 n = n->rb_left;
429 else
430 n = n->rb_right;
431 }
432 return NULL;
433}
434
435/*
436 * Find a uprobe corresponding to a given inode:offset
437 * Acquires uprobes_treelock
438 */
439static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
440{
441 struct uprobe *uprobe;
442 unsigned long flags;
443
444 spin_lock_irqsave(&uprobes_treelock, flags);
445 uprobe = __find_uprobe(inode, offset);
446 spin_unlock_irqrestore(&uprobes_treelock, flags);
447
448 return uprobe;
449}
450
451static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
452{
453 struct rb_node **p = &uprobes_tree.rb_node;
454 struct rb_node *parent = NULL;
455 struct uprobe *u;
456 int match;
457
458 while (*p) {
459 parent = *p;
460 u = rb_entry(parent, struct uprobe, rb_node);
461 match = match_uprobe(uprobe, u);
462 if (!match) {
463 atomic_inc(&u->ref);
464 return u;
465 }
466
467 if (match < 0)
468 p = &parent->rb_left;
469 else
470 p = &parent->rb_right;
471
472 }
473
474 u = NULL;
475 rb_link_node(&uprobe->rb_node, parent, p);
476 rb_insert_color(&uprobe->rb_node, &uprobes_tree);
477 /* get access + creation ref */
478 atomic_set(&uprobe->ref, 2);
479
480 return u;
481}
482
483/*
484 * Acquire uprobes_treelock.
485 * Matching uprobe already exists in rbtree;
486 * increment (access refcount) and return the matching uprobe.
487 *
488 * No matching uprobe; insert the uprobe in rb_tree;
489 * get a double refcount (access + creation) and return NULL.
490 */
491static struct uprobe *insert_uprobe(struct uprobe *uprobe)
492{
493 unsigned long flags;
494 struct uprobe *u;
495
496 spin_lock_irqsave(&uprobes_treelock, flags);
497 u = __insert_uprobe(uprobe);
498 spin_unlock_irqrestore(&uprobes_treelock, flags);
499
500 /* For now assume that the instruction need not be single-stepped */
501 uprobe->flags |= UPROBE_SKIP_SSTEP;
502
503 return u;
504}
505
506static void put_uprobe(struct uprobe *uprobe)
507{
508 if (atomic_dec_and_test(&uprobe->ref))
509 kfree(uprobe);
510}
511
512static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
513{
514 struct uprobe *uprobe, *cur_uprobe;
515
516 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
517 if (!uprobe)
518 return NULL;
519
520 uprobe->inode = igrab(inode);
521 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524
525 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe);
527
528 /* a uprobe exists for this inode:offset combination */
529 if (cur_uprobe) {
530 kfree(uprobe);
531 uprobe = cur_uprobe;
532 iput(inode);
533 } else {
534 atomic_inc(&uprobe_events);
535 }
536
537 return uprobe;
538}
539
540static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
541{
542 struct uprobe_consumer *uc;
543
544 if (!(uprobe->flags & UPROBE_RUN_HANDLER))
545 return;
546
547 down_read(&uprobe->consumer_rwsem);
548 for (uc = uprobe->consumers; uc; uc = uc->next) {
549 if (!uc->filter || uc->filter(uc, current))
550 uc->handler(uc, regs);
551 }
552 up_read(&uprobe->consumer_rwsem);
553}
554
555/* Returns the previous consumer */
556static struct uprobe_consumer *
557consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
558{
559 down_write(&uprobe->consumer_rwsem);
560 uc->next = uprobe->consumers;
561 uprobe->consumers = uc;
562 up_write(&uprobe->consumer_rwsem);
563
564 return uc->next;
565}
566
567/*
568 * For uprobe @uprobe, delete the consumer @uc.
569 * Return true if the @uc is deleted successfully
570 * or return false.
571 */
572static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
573{
574 struct uprobe_consumer **con;
575 bool ret = false;
576
577 down_write(&uprobe->consumer_rwsem);
578 for (con = &uprobe->consumers; *con; con = &(*con)->next) {
579 if (*con == uc) {
580 *con = uc->next;
581 ret = true;
582 break;
583 }
584 }
585 up_write(&uprobe->consumer_rwsem);
586
587 return ret;
588}
589
590static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
592 unsigned long nbytes, unsigned long offset)
593{
594 struct file *filp = vma->vm_file;
595 struct page *page;
596 void *vaddr;
597 unsigned long off1;
598 unsigned long idx;
599
600 if (!filp)
601 return -EINVAL;
602
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
604 off1 = offset &= ~PAGE_MASK;
605
606 /*
607 * Ensure that the page that has the original instruction is
608 * populated and in page-cache.
609 */
610 page = read_mapping_page(mapping, idx, filp);
611 if (IS_ERR(page))
612 return PTR_ERR(page);
613
614 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes);
616 kunmap_atomic(vaddr);
617 page_cache_release(page);
618
619 return 0;
620}
621
622static int
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{
625 struct address_space *mapping;
626 unsigned long nbytes;
627 int bytes;
628
629 addr &= ~PAGE_MASK;
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping;
632
633 /* Instruction at end of binary; copy only available bytes */
634 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
635 bytes = uprobe->inode->i_size - uprobe->offset;
636 else
637 bytes = MAX_UINSN_BYTES;
638
639 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes))
643 return -ENOMEM;
644
645 bytes = nbytes;
646 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
648}
649
650/*
651 * How mm->uprobes_state.count gets updated
652 * uprobe_mmap() increments the count if
653 * - it successfully adds a breakpoint.
654 * - it cannot add a breakpoint, but sees that there is a underlying
655 * breakpoint (via a is_swbp_at_addr()).
656 *
657 * uprobe_munmap() decrements the count if
658 * - it sees a underlying breakpoint, (via is_swbp_at_addr)
659 * (Subsequent uprobe_unregister wouldnt find the breakpoint
660 * unless a uprobe_mmap kicks in, since the old vma would be
661 * dropped just after uprobe_munmap.)
662 *
663 * uprobe_register increments the count if:
664 * - it successfully adds a breakpoint.
665 *
666 * uprobe_unregister decrements the count if:
667 * - it sees a underlying breakpoint and removes successfully.
668 * (via is_swbp_at_addr)
669 * (Subsequent uprobe_munmap wouldnt find the breakpoint
670 * since there is no underlying breakpoint after the
671 * breakpoint removal.)
672 */
673static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr)
676{
677 unsigned long addr;
678 int ret;
679
680 /*
681 * If probe is being deleted, unregister thread could be done with
682 * the vma-rmap-walk through. Adding a probe now can be fatal since
683 * nobody will be able to cleanup. Also we could be from fork or
684 * mremap path, where the probe might have already been inserted.
685 * Hence behave as if probe already existed.
686 */
687 if (!uprobe->consumers)
688 return -EEXIST;
689
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr);
694 if (ret)
695 return ret;
696
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST;
699
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
701 if (ret)
702 return ret;
703
704 uprobe->flags |= UPROBE_COPY_INSN;
705 }
706
707 /*
708 * Ideally, should be updating the probe count after the breakpoint
709 * has been successfully inserted. However a thread could hit the
710 * breakpoint we just inserted even before the probe count is
711 * incremented. If this is the first breakpoint placed, breakpoint
712 * notifier might ignore uprobes and pass the trap to the thread.
713 * Hence increment before and decrement on failure.
714 */
715 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr);
717 if (ret)
718 atomic_dec(&mm->uprobes_state.count);
719
720 return ret;
721}
722
723static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
725{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
727 atomic_dec(&mm->uprobes_state.count);
728}
729
730/*
731 * There could be threads that have hit the breakpoint and are entering the
732 * notifier code and trying to acquire the uprobes_treelock. The thread
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can
734 * race with these threads and might acquire the uprobes_treelock compared
735 * to some of the breakpoint hit threads. In such a case, the breakpoint
736 * hit threads will not find the uprobe. The current unregistering thread
737 * waits till all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */
740static void delete_uprobe(struct uprobe *uprobe)
741{
742 unsigned long flags;
743
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags);
748 iput(uprobe->inode);
749 put_uprobe(uprobe);
750 atomic_dec(&uprobe_events);
751}
752
753static struct vma_info *
754__find_next_vma_info(struct address_space *mapping, struct list_head *head,
755 struct vma_info *vi, loff_t offset, bool is_register)
756{
757 struct prio_tree_iter iter;
758 struct vm_area_struct *vma;
759 struct vma_info *tmpvi;
760 unsigned long pgoff;
761 int existing_vma;
762 loff_t vaddr;
763
764 pgoff = offset >> PAGE_SHIFT;
765
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register))
768 continue;
769
770 existing_vma = 0;
771 vaddr = vma_address(vma, offset);
772
773 list_for_each_entry(tmpvi, head, probe_list) {
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
775 existing_vma = 1;
776 break;
777 }
778 }
779
780 /*
781 * Another vma needs a probe to be installed. However skip
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 }
791 }
792
793 return NULL;
794}
795
796/*
797 * Iterate in the rmap prio tree and find a vma where a probe has not
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
807 if (!vi)
808 return ERR_PTR(-ENOMEM);
809
810 mutex_lock(&mapping->i_mmap_mutex);
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
812 mutex_unlock(&mapping->i_mmap_mutex);
813
814 if (!retvi)
815 kfree(vi);
816
817 return retvi;
818}
819
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{
822 struct list_head try_list;
823 struct vm_area_struct *vma;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829
830 mapping = uprobe->inode->i_mapping;
831 INIT_LIST_HEAD(&try_list);
832
833 ret = 0;
834
835 for (;;) {
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
837 if (!vi)
838 break;
839
840 if (IS_ERR(vi)) {
841 ret = PTR_ERR(vi);
842 break;
843 }
844
845 mm = vi->mm;
846 down_read(&mm->mmap_sem);
847 vma = find_vma(mm, (unsigned long)vi->vaddr);
848 if (!vma || !valid_vma(vma, is_register)) {
849 list_del(&vi->probe_list);
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) {
858 list_del(&vi->probe_list);
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864
865 if (is_register)
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
867 else
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) {
873 if (ret && ret == -EEXIST)
874 ret = 0;
875 if (ret)
876 break;
877 }
878 }
879
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886}
887
888static int __uprobe_register(struct uprobe *uprobe)
889{
890 return register_for_each_vma(uprobe, true);
891}
892
893static void __uprobe_unregister(struct uprobe *uprobe)
894{
895 if (!register_for_each_vma(uprobe, false))
896 delete_uprobe(uprobe);
897
898 /* TODO : cant unregister? schedule a worker thread */
899}
900
901/*
902 * uprobe_register - register a probe
903 * @inode: the file in which the probe has to be placed.
904 * @offset: offset from the start of the file.
905 * @uc: information on howto handle the probe..
906 *
907 * Apart from the access refcount, uprobe_register() takes a creation
908 * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
909 * inserted into the rbtree (i.e first consumer for a @inode:@offset
910 * tuple). Creation refcount stops uprobe_unregister from freeing the
911 * @uprobe even before the register operation is complete. Creation
912 * refcount is released when the last @uc for the @uprobe
913 * unregisters.
914 *
915 * Return errno if it cannot successully install probes
916 * else return 0 (success)
917 */
918int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
919{
920 struct uprobe *uprobe;
921 int ret;
922
923 if (!inode || !uc || uc->next)
924 return -EINVAL;
925
926 if (offset > i_size_read(inode))
927 return -EINVAL;
928
929 ret = 0;
930 mutex_lock(uprobes_hash(inode));
931 uprobe = alloc_uprobe(inode, offset);
932
933 if (uprobe && !consumer_add(uprobe, uc)) {
934 ret = __uprobe_register(uprobe);
935 if (ret) {
936 uprobe->consumers = NULL;
937 __uprobe_unregister(uprobe);
938 } else {
939 uprobe->flags |= UPROBE_RUN_HANDLER;
940 }
941 }
942
943 mutex_unlock(uprobes_hash(inode));
944 put_uprobe(uprobe);
945
946 return ret;
947}
948
949/*
950 * uprobe_unregister - unregister a already registered probe.
951 * @inode: the file in which the probe has to be removed.
952 * @offset: offset from the start of the file.
953 * @uc: identify which probe if multiple probes are colocated.
954 */
955void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
956{
957 struct uprobe *uprobe;
958
959 if (!inode || !uc)
960 return;
961
962 uprobe = find_uprobe(inode, offset);
963 if (!uprobe)
964 return;
965
966 mutex_lock(uprobes_hash(inode));
967
968 if (consumer_del(uprobe, uc)) {
969 if (!uprobe->consumers) {
970 __uprobe_unregister(uprobe);
971 uprobe->flags &= ~UPROBE_RUN_HANDLER;
972 }
973 }
974
975 mutex_unlock(uprobes_hash(inode));
976 if (uprobe)
977 put_uprobe(uprobe);
978}
979
980/*
981 * Of all the nodes that correspond to the given inode, return the node
982 * with the least offset.
983 */
984static struct rb_node *find_least_offset_node(struct inode *inode)
985{
986 struct uprobe u = { .inode = inode, .offset = 0};
987 struct rb_node *n = uprobes_tree.rb_node;
988 struct rb_node *close_node = NULL;
989 struct uprobe *uprobe;
990 int match;
991
992 while (n) {
993 uprobe = rb_entry(n, struct uprobe, rb_node);
994 match = match_uprobe(&u, uprobe);
995
996 if (uprobe->inode == inode)
997 close_node = n;
998
999 if (!match)
1000 return close_node;
1001
1002 if (match < 0)
1003 n = n->rb_left;
1004 else
1005 n = n->rb_right;
1006 }
1007
1008 return close_node;
1009}
1010
1011/*
1012 * For a given inode, build a list of probes that need to be inserted.
1013 */
1014static void build_probe_list(struct inode *inode, struct list_head *head)
1015{
1016 struct uprobe *uprobe;
1017 unsigned long flags;
1018 struct rb_node *n;
1019
1020 spin_lock_irqsave(&uprobes_treelock, flags);
1021
1022 n = find_least_offset_node(inode);
1023
1024 for (; n; n = rb_next(n)) {
1025 uprobe = rb_entry(n, struct uprobe, rb_node);
1026 if (uprobe->inode != inode)
1027 break;
1028
1029 list_add(&uprobe->pending_list, head);
1030 atomic_inc(&uprobe->ref);
1031 }
1032
1033 spin_unlock_irqrestore(&uprobes_treelock, flags);
1034}
1035
1036/*
1037 * Called from mmap_region.
1038 * called with mm->mmap_sem acquired.
1039 *
1040 * Return -ve no if we fail to insert probes and we cannot
1041 * bail-out.
1042 * Return 0 otherwise. i.e:
1043 *
1044 * - successful insertion of probes
1045 * - (or) no possible probes to be inserted.
1046 * - (or) insertion of probes failed but we can bail-out.
1047 */
1048int uprobe_mmap(struct vm_area_struct *vma)
1049{
1050 struct list_head tmp_list;
1051 struct uprobe *uprobe, *u;
1052 struct inode *inode;
1053 int ret, count;
1054
1055 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
1056 return 0;
1057
1058 inode = vma->vm_file->f_mapping->host;
1059 if (!inode)
1060 return 0;
1061
1062 INIT_LIST_HEAD(&tmp_list);
1063 mutex_lock(uprobes_mmap_hash(inode));
1064 build_probe_list(inode, &tmp_list);
1065
1066 ret = 0;
1067 count = 0;
1068
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset);
1075
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe);
1078 continue;
1079 }
1080
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082
1083 /* Ignore double add: */
1084 if (ret == -EEXIST) {
1085 ret = 0;
1086
1087 if (!is_swbp_at_addr(vma->vm_mm, vaddr))
1088 continue;
1089
1090 /*
1091 * Unable to insert a breakpoint, but
1092 * breakpoint lies underneath. Increment the
1093 * probe count.
1094 */
1095 atomic_inc(&vma->vm_mm->uprobes_state.count);
1096 }
1097
1098 if (!ret)
1099 count++;
1100 }
1101 put_uprobe(uprobe);
1102 }
1103
1104 mutex_unlock(uprobes_mmap_hash(inode));
1105
1106 if (ret)
1107 atomic_sub(count, &vma->vm_mm->uprobes_state.count);
1108
1109 return ret;
1110}
1111
1112/*
1113 * Called in context of a munmap of a vma.
1114 */
1115void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1116{
1117 struct list_head tmp_list;
1118 struct uprobe *uprobe, *u;
1119 struct inode *inode;
1120
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1122 return;
1123
1124 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1125 return;
1126
1127 inode = vma->vm_file->f_mapping->host;
1128 if (!inode)
1129 return;
1130
1131 INIT_LIST_HEAD(&tmp_list);
1132 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list);
1134
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1136 loff_t vaddr;
1137
1138 list_del(&uprobe->pending_list);
1139 vaddr = vma_address(vma, uprobe->offset);
1140
1141 if (vaddr >= start && vaddr < end) {
1142 /*
1143 * An unregister could have removed the probe before
1144 * unmap. So check before we decrement the count.
1145 */
1146 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1147 atomic_dec(&vma->vm_mm->uprobes_state.count);
1148 }
1149 put_uprobe(uprobe);
1150 }
1151 mutex_unlock(uprobes_mmap_hash(inode));
1152}
1153
1154/* Slot allocation for XOL */
1155static int xol_add_vma(struct xol_area *area)
1156{
1157 struct mm_struct *mm;
1158 int ret;
1159
1160 area->page = alloc_page(GFP_HIGHUSER);
1161 if (!area->page)
1162 return -ENOMEM;
1163
1164 ret = -EALREADY;
1165 mm = current->mm;
1166
1167 down_write(&mm->mmap_sem);
1168 if (mm->uprobes_state.xol_area)
1169 goto fail;
1170
1171 ret = -ENOMEM;
1172
1173 /* Try to map as high as possible, this is only a hint. */
1174 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1175 if (area->vaddr & ~PAGE_MASK) {
1176 ret = area->vaddr;
1177 goto fail;
1178 }
1179
1180 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1181 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1182 if (ret)
1183 goto fail;
1184
1185 smp_wmb(); /* pairs with get_xol_area() */
1186 mm->uprobes_state.xol_area = area;
1187 ret = 0;
1188
1189fail:
1190 up_write(&mm->mmap_sem);
1191 if (ret)
1192 __free_page(area->page);
1193
1194 return ret;
1195}
1196
1197static struct xol_area *get_xol_area(struct mm_struct *mm)
1198{
1199 struct xol_area *area;
1200
1201 area = mm->uprobes_state.xol_area;
1202 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1203
1204 return area;
1205}
1206
1207/*
1208 * xol_alloc_area - Allocate process's xol_area.
1209 * This area will be used for storing instructions for execution out of
1210 * line.
1211 *
1212 * Returns the allocated area or NULL.
1213 */
1214static struct xol_area *xol_alloc_area(void)
1215{
1216 struct xol_area *area;
1217
1218 area = kzalloc(sizeof(*area), GFP_KERNEL);
1219 if (unlikely(!area))
1220 return NULL;
1221
1222 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1223
1224 if (!area->bitmap)
1225 goto fail;
1226
1227 init_waitqueue_head(&area->wq);
1228 if (!xol_add_vma(area))
1229 return area;
1230
1231fail:
1232 kfree(area->bitmap);
1233 kfree(area);
1234
1235 return get_xol_area(current->mm);
1236}
1237
1238/*
1239 * uprobe_clear_state - Free the area allocated for slots.
1240 */
1241void uprobe_clear_state(struct mm_struct *mm)
1242{
1243 struct xol_area *area = mm->uprobes_state.xol_area;
1244
1245 if (!area)
1246 return;
1247
1248 put_page(area->page);
1249 kfree(area->bitmap);
1250 kfree(area);
1251}
1252
1253/*
1254 * uprobe_reset_state - Free the area allocated for slots.
1255 */
1256void uprobe_reset_state(struct mm_struct *mm)
1257{
1258 mm->uprobes_state.xol_area = NULL;
1259 atomic_set(&mm->uprobes_state.count, 0);
1260}
1261
1262/*
1263 * - search for a free slot.
1264 */
1265static unsigned long xol_take_insn_slot(struct xol_area *area)
1266{
1267 unsigned long slot_addr;
1268 int slot_nr;
1269
1270 do {
1271 slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1272 if (slot_nr < UINSNS_PER_PAGE) {
1273 if (!test_and_set_bit(slot_nr, area->bitmap))
1274 break;
1275
1276 slot_nr = UINSNS_PER_PAGE;
1277 continue;
1278 }
1279 wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1280 } while (slot_nr >= UINSNS_PER_PAGE);
1281
1282 slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1283 atomic_inc(&area->slot_count);
1284
1285 return slot_addr;
1286}
1287
1288/*
1289 * xol_get_insn_slot - If was not allocated a slot, then
1290 * allocate a slot.
1291 * Returns the allocated slot address or 0.
1292 */
1293static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
1294{
1295 struct xol_area *area;
1296 unsigned long offset;
1297 void *vaddr;
1298
1299 area = get_xol_area(current->mm);
1300 if (!area) {
1301 area = xol_alloc_area();
1302 if (!area)
1303 return 0;
1304 }
1305 current->utask->xol_vaddr = xol_take_insn_slot(area);
1306
1307 /*
1308 * Initialize the slot if xol_vaddr points to valid
1309 * instruction slot.
1310 */
1311 if (unlikely(!current->utask->xol_vaddr))
1312 return 0;
1313
1314 current->utask->vaddr = slot_addr;
1315 offset = current->utask->xol_vaddr & ~PAGE_MASK;
1316 vaddr = kmap_atomic(area->page);
1317 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1318 kunmap_atomic(vaddr);
1319
1320 return current->utask->xol_vaddr;
1321}
1322
1323/*
1324 * xol_free_insn_slot - If slot was earlier allocated by
1325 * @xol_get_insn_slot(), make the slot available for
1326 * subsequent requests.
1327 */
1328static void xol_free_insn_slot(struct task_struct *tsk)
1329{
1330 struct xol_area *area;
1331 unsigned long vma_end;
1332 unsigned long slot_addr;
1333
1334 if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1335 return;
1336
1337 slot_addr = tsk->utask->xol_vaddr;
1338
1339 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1340 return;
1341
1342 area = tsk->mm->uprobes_state.xol_area;
1343 vma_end = area->vaddr + PAGE_SIZE;
1344 if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1345 unsigned long offset;
1346 int slot_nr;
1347
1348 offset = slot_addr - area->vaddr;
1349 slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1350 if (slot_nr >= UINSNS_PER_PAGE)
1351 return;
1352
1353 clear_bit(slot_nr, area->bitmap);
1354 atomic_dec(&area->slot_count);
1355 if (waitqueue_active(&area->wq))
1356 wake_up(&area->wq);
1357
1358 tsk->utask->xol_vaddr = 0;
1359 }
1360}
1361
1362/**
1363 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1364 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1365 * instruction.
1366 * Return the address of the breakpoint instruction.
1367 */
1368unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1369{
1370 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1371}
1372
1373/*
1374 * Called with no locks held.
1375 * Called in context of a exiting or a exec-ing thread.
1376 */
1377void uprobe_free_utask(struct task_struct *t)
1378{
1379 struct uprobe_task *utask = t->utask;
1380
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask)
1385 return;
1386
1387 if (utask->active_uprobe)
1388 put_uprobe(utask->active_uprobe);
1389
1390 xol_free_insn_slot(t);
1391 kfree(utask);
1392 t->utask = NULL;
1393}
1394
1395/*
1396 * Called in context of a new clone/fork from copy_process.
1397 */
1398void uprobe_copy_process(struct task_struct *t)
1399{
1400 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402}
1403
1404/*
1405 * Allocate a uprobe_task object for the task.
1406 * Called when the thread hits a breakpoint for the first time.
1407 *
1408 * Returns:
1409 * - pointer to new uprobe_task on success
1410 * - NULL otherwise
1411 */
1412static struct uprobe_task *add_utask(void)
1413{
1414 struct uprobe_task *utask;
1415
1416 utask = kzalloc(sizeof *utask, GFP_KERNEL);
1417 if (unlikely(!utask))
1418 return NULL;
1419
1420 utask->active_uprobe = NULL;
1421 current->utask = utask;
1422 return utask;
1423}
1424
1425/* Prepare to single-step probed instruction out of line. */
1426static int
1427pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
1428{
1429 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
1430 return 0;
1431
1432 return -EFAULT;
1433}
1434
1435/*
1436 * If we are singlestepping, then ensure this thread is not connected to
1437 * non-fatal signals until completion of singlestep. When xol insn itself
1438 * triggers the signal, restart the original insn even if the task is
1439 * already SIGKILL'ed (since coredump should report the correct ip). This
1440 * is even more important if the task has a handler for SIGSEGV/etc, The
1441 * _same_ instruction should be repeated again after return from the signal
1442 * handler, and SSTEP can never finish in this case.
1443 */
1444bool uprobe_deny_signal(void)
1445{
1446 struct task_struct *t = current;
1447 struct uprobe_task *utask = t->utask;
1448
1449 if (likely(!utask || !utask->active_uprobe))
1450 return false;
1451
1452 WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1453
1454 if (signal_pending(t)) {
1455 spin_lock_irq(&t->sighand->siglock);
1456 clear_tsk_thread_flag(t, TIF_SIGPENDING);
1457 spin_unlock_irq(&t->sighand->siglock);
1458
1459 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1460 utask->state = UTASK_SSTEP_TRAPPED;
1461 set_tsk_thread_flag(t, TIF_UPROBE);
1462 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1463 }
1464 }
1465
1466 return true;
1467}
1468
1469/*
1470 * Avoid singlestepping the original instruction if the original instruction
1471 * is a NOP or can be emulated.
1472 */
1473static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1474{
1475 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1476 return true;
1477
1478 uprobe->flags &= ~UPROBE_SKIP_SSTEP;
1479 return false;
1480}
1481
1482/*
1483 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */
1486static void handle_swbp(struct pt_regs *regs)
1487{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask;
1490 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr;
1493
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm;
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513
1514 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */
1516 send_sig(SIGTRAP, current, 0);
1517 return;
1518 }
1519
1520 utask = current->utask;
1521 if (!utask) {
1522 utask = add_utask();
1523 /* Cannot allocate; re-execute the instruction. */
1524 if (!utask)
1525 goto cleanup_ret;
1526 }
1527 utask->active_uprobe = uprobe;
1528 handler_chain(uprobe, regs);
1529 if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
1530 goto cleanup_ret;
1531
1532 utask->state = UTASK_SSTEP;
1533 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1534 user_enable_single_step(current);
1535 return;
1536 }
1537
1538cleanup_ret:
1539 if (utask) {
1540 utask->active_uprobe = NULL;
1541 utask->state = UTASK_RUNNING;
1542 }
1543 if (uprobe) {
1544 if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
1545
1546 /*
1547 * cannot singlestep; cannot skip instruction;
1548 * re-execute the instruction.
1549 */
1550 instruction_pointer_set(regs, bp_vaddr);
1551
1552 put_uprobe(uprobe);
1553 }
1554}
1555
1556/*
1557 * Perform required fix-ups and disable singlestep.
1558 * Allow pending signals to take effect.
1559 */
1560static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1561{
1562 struct uprobe *uprobe;
1563
1564 uprobe = utask->active_uprobe;
1565 if (utask->state == UTASK_SSTEP_ACK)
1566 arch_uprobe_post_xol(&uprobe->arch, regs);
1567 else if (utask->state == UTASK_SSTEP_TRAPPED)
1568 arch_uprobe_abort_xol(&uprobe->arch, regs);
1569 else
1570 WARN_ON_ONCE(1);
1571
1572 put_uprobe(uprobe);
1573 utask->active_uprobe = NULL;
1574 utask->state = UTASK_RUNNING;
1575 user_disable_single_step(current);
1576 xol_free_insn_slot(current);
1577
1578 spin_lock_irq(&current->sighand->siglock);
1579 recalc_sigpending(); /* see uprobe_deny_signal() */
1580 spin_unlock_irq(&current->sighand->siglock);
1581}
1582
1583/*
1584 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on
1585 * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
1586 * allows the thread to return from interrupt.
1587 *
1588 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
1589 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
1590 * interrupt.
1591 *
1592 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1593 * uprobe_notify_resume().
1594 */
1595void uprobe_notify_resume(struct pt_regs *regs)
1596{
1597 struct uprobe_task *utask;
1598
1599 utask = current->utask;
1600 if (!utask || utask->state == UTASK_BP_HIT)
1601 handle_swbp(regs);
1602 else
1603 handle_singlestep(utask, regs);
1604}
1605
1606/*
1607 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
1608 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
1609 */
1610int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1611{
1612 struct uprobe_task *utask;
1613
1614 if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
1615 /* task is currently not uprobed */
1616 return 0;
1617
1618 utask = current->utask;
1619 if (utask)
1620 utask->state = UTASK_BP_HIT;
1621
1622 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624
1625 return 1;
1626}
1627
1628/*
1629 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
1630 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
1631 */
1632int uprobe_post_sstep_notifier(struct pt_regs *regs)
1633{
1634 struct uprobe_task *utask = current->utask;
1635
1636 if (!current->mm || !utask || !utask->active_uprobe)
1637 /* task is currently not uprobed */
1638 return 0;
1639
1640 utask->state = UTASK_SSTEP_ACK;
1641 set_thread_flag(TIF_UPROBE);
1642 return 1;
1643}
1644
1645static struct notifier_block uprobe_exception_nb = {
1646 .notifier_call = arch_uprobe_exception_notify,
1647 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
1648};
1649
1650static int __init init_uprobes(void)
1651{
1652 int i;
1653
1654 for (i = 0; i < UPROBES_HASH_SZ; i++) {
1655 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]);
1657 }
1658 init_srcu_struct(&uprobes_srcu);
1659
1660 return register_die_notifier(&uprobe_exception_nb);
1661}
1662module_init(init_uprobes);
1663
1664static void __exit exit_uprobes(void)
1665{
1666}
1667module_exit(exit_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..ca9a3845ef3e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -67,6 +67,7 @@
67#include <linux/oom.h> 67#include <linux/oom.h>
68#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
69#include <linux/signalfd.h> 69#include <linux/signalfd.h>
70#include <linux/uprobes.h>
70 71
71#include <asm/pgtable.h> 72#include <asm/pgtable.h>
72#include <asm/pgalloc.h> 73#include <asm/pgalloc.h>
@@ -421,6 +422,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
421 422
422 if (retval) 423 if (retval)
423 goto out; 424 goto out;
425
426 if (file && uprobe_mmap(tmp))
427 goto out;
424 } 428 }
425 /* a new mm has just been created */ 429 /* a new mm has just been created */
426 arch_dup_mmap(oldmm, mm); 430 arch_dup_mmap(oldmm, mm);
@@ -569,6 +573,7 @@ void mmput(struct mm_struct *mm)
569 might_sleep(); 573 might_sleep();
570 574
571 if (atomic_dec_and_test(&mm->mm_users)) { 575 if (atomic_dec_and_test(&mm->mm_users)) {
576 uprobe_clear_state(mm);
572 exit_aio(mm); 577 exit_aio(mm);
573 ksm_exit(mm); 578 ksm_exit(mm);
574 khugepaged_exit(mm); /* must run before exit_mmap */ 579 khugepaged_exit(mm); /* must run before exit_mmap */
@@ -747,6 +752,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
747 exit_pi_state_list(tsk); 752 exit_pi_state_list(tsk);
748#endif 753#endif
749 754
755 uprobe_free_utask(tsk);
756
750 /* Get rid of any cached register state */ 757 /* Get rid of any cached register state */
751 deactivate_mm(tsk, mm); 758 deactivate_mm(tsk, mm);
752 759
@@ -801,6 +808,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
801#ifdef CONFIG_TRANSPARENT_HUGEPAGE 808#ifdef CONFIG_TRANSPARENT_HUGEPAGE
802 mm->pmd_huge_pte = NULL; 809 mm->pmd_huge_pte = NULL;
803#endif 810#endif
811 uprobe_reset_state(mm);
804 812
805 if (!mm_init(mm, tsk)) 813 if (!mm_init(mm, tsk))
806 goto fail_nomem; 814 goto fail_nomem;
@@ -1342,6 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1342 INIT_LIST_HEAD(&p->pi_state_list); 1350 INIT_LIST_HEAD(&p->pi_state_list);
1343 p->pi_state_cache = NULL; 1351 p->pi_state_cache = NULL;
1344#endif 1352#endif
1353 uprobe_copy_process(p);
1345 /* 1354 /*
1346 * sigaltstack should be cleared when sharing the same VM 1355 * sigaltstack should be cleared when sharing the same VM
1347 */ 1356 */
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..60d80ab2601c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -29,6 +29,7 @@
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h> 31#include <linux/user_namespace.h>
32#include <linux/uprobes.h>
32#define CREATE_TRACE_POINTS 33#define CREATE_TRACE_POINTS
33#include <trace/events/signal.h> 34#include <trace/events/signal.h>
34 35
@@ -2202,6 +2203,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2202 struct signal_struct *signal = current->signal; 2203 struct signal_struct *signal = current->signal;
2203 int signr; 2204 int signr;
2204 2205
2206 if (unlikely(uprobe_deny_signal()))
2207 return 0;
2208
2205relock: 2209relock:
2206 /* 2210 /*
2207 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2211 * We'll jump back here after any time we were stopped in TASK_STOPPED.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..ea4bff6295fc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -373,6 +373,7 @@ config KPROBE_EVENT
373 depends on HAVE_REGS_AND_STACK_ACCESS_API 373 depends on HAVE_REGS_AND_STACK_ACCESS_API
374 bool "Enable kprobes-based dynamic events" 374 bool "Enable kprobes-based dynamic events"
375 select TRACING 375 select TRACING
376 select PROBE_EVENTS
376 default y 377 default y
377 help 378 help
378 This allows the user to add tracing events (similar to tracepoints) 379 This allows the user to add tracing events (similar to tracepoints)
@@ -385,6 +386,25 @@ config KPROBE_EVENT
385 This option is also required by perf-probe subcommand of perf tools. 386 This option is also required by perf-probe subcommand of perf tools.
386 If you want to use perf tools, this option is strongly recommended. 387 If you want to use perf tools, this option is strongly recommended.
387 388
389config UPROBE_EVENT
390 bool "Enable uprobes-based dynamic events"
391 depends on ARCH_SUPPORTS_UPROBES
392 depends on MMU
393 select UPROBES
394 select PROBE_EVENTS
395 select TRACING
396 default n
397 help
398 This allows the user to add tracing events on top of userspace
399 dynamic events (similar to tracepoints) on the fly via the trace
400 events interface. Those events can be inserted wherever uprobes
401 can probe, and record various registers.
402 This option is required if you plan to use perf-probe subcommand
403 of perf tools on user space applications.
404
405config PROBE_EVENTS
406 def_bool n
407
388config DYNAMIC_FTRACE 408config DYNAMIC_FTRACE
389 bool "enable/disable ftrace tracepoints dynamically" 409 bool "enable/disable ftrace tracepoints dynamically"
390 depends on FUNCTION_TRACER 410 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..1734c03e048b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -61,5 +61,7 @@ endif
61ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
63endif 63endif
64obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
65obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 66
65libftrace-y := ftrace.o 67libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1c8b7c6f7b3b..a7d28e033a96 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 103 unsigned long ret_ip;
104}; 104};
105 105
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
106/* 111/*
107 * trace_flag_type is an enumeration that holds different 112 * trace_flag_type is an enumeration that holds different
108 * states when a trace occurs. These are: 113 * states when a trace occurs. These are:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 580a05ec926b..b31d3d5699fe 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,547 +19,15 @@
19 19
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <asm/bitsperlong.h>
35
36#include "trace.h"
37#include "trace_output.h"
38
39#define MAX_TRACE_ARGS 128
40#define MAX_ARGSTR_LEN 63
41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
43#define KPROBE_EVENT_SYSTEM "kprobes"
44
45/* Reserved field names */
46#define FIELD_STRING_IP "__probe_ip"
47#define FIELD_STRING_RETIP "__probe_ret_ip"
48#define FIELD_STRING_FUNC "__probe_func"
49
50const char *reserved_field_names[] = {
51 "common_type",
52 "common_flags",
53 "common_preempt_count",
54 "common_pid",
55 "common_tgid",
56 FIELD_STRING_IP,
57 FIELD_STRING_RETIP,
58 FIELD_STRING_FUNC,
59};
60
61/* Printing function type */
62typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
63 void *);
64#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
65#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
66
67/* Printing in basic type function template */
68#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
69static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
70 const char *name, \
71 void *data, void *ent)\
72{ \
73 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
74} \
75static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
76
77DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
78DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
82DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
85
86/* data_rloc: data relative location, compatible with u32 */
87#define make_data_rloc(len, roffs) \
88 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
89#define get_rloc_len(dl) ((u32)(dl) >> 16)
90#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
91
92static inline void *get_rloc_data(u32 *dl)
93{
94 return (u8 *)dl + get_rloc_offs(*dl);
95}
96
97/* For data_loc conversion */
98static inline void *get_loc_data(u32 *dl, void *ent)
99{
100 return (u8 *)ent + get_rloc_offs(*dl);
101}
102
103/*
104 * Convert data_rloc to data_loc:
105 * data_rloc stores the offset from data_rloc itself, but data_loc
106 * stores the offset from event entry.
107 */
108#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
109
110/* For defining macros, define string/string_size types */
111typedef u32 string;
112typedef u32 string_size;
113
114/* Print type function for string type */
115static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
116 const char *name,
117 void *data, void *ent)
118{
119 int len = *(u32 *)data >> 16;
120
121 if (!len)
122 return trace_seq_printf(s, " %s=(fault)", name);
123 else
124 return trace_seq_printf(s, " %s=\"%s\"", name,
125 (const char *)get_loc_data(data, ent));
126}
127static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
128
129/* Data fetch function type */
130typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
131
132struct fetch_param {
133 fetch_func_t fn;
134 void *data;
135};
136
137static __kprobes void call_fetch(struct fetch_param *fprm,
138 struct pt_regs *regs, void *dest)
139{
140 return fprm->fn(regs, fprm->data, dest);
141}
142
143#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
144/*
145 * Define macro for basic types - we don't need to define s* types, because
146 * we have to care only about bitwidth at recording time.
147 */
148#define DEFINE_BASIC_FETCH_FUNCS(method) \
149DEFINE_FETCH_##method(u8) \
150DEFINE_FETCH_##method(u16) \
151DEFINE_FETCH_##method(u32) \
152DEFINE_FETCH_##method(u64)
153
154#define CHECK_FETCH_FUNCS(method, fn) \
155 (((FETCH_FUNC_NAME(method, u8) == fn) || \
156 (FETCH_FUNC_NAME(method, u16) == fn) || \
157 (FETCH_FUNC_NAME(method, u32) == fn) || \
158 (FETCH_FUNC_NAME(method, u64) == fn) || \
159 (FETCH_FUNC_NAME(method, string) == fn) || \
160 (FETCH_FUNC_NAME(method, string_size) == fn)) \
161 && (fn != NULL))
162
163/* Data fetch function templates */
164#define DEFINE_FETCH_reg(type) \
165static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
166 void *offset, void *dest) \
167{ \
168 *(type *)dest = (type)regs_get_register(regs, \
169 (unsigned int)((unsigned long)offset)); \
170}
171DEFINE_BASIC_FETCH_FUNCS(reg)
172/* No string on the register */
173#define fetch_reg_string NULL
174#define fetch_reg_string_size NULL
175
176#define DEFINE_FETCH_stack(type) \
177static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
178 void *offset, void *dest) \
179{ \
180 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
181 (unsigned int)((unsigned long)offset)); \
182}
183DEFINE_BASIC_FETCH_FUNCS(stack)
184/* No string on the stack entry */
185#define fetch_stack_string NULL
186#define fetch_stack_string_size NULL
187
188#define DEFINE_FETCH_retval(type) \
189static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
190 void *dummy, void *dest) \
191{ \
192 *(type *)dest = (type)regs_return_value(regs); \
193}
194DEFINE_BASIC_FETCH_FUNCS(retval)
195/* No string on the retval */
196#define fetch_retval_string NULL
197#define fetch_retval_string_size NULL
198
199#define DEFINE_FETCH_memory(type) \
200static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
201 void *addr, void *dest) \
202{ \
203 type retval; \
204 if (probe_kernel_address(addr, retval)) \
205 *(type *)dest = 0; \
206 else \
207 *(type *)dest = retval; \
208}
209DEFINE_BASIC_FETCH_FUNCS(memory)
210/*
211 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
212 * length and relative data location.
213 */
214static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
215 void *addr, void *dest)
216{
217 long ret;
218 int maxlen = get_rloc_len(*(u32 *)dest);
219 u8 *dst = get_rloc_data(dest);
220 u8 *src = addr;
221 mm_segment_t old_fs = get_fs();
222 if (!maxlen)
223 return;
224 /*
225 * Try to get string again, since the string can be changed while
226 * probing.
227 */
228 set_fs(KERNEL_DS);
229 pagefault_disable();
230 do
231 ret = __copy_from_user_inatomic(dst++, src++, 1);
232 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
233 dst[-1] = '\0';
234 pagefault_enable();
235 set_fs(old_fs);
236
237 if (ret < 0) { /* Failed to fetch string */
238 ((u8 *)get_rloc_data(dest))[0] = '\0';
239 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
240 } else
241 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
242 get_rloc_offs(*(u32 *)dest));
243}
244/* Return the length of string -- including null terminal byte */
245static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
246 void *addr, void *dest)
247{
248 int ret, len = 0;
249 u8 c;
250 mm_segment_t old_fs = get_fs();
251
252 set_fs(KERNEL_DS);
253 pagefault_disable();
254 do {
255 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
256 len++;
257 } while (c && ret == 0 && len < MAX_STRING_SIZE);
258 pagefault_enable();
259 set_fs(old_fs);
260
261 if (ret < 0) /* Failed to check the length */
262 *(u32 *)dest = 0;
263 else
264 *(u32 *)dest = len;
265}
266
267/* Memory fetching by symbol */
268struct symbol_cache {
269 char *symbol;
270 long offset;
271 unsigned long addr;
272};
273
274static unsigned long update_symbol_cache(struct symbol_cache *sc)
275{
276 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
277 if (sc->addr)
278 sc->addr += sc->offset;
279 return sc->addr;
280}
281
282static void free_symbol_cache(struct symbol_cache *sc)
283{
284 kfree(sc->symbol);
285 kfree(sc);
286}
287
288static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
289{
290 struct symbol_cache *sc;
291
292 if (!sym || strlen(sym) == 0)
293 return NULL;
294 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
295 if (!sc)
296 return NULL;
297
298 sc->symbol = kstrdup(sym, GFP_KERNEL);
299 if (!sc->symbol) {
300 kfree(sc);
301 return NULL;
302 }
303 sc->offset = offset;
304 22
305 update_symbol_cache(sc); 23#include "trace_probe.h"
306 return sc;
307}
308
309#define DEFINE_FETCH_symbol(type) \
310static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
311 void *data, void *dest) \
312{ \
313 struct symbol_cache *sc = data; \
314 if (sc->addr) \
315 fetch_memory_##type(regs, (void *)sc->addr, dest); \
316 else \
317 *(type *)dest = 0; \
318}
319DEFINE_BASIC_FETCH_FUNCS(symbol)
320DEFINE_FETCH_symbol(string)
321DEFINE_FETCH_symbol(string_size)
322
323/* Dereference memory access function */
324struct deref_fetch_param {
325 struct fetch_param orig;
326 long offset;
327};
328
329#define DEFINE_FETCH_deref(type) \
330static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
331 void *data, void *dest) \
332{ \
333 struct deref_fetch_param *dprm = data; \
334 unsigned long addr; \
335 call_fetch(&dprm->orig, regs, &addr); \
336 if (addr) { \
337 addr += dprm->offset; \
338 fetch_memory_##type(regs, (void *)addr, dest); \
339 } else \
340 *(type *)dest = 0; \
341}
342DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size)
345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
355{
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 free_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 free_symbol_cache(data->orig.data);
360 kfree(data);
361}
362
363/* Bitfield fetch function */
364struct bitfield_fetch_param {
365 struct fetch_param orig;
366 unsigned char hi_shift;
367 unsigned char low_shift;
368};
369 24
370#define DEFINE_FETCH_bitfield(type) \ 25#define KPROBE_EVENT_SYSTEM "kprobes"
371static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
372 void *data, void *dest) \
373{ \
374 struct bitfield_fetch_param *bprm = data; \
375 type buf = 0; \
376 call_fetch(&bprm->orig, regs, &buf); \
377 if (buf) { \
378 buf <<= bprm->hi_shift; \
379 buf >>= bprm->low_shift; \
380 } \
381 *(type *)dest = buf; \
382}
383DEFINE_BASIC_FETCH_FUNCS(bitfield)
384#define fetch_bitfield_string NULL
385#define fetch_bitfield_string_size NULL
386
387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
402{
403 /*
404 * Don't check the bitfield itself, because this must be the
405 * last fetch function.
406 */
407 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
408 free_deref_fetch_param(data->orig.data);
409 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
410 free_symbol_cache(data->orig.data);
411 kfree(data);
412}
413
414/* Default (unsigned long) fetch type */
415#define __DEFAULT_FETCH_TYPE(t) u##t
416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
417#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
418#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
419
420/* Fetch types */
421enum {
422 FETCH_MTD_reg = 0,
423 FETCH_MTD_stack,
424 FETCH_MTD_retval,
425 FETCH_MTD_memory,
426 FETCH_MTD_symbol,
427 FETCH_MTD_deref,
428 FETCH_MTD_bitfield,
429 FETCH_MTD_END,
430};
431
432#define ASSIGN_FETCH_FUNC(method, type) \
433 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
434
435#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
436 {.name = _name, \
437 .size = _size, \
438 .is_signed = sign, \
439 .print = PRINT_TYPE_FUNC_NAME(ptype), \
440 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
441 .fmttype = _fmttype, \
442 .fetch = { \
443ASSIGN_FETCH_FUNC(reg, ftype), \
444ASSIGN_FETCH_FUNC(stack, ftype), \
445ASSIGN_FETCH_FUNC(retval, ftype), \
446ASSIGN_FETCH_FUNC(memory, ftype), \
447ASSIGN_FETCH_FUNC(symbol, ftype), \
448ASSIGN_FETCH_FUNC(deref, ftype), \
449ASSIGN_FETCH_FUNC(bitfield, ftype), \
450 } \
451 }
452
453#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
454 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
455
456#define FETCH_TYPE_STRING 0
457#define FETCH_TYPE_STRSIZE 1
458
459/* Fetch type information table */
460static const struct fetch_type {
461 const char *name; /* Name of type */
462 size_t size; /* Byte size of type */
463 int is_signed; /* Signed flag */
464 print_type_func_t print; /* Print functions */
465 const char *fmt; /* Fromat string */
466 const char *fmttype; /* Name in format file */
467 /* Fetch functions */
468 fetch_func_t fetch[FETCH_MTD_END];
469} fetch_type_table[] = {
470 /* Special types */
471 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
472 sizeof(u32), 1, "__data_loc char[]"),
473 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
474 string_size, sizeof(u32), 0, "u32"),
475 /* Basic types */
476 ASSIGN_FETCH_TYPE(u8, u8, 0),
477 ASSIGN_FETCH_TYPE(u16, u16, 0),
478 ASSIGN_FETCH_TYPE(u32, u32, 0),
479 ASSIGN_FETCH_TYPE(u64, u64, 0),
480 ASSIGN_FETCH_TYPE(s8, u8, 1),
481 ASSIGN_FETCH_TYPE(s16, u16, 1),
482 ASSIGN_FETCH_TYPE(s32, u32, 1),
483 ASSIGN_FETCH_TYPE(s64, u64, 1),
484};
485
486static const struct fetch_type *find_fetch_type(const char *type)
487{
488 int i;
489
490 if (!type)
491 type = DEFAULT_FETCH_TYPE_STR;
492
493 /* Special case: bitfield */
494 if (*type == 'b') {
495 unsigned long bs;
496 type = strchr(type, '/');
497 if (!type)
498 goto fail;
499 type++;
500 if (strict_strtoul(type, 0, &bs))
501 goto fail;
502 switch (bs) {
503 case 8:
504 return find_fetch_type("u8");
505 case 16:
506 return find_fetch_type("u16");
507 case 32:
508 return find_fetch_type("u32");
509 case 64:
510 return find_fetch_type("u64");
511 default:
512 goto fail;
513 }
514 }
515
516 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
517 if (strcmp(type, fetch_type_table[i].name) == 0)
518 return &fetch_type_table[i];
519fail:
520 return NULL;
521}
522
523/* Special function : only accept unsigned long */
524static __kprobes void fetch_stack_address(struct pt_regs *regs,
525 void *dummy, void *dest)
526{
527 *(unsigned long *)dest = kernel_stack_pointer(regs);
528}
529
530static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
531 fetch_func_t orig_fn)
532{
533 int i;
534
535 if (type != &fetch_type_table[FETCH_TYPE_STRING])
536 return NULL; /* Only string type needs size function */
537 for (i = 0; i < FETCH_MTD_END; i++)
538 if (type->fetch[i] == orig_fn)
539 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
540
541 WARN_ON(1); /* This should not happen */
542 return NULL;
543}
544 26
545/** 27/**
546 * Kprobe event core functions 28 * Kprobe event core functions
547 */ 29 */
548 30
549struct probe_arg {
550 struct fetch_param fetch;
551 struct fetch_param fetch_size;
552 unsigned int offset; /* Offset from argument entry */
553 const char *name; /* Name of this argument */
554 const char *comm; /* Command of this argument */
555 const struct fetch_type *type; /* Type of this argument */
556};
557
558/* Flags for trace_probe */
559#define TP_FLAG_TRACE 1
560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
562
563struct trace_probe { 31struct trace_probe {
564 struct list_head list; 32 struct list_head list;
565 struct kretprobe rp; /* Use rp.kp for kprobe use */ 33 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
631static int kretprobe_dispatcher(struct kretprobe_instance *ri, 99static int kretprobe_dispatcher(struct kretprobe_instance *ri,
632 struct pt_regs *regs); 100 struct pt_regs *regs);
633 101
634/* Check the name is good for event/group/fields */
635static int is_good_name(const char *name)
636{
637 if (!isalpha(*name) && *name != '_')
638 return 0;
639 while (*++name != '\0') {
640 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
641 return 0;
642 }
643 return 1;
644}
645
646/* 102/*
647 * Allocate new trace_probe and initialize it (including kprobes). 103 * Allocate new trace_probe and initialize it (including kprobes).
648 */ 104 */
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
651 void *addr, 107 void *addr,
652 const char *symbol, 108 const char *symbol,
653 unsigned long offs, 109 unsigned long offs,
654 int nargs, int is_return) 110 int nargs, bool is_return)
655{ 111{
656 struct trace_probe *tp; 112 struct trace_probe *tp;
657 int ret = -ENOMEM; 113 int ret = -ENOMEM;
@@ -702,34 +158,12 @@ error:
702 return ERR_PTR(ret); 158 return ERR_PTR(ret);
703} 159}
704 160
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
715static void free_probe_arg(struct probe_arg *arg)
716{
717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
718 free_bitfield_fetch_param(arg->fetch.data);
719 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
720 free_deref_fetch_param(arg->fetch.data);
721 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
722 free_symbol_cache(arg->fetch.data);
723 kfree(arg->name);
724 kfree(arg->comm);
725}
726
727static void free_trace_probe(struct trace_probe *tp) 161static void free_trace_probe(struct trace_probe *tp)
728{ 162{
729 int i; 163 int i;
730 164
731 for (i = 0; i < tp->nr_args; i++) 165 for (i = 0; i < tp->nr_args; i++)
732 free_probe_arg(&tp->args[i]); 166 traceprobe_free_probe_arg(&tp->args[i]);
733 167
734 kfree(tp->call.class->system); 168 kfree(tp->call.class->system);
735 kfree(tp->call.name); 169 kfree(tp->call.name);
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp)
787 return -EINVAL; 221 return -EINVAL;
788 222
789 for (i = 0; i < tp->nr_args; i++) 223 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]); 224 traceprobe_update_arg(&tp->args[i]);
791 225
792 /* Set/clear disabled flag according to tp->flag */ 226 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp)) 227 if (trace_probe_is_enabled(tp))
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = {
919 .priority = 1 /* Invoked after kprobe module callback */ 353 .priority = 1 /* Invoked after kprobe module callback */
920}; 354};
921 355
922/* Split symbol and offset. */
923static int split_symbol_offset(char *symbol, unsigned long *offset)
924{
925 char *tmp;
926 int ret;
927
928 if (!offset)
929 return -EINVAL;
930
931 tmp = strchr(symbol, '+');
932 if (tmp) {
933 /* skip sign because strict_strtol doesn't accept '+' */
934 ret = strict_strtoul(tmp + 1, 0, offset);
935 if (ret)
936 return ret;
937 *tmp = '\0';
938 } else
939 *offset = 0;
940 return 0;
941}
942
943#define PARAM_MAX_ARGS 16
944#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
945
946static int parse_probe_vars(char *arg, const struct fetch_type *t,
947 struct fetch_param *f, int is_return)
948{
949 int ret = 0;
950 unsigned long param;
951
952 if (strcmp(arg, "retval") == 0) {
953 if (is_return)
954 f->fn = t->fetch[FETCH_MTD_retval];
955 else
956 ret = -EINVAL;
957 } else if (strncmp(arg, "stack", 5) == 0) {
958 if (arg[5] == '\0') {
959 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
960 f->fn = fetch_stack_address;
961 else
962 ret = -EINVAL;
963 } else if (isdigit(arg[5])) {
964 ret = strict_strtoul(arg + 5, 10, &param);
965 if (ret || param > PARAM_MAX_STACK)
966 ret = -EINVAL;
967 else {
968 f->fn = t->fetch[FETCH_MTD_stack];
969 f->data = (void *)param;
970 }
971 } else
972 ret = -EINVAL;
973 } else
974 ret = -EINVAL;
975 return ret;
976}
977
978/* Recursive argument parser */
979static int __parse_probe_arg(char *arg, const struct fetch_type *t,
980 struct fetch_param *f, int is_return)
981{
982 int ret = 0;
983 unsigned long param;
984 long offset;
985 char *tmp;
986
987 switch (arg[0]) {
988 case '$':
989 ret = parse_probe_vars(arg + 1, t, f, is_return);
990 break;
991 case '%': /* named register */
992 ret = regs_query_register_offset(arg + 1);
993 if (ret >= 0) {
994 f->fn = t->fetch[FETCH_MTD_reg];
995 f->data = (void *)(unsigned long)ret;
996 ret = 0;
997 }
998 break;
999 case '@': /* memory or symbol */
1000 if (isdigit(arg[1])) {
1001 ret = strict_strtoul(arg + 1, 0, &param);
1002 if (ret)
1003 break;
1004 f->fn = t->fetch[FETCH_MTD_memory];
1005 f->data = (void *)param;
1006 } else {
1007 ret = split_symbol_offset(arg + 1, &offset);
1008 if (ret)
1009 break;
1010 f->data = alloc_symbol_cache(arg + 1, offset);
1011 if (f->data)
1012 f->fn = t->fetch[FETCH_MTD_symbol];
1013 }
1014 break;
1015 case '+': /* deref memory */
1016 arg++; /* Skip '+', because strict_strtol() rejects it. */
1017 case '-':
1018 tmp = strchr(arg, '(');
1019 if (!tmp)
1020 break;
1021 *tmp = '\0';
1022 ret = strict_strtol(arg, 0, &offset);
1023 if (ret)
1024 break;
1025 arg = tmp + 1;
1026 tmp = strrchr(arg, ')');
1027 if (tmp) {
1028 struct deref_fetch_param *dprm;
1029 const struct fetch_type *t2 = find_fetch_type(NULL);
1030 *tmp = '\0';
1031 dprm = kzalloc(sizeof(struct deref_fetch_param),
1032 GFP_KERNEL);
1033 if (!dprm)
1034 return -ENOMEM;
1035 dprm->offset = offset;
1036 ret = __parse_probe_arg(arg, t2, &dprm->orig,
1037 is_return);
1038 if (ret)
1039 kfree(dprm);
1040 else {
1041 f->fn = t->fetch[FETCH_MTD_deref];
1042 f->data = (void *)dprm;
1043 }
1044 }
1045 break;
1046 }
1047 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
1048 pr_info("%s type has no corresponding fetch method.\n",
1049 t->name);
1050 ret = -EINVAL;
1051 }
1052 return ret;
1053}
1054
1055#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
1056
1057/* Bitfield type needs to be parsed into a fetch function */
1058static int __parse_bitfield_probe_arg(const char *bf,
1059 const struct fetch_type *t,
1060 struct fetch_param *f)
1061{
1062 struct bitfield_fetch_param *bprm;
1063 unsigned long bw, bo;
1064 char *tail;
1065
1066 if (*bf != 'b')
1067 return 0;
1068
1069 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1070 if (!bprm)
1071 return -ENOMEM;
1072 bprm->orig = *f;
1073 f->fn = t->fetch[FETCH_MTD_bitfield];
1074 f->data = (void *)bprm;
1075
1076 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
1077 if (bw == 0 || *tail != '@')
1078 return -EINVAL;
1079
1080 bf = tail + 1;
1081 bo = simple_strtoul(bf, &tail, 0);
1082 if (tail == bf || *tail != '/')
1083 return -EINVAL;
1084
1085 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
1086 bprm->low_shift = bprm->hi_shift + bo;
1087 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
1088}
1089
1090/* String length checking wrapper */
1091static int parse_probe_arg(char *arg, struct trace_probe *tp,
1092 struct probe_arg *parg, int is_return)
1093{
1094 const char *t;
1095 int ret;
1096
1097 if (strlen(arg) > MAX_ARGSTR_LEN) {
1098 pr_info("Argument is too long.: %s\n", arg);
1099 return -ENOSPC;
1100 }
1101 parg->comm = kstrdup(arg, GFP_KERNEL);
1102 if (!parg->comm) {
1103 pr_info("Failed to allocate memory for command '%s'.\n", arg);
1104 return -ENOMEM;
1105 }
1106 t = strchr(parg->comm, ':');
1107 if (t) {
1108 arg[t - parg->comm] = '\0';
1109 t++;
1110 }
1111 parg->type = find_fetch_type(t);
1112 if (!parg->type) {
1113 pr_info("Unsupported type: %s\n", t);
1114 return -EINVAL;
1115 }
1116 parg->offset = tp->size;
1117 tp->size += parg->type->size;
1118 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
1119 if (ret >= 0 && t != NULL)
1120 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
1121 if (ret >= 0) {
1122 parg->fetch_size.fn = get_fetch_size_function(parg->type,
1123 parg->fetch.fn);
1124 parg->fetch_size.data = parg->fetch.data;
1125 }
1126 return ret;
1127}
1128
1129/* Return 1 if name is reserved or already used by another argument */
1130static int conflict_field_name(const char *name,
1131 struct probe_arg *args, int narg)
1132{
1133 int i;
1134 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
1135 if (strcmp(reserved_field_names[i], name) == 0)
1136 return 1;
1137 for (i = 0; i < narg; i++)
1138 if (strcmp(args[i].name, name) == 0)
1139 return 1;
1140 return 0;
1141}
1142
1143static int create_trace_probe(int argc, char **argv) 356static int create_trace_probe(int argc, char **argv)
1144{ 357{
1145 /* 358 /*
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv)
1162 */ 375 */
1163 struct trace_probe *tp; 376 struct trace_probe *tp;
1164 int i, ret = 0; 377 int i, ret = 0;
1165 int is_return = 0, is_delete = 0; 378 bool is_return = false, is_delete = false;
1166 char *symbol = NULL, *event = NULL, *group = NULL; 379 char *symbol = NULL, *event = NULL, *group = NULL;
1167 char *arg; 380 char *arg;
1168 unsigned long offset = 0; 381 unsigned long offset = 0;
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv)
1171 384
1172 /* argc must be >= 1 */ 385 /* argc must be >= 1 */
1173 if (argv[0][0] == 'p') 386 if (argv[0][0] == 'p')
1174 is_return = 0; 387 is_return = false;
1175 else if (argv[0][0] == 'r') 388 else if (argv[0][0] == 'r')
1176 is_return = 1; 389 is_return = true;
1177 else if (argv[0][0] == '-') 390 else if (argv[0][0] == '-')
1178 is_delete = 1; 391 is_delete = true;
1179 else { 392 else {
1180 pr_info("Probe definition must be started with 'p', 'r' or" 393 pr_info("Probe definition must be started with 'p', 'r' or"
1181 " '-'.\n"); 394 " '-'.\n");
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv)
1240 /* a symbol specified */ 453 /* a symbol specified */
1241 symbol = argv[1]; 454 symbol = argv[1];
1242 /* TODO: support .init module functions */ 455 /* TODO: support .init module functions */
1243 ret = split_symbol_offset(symbol, &offset); 456 ret = traceprobe_split_symbol_offset(symbol, &offset);
1244 if (ret) { 457 if (ret) {
1245 pr_info("Failed to parse symbol.\n"); 458 pr_info("Failed to parse symbol.\n");
1246 return ret; 459 return ret;
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv)
1302 goto error; 515 goto error;
1303 } 516 }
1304 517
1305 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 518 if (traceprobe_conflict_field_name(tp->args[i].name,
519 tp->args, i)) {
1306 pr_info("Argument[%d] name '%s' conflicts with " 520 pr_info("Argument[%d] name '%s' conflicts with "
1307 "another field.\n", i, argv[i]); 521 "another field.\n", i, argv[i]);
1308 ret = -EINVAL; 522 ret = -EINVAL;
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv)
1310 } 524 }
1311 525
1312 /* Parse fetch argument */ 526 /* Parse fetch argument */
1313 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 527 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i],
528 is_return, true);
1314 if (ret) { 529 if (ret) {
1315 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 530 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
1316 goto error; 531 goto error;
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file)
1412 return seq_open(file, &probes_seq_op); 627 return seq_open(file, &probes_seq_op);
1413} 628}
1414 629
1415static int command_trace_probe(const char *buf)
1416{
1417 char **argv;
1418 int argc = 0, ret = 0;
1419
1420 argv = argv_split(GFP_KERNEL, buf, &argc);
1421 if (!argv)
1422 return -ENOMEM;
1423
1424 if (argc)
1425 ret = create_trace_probe(argc, argv);
1426
1427 argv_free(argv);
1428 return ret;
1429}
1430
1431#define WRITE_BUFSIZE 4096
1432
1433static ssize_t probes_write(struct file *file, const char __user *buffer, 630static ssize_t probes_write(struct file *file, const char __user *buffer,
1434 size_t count, loff_t *ppos) 631 size_t count, loff_t *ppos)
1435{ 632{
1436 char *kbuf, *tmp; 633 return traceprobe_probes_write(file, buffer, count, ppos,
1437 int ret; 634 create_trace_probe);
1438 size_t done;
1439 size_t size;
1440
1441 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
1442 if (!kbuf)
1443 return -ENOMEM;
1444
1445 ret = done = 0;
1446 while (done < count) {
1447 size = count - done;
1448 if (size >= WRITE_BUFSIZE)
1449 size = WRITE_BUFSIZE - 1;
1450 if (copy_from_user(kbuf, buffer + done, size)) {
1451 ret = -EFAULT;
1452 goto out;
1453 }
1454 kbuf[size] = '\0';
1455 tmp = strchr(kbuf, '\n');
1456 if (tmp) {
1457 *tmp = '\0';
1458 size = tmp - kbuf + 1;
1459 } else if (done + size < count) {
1460 pr_warning("Line length is too long: "
1461 "Should be less than %d.", WRITE_BUFSIZE);
1462 ret = -EINVAL;
1463 goto out;
1464 }
1465 done += size;
1466 /* Remove comments */
1467 tmp = strchr(kbuf, '#');
1468 if (tmp)
1469 *tmp = '\0';
1470
1471 ret = command_trace_probe(kbuf);
1472 if (ret)
1473 goto out;
1474 }
1475 ret = done;
1476out:
1477 kfree(kbuf);
1478 return ret;
1479} 635}
1480 636
1481static const struct file_operations kprobe_events_ops = { 637static const struct file_operations kprobe_events_ops = {
@@ -1711,16 +867,6 @@ partial:
1711 return TRACE_TYPE_PARTIAL_LINE; 867 return TRACE_TYPE_PARTIAL_LINE;
1712} 868}
1713 869
1714#undef DEFINE_FIELD
1715#define DEFINE_FIELD(type, item, name, is_signed) \
1716 do { \
1717 ret = trace_define_field(event_call, #type, name, \
1718 offsetof(typeof(field), item), \
1719 sizeof(field.item), is_signed, \
1720 FILTER_OTHER); \
1721 if (ret) \
1722 return ret; \
1723 } while (0)
1724 870
1725static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 871static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1726{ 872{
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void)
2051 1197
2052 pr_info("Testing kprobe tracing: "); 1198 pr_info("Testing kprobe tracing: ");
2053 1199
2054 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1200 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
2055 "$stack $stack0 +0($stack)"); 1201 "$stack $stack0 +0($stack)",
1202 create_trace_probe);
2056 if (WARN_ON_ONCE(ret)) { 1203 if (WARN_ON_ONCE(ret)) {
2057 pr_warning("error on probing function entry.\n"); 1204 pr_warning("error on probing function entry.\n");
2058 warn++; 1205 warn++;
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void)
2066 enable_trace_probe(tp, TP_FLAG_TRACE); 1213 enable_trace_probe(tp, TP_FLAG_TRACE);
2067 } 1214 }
2068 1215
2069 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1216 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
2070 "$retval"); 1217 "$retval", create_trace_probe);
2071 if (WARN_ON_ONCE(ret)) { 1218 if (WARN_ON_ONCE(ret)) {
2072 pr_warning("error on probing function return.\n"); 1219 pr_warning("error on probing function return.\n");
2073 warn++; 1220 warn++;
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void)
2101 } else 1248 } else
2102 disable_trace_probe(tp, TP_FLAG_TRACE); 1249 disable_trace_probe(tp, TP_FLAG_TRACE);
2103 1250
2104 ret = command_trace_probe("-:testprobe"); 1251 ret = traceprobe_command("-:testprobe", create_trace_probe);
2105 if (WARN_ON_ONCE(ret)) { 1252 if (WARN_ON_ONCE(ret)) {
2106 pr_warning("error on deleting a probe.\n"); 1253 pr_warning("error on deleting a probe.\n");
2107 warn++; 1254 warn++;
2108 } 1255 }
2109 1256
2110 ret = command_trace_probe("-:testprobe2"); 1257 ret = traceprobe_command("-:testprobe2", create_trace_probe);
2111 if (WARN_ON_ONCE(ret)) { 1258 if (WARN_ON_ONCE(ret)) {
2112 pr_warning("error on deleting a probe.\n"); 1259 pr_warning("error on deleting a probe.\n");
2113 warn++; 1260 warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 000000000000..daa9980153af
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,839 @@
1/*
2 * Common code for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.c written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include "trace_probe.h"
26
27const char *reserved_field_names[] = {
28 "common_type",
29 "common_flags",
30 "common_preempt_count",
31 "common_pid",
32 "common_tgid",
33 FIELD_STRING_IP,
34 FIELD_STRING_RETIP,
35 FIELD_STRING_FUNC,
36};
37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \
46 void *data, void *ent)\
47{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
49} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65
66/* For data_loc conversion */
67static inline void *get_loc_data(u32 *dl, void *ent)
68{
69 return (u8 *)ent + get_rloc_offs(*dl);
70}
71
72/* For defining macros, define string/string_size types */
73typedef u32 string;
74typedef u32 string_size;
75
76/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name,
79 void *data, void *ent)
80{
81 int len = *(u32 *)data >> 16;
82
83 if (!len)
84 return trace_seq_printf(s, " %s=(fault)", name);
85 else
86 return trace_seq_printf(s, " %s=\"%s\"", name,
87 (const char *)get_loc_data(data, ent));
88}
89
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102
103#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \
105 (FETCH_FUNC_NAME(method, u16) == fn) || \
106 (FETCH_FUNC_NAME(method, u32) == fn) || \
107 (FETCH_FUNC_NAME(method, u64) == fn) || \
108 (FETCH_FUNC_NAME(method, string) == fn) || \
109 (FETCH_FUNC_NAME(method, string_size) == fn)) \
110 && (fn != NULL))
111
112/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \
116{ \
117 *(type *)dest = (type)regs_get_register(regs, \
118 (unsigned int)((unsigned long)offset)); \
119}
120DEFINE_BASIC_FETCH_FUNCS(reg)
121/* No string on the register */
122#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL
124
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
139 void *dummy, void *dest) \
140{ \
141 *(type *)dest = (type)regs_return_value(regs); \
142}
143DEFINE_BASIC_FETCH_FUNCS(retval)
144/* No string on the retval */
145#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL
147
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to get string again, since the string can be changed while
177 * probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of string -- including null terminal byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
/* Memory fetching by symbol */
struct symbol_cache {
	char		*symbol;	/* symbol name (kstrdup'ed copy) */
	long		offset;		/* offset added to the resolved address */
	unsigned long	addr;		/* resolved address + offset, 0 if unresolved */
};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
/* Release a symbol cache together with its duplicated symbol name */
static void free_symbol_cache(struct symbol_cache *sc)
{
	kfree(sc->symbol);
	kfree(sc);
}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */
285struct deref_fetch_param {
286 struct fetch_param orig;
287 long offset;
288};
289
290#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
292 void *data, void *dest) \
293{ \
294 struct deref_fetch_param *dprm = data; \
295 unsigned long addr; \
296 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \
298 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \
300 } else \
301 *(type *)dest = 0; \
302}
303DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size)
306
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{
309 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
310 update_deref_fetch_param(data->orig.data);
311 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
312 update_symbol_cache(data->orig.data);
313}
314
315static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
316{
317 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
318 free_deref_fetch_param(data->orig.data);
319 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
320 free_symbol_cache(data->orig.data);
321 kfree(data);
322}
323
324/* Bitfield fetch function */
325struct bitfield_fetch_param {
326 struct fetch_param orig;
327 unsigned char hi_shift;
328 unsigned char low_shift;
329};
330
331#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
333 void *data, void *dest) \
334{ \
335 struct bitfield_fetch_param *bprm = data; \
336 type buf = 0; \
337 call_fetch(&bprm->orig, regs, &buf); \
338 if (buf) { \
339 buf <<= bprm->hi_shift; \
340 buf >>= bprm->low_shift; \
341 } \
342 *(type *)dest = buf; \
343}
344
345DEFINE_BASIC_FETCH_FUNCS(bitfield)
346#define fetch_bitfield_string NULL
347#define fetch_bitfield_string_size NULL
348
349static __kprobes void
350update_bitfield_fetch_param(struct bitfield_fetch_param *data)
351{
352 /*
353 * Don't check the bitfield itself, because this must be the
354 * last fetch function.
355 */
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 update_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 update_symbol_cache(data->orig.data);
360}
361
362static __kprobes void
363free_bitfield_fetch_param(struct bitfield_fetch_param *data)
364{
365 /*
366 * Don't check the bitfield itself, because this must be the
367 * last fetch function.
368 */
369 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
370 free_deref_fetch_param(data->orig.data);
371 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
372 free_symbol_cache(data->orig.data);
373
374 kfree(data);
375}
376
377/* Default (unsigned long) fetch type */
378#define __DEFAULT_FETCH_TYPE(t) u##t
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{
430 int i;
431
432 if (!type)
433 type = DEFAULT_FETCH_TYPE_STR;
434
435 /* Special case: bitfield */
436 if (*type == 'b') {
437 unsigned long bs;
438
439 type = strchr(type, '/');
440 if (!type)
441 goto fail;
442
443 type++;
444 if (strict_strtoul(type, 0, &bs))
445 goto fail;
446
447 switch (bs) {
448 case 8:
449 return find_fetch_type("u8");
450 case 16:
451 return find_fetch_type("u16");
452 case 32:
453 return find_fetch_type("u32");
454 case 64:
455 return find_fetch_type("u64");
456 default:
457 goto fail;
458 }
459 }
460
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
462 if (strcmp(type, fetch_type_table[i].name) == 0)
463 return &fetch_type_table[i];
464
465fail:
466 return NULL;
467}
468
469/* Special function : only accept unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest)
472{
473 *(unsigned long *)dest = kernel_stack_pointer(regs);
474}
475
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn)
478{
479 int i;
480
481 if (type != &fetch_type_table[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */
483
484 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
487
488 WARN_ON(1); /* This should not happen */
489
490 return NULL;
491}
492
493/* Split symbol and offset. */
494int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
495{
496 char *tmp;
497 int ret;
498
499 if (!offset)
500 return -EINVAL;
501
502 tmp = strchr(symbol, '+');
503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset);
506 if (ret)
507 return ret;
508
509 *tmp = '\0';
510 } else
511 *offset = 0;
512
513 return 0;
514}
515
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517
518static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return)
520{
521 int ret = 0;
522 unsigned long param;
523
524 if (strcmp(arg, "retval") == 0) {
525 if (is_return)
526 f->fn = t->fetch[FETCH_MTD_retval];
527 else
528 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
532 f->fn = fetch_stack_address;
533 else
534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL;
539 else {
540 f->fn = t->fetch[FETCH_MTD_stack];
541 f->data = (void *)param;
542 }
543 } else
544 ret = -EINVAL;
545 } else
546 ret = -EINVAL;
547
548 return ret;
549}
550
551/* Recursive argument parser */
552static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe)
554{
555 unsigned long param;
556 long offset;
557 char *tmp;
558 int ret;
559
560 ret = 0;
561
562 /* Until uprobe_events supports only reg arguments */
563 if (!is_kprobe && arg[0] != '%')
564 return -EINVAL;
565
566 switch (arg[0]) {
567 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return);
569 break;
570
571 case '%': /* named register */
572 ret = regs_query_register_offset(arg + 1);
573 if (ret >= 0) {
574 f->fn = t->fetch[FETCH_MTD_reg];
575 f->data = (void *)(unsigned long)ret;
576 ret = 0;
577 }
578 break;
579
580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param);
583 if (ret)
584 break;
585
586 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param;
588 } else {
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret)
591 break;
592
593 f->data = alloc_symbol_cache(arg + 1, offset);
594 if (f->data)
595 f->fn = t->fetch[FETCH_MTD_symbol];
596 }
597 break;
598
599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */
601 case '-':
602 tmp = strchr(arg, '(');
603 if (!tmp)
604 break;
605
606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset);
608
609 if (ret)
610 break;
611
612 arg = tmp + 1;
613 tmp = strrchr(arg, ')');
614
615 if (tmp) {
616 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2;
618
619 t2 = find_fetch_type(NULL);
620 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622
623 if (!dprm)
624 return -ENOMEM;
625
626 dprm->offset = offset;
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe);
629 if (ret)
630 kfree(dprm);
631 else {
632 f->fn = t->fetch[FETCH_MTD_deref];
633 f->data = (void *)dprm;
634 }
635 }
636 break;
637 }
638 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
639 pr_info("%s type has no corresponding fetch method.\n", t->name);
640 ret = -EINVAL;
641 }
642
643 return ret;
644}
645
646#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
647
648/* Bitfield type needs to be parsed into a fetch function */
649static int __parse_bitfield_probe_arg(const char *bf,
650 const struct fetch_type *t,
651 struct fetch_param *f)
652{
653 struct bitfield_fetch_param *bprm;
654 unsigned long bw, bo;
655 char *tail;
656
657 if (*bf != 'b')
658 return 0;
659
660 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
661 if (!bprm)
662 return -ENOMEM;
663
664 bprm->orig = *f;
665 f->fn = t->fetch[FETCH_MTD_bitfield];
666 f->data = (void *)bprm;
667 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
668
669 if (bw == 0 || *tail != '@')
670 return -EINVAL;
671
672 bf = tail + 1;
673 bo = simple_strtoul(bf, &tail, 0);
674
675 if (tail == bf || *tail != '/')
676 return -EINVAL;
677
678 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
679 bprm->low_shift = bprm->hi_shift + bo;
680
681 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
682}
683
684/* String length checking wrapper */
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{
688 const char *t;
689 int ret;
690
691 if (strlen(arg) > MAX_ARGSTR_LEN) {
692 pr_info("Argument is too long.: %s\n", arg);
693 return -ENOSPC;
694 }
695 parg->comm = kstrdup(arg, GFP_KERNEL);
696 if (!parg->comm) {
697 pr_info("Failed to allocate memory for command '%s'.\n", arg);
698 return -ENOMEM;
699 }
700 t = strchr(parg->comm, ':');
701 if (t) {
702 arg[t - parg->comm] = '\0';
703 t++;
704 }
705 parg->type = find_fetch_type(t);
706 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL;
709 }
710 parg->offset = *size;
711 *size += parg->type->size;
712 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
713
714 if (ret >= 0 && t != NULL)
715 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
716
717 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn);
720 parg->fetch_size.data = parg->fetch.data;
721 }
722
723 return ret;
724}
725
726/* Return 1 if name is reserved or already used by another argument */
727int traceprobe_conflict_field_name(const char *name,
728 struct probe_arg *args, int narg)
729{
730 int i;
731
732 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
733 if (strcmp(reserved_field_names[i], name) == 0)
734 return 1;
735
736 for (i = 0; i < narg; i++)
737 if (strcmp(args[i].name, name) == 0)
738 return 1;
739
740 return 0;
741}
742
743void traceprobe_update_arg(struct probe_arg *arg)
744{
745 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
746 update_bitfield_fetch_param(arg->fetch.data);
747 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
748 update_deref_fetch_param(arg->fetch.data);
749 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
750 update_symbol_cache(arg->fetch.data);
751}
752
753void traceprobe_free_probe_arg(struct probe_arg *arg)
754{
755 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
756 free_bitfield_fetch_param(arg->fetch.data);
757 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
758 free_deref_fetch_param(arg->fetch.data);
759 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
760 free_symbol_cache(arg->fetch.data);
761
762 kfree(arg->name);
763 kfree(arg->comm);
764}
765
766int traceprobe_command(const char *buf, int (*createfn)(int, char **))
767{
768 char **argv;
769 int argc, ret;
770
771 argc = 0;
772 ret = 0;
773 argv = argv_split(GFP_KERNEL, buf, &argc);
774 if (!argv)
775 return -ENOMEM;
776
777 if (argc)
778 ret = createfn(argc, argv);
779
780 argv_free(argv);
781
782 return ret;
783}
784
785#define WRITE_BUFSIZE 4096
786
787ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
788 size_t count, loff_t *ppos,
789 int (*createfn)(int, char **))
790{
791 char *kbuf, *tmp;
792 int ret = 0;
793 size_t done = 0;
794 size_t size;
795
796 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
797 if (!kbuf)
798 return -ENOMEM;
799
800 while (done < count) {
801 size = count - done;
802
803 if (size >= WRITE_BUFSIZE)
804 size = WRITE_BUFSIZE - 1;
805
806 if (copy_from_user(kbuf, buffer + done, size)) {
807 ret = -EFAULT;
808 goto out;
809 }
810 kbuf[size] = '\0';
811 tmp = strchr(kbuf, '\n');
812
813 if (tmp) {
814 *tmp = '\0';
815 size = tmp - kbuf + 1;
816 } else if (done + size < count) {
817 pr_warning("Line length is too long: "
818 "Should be less than %d.", WRITE_BUFSIZE);
819 ret = -EINVAL;
820 goto out;
821 }
822 done += size;
823 /* Remove comments */
824 tmp = strchr(kbuf, '#');
825
826 if (tmp)
827 *tmp = '\0';
828
829 ret = traceprobe_command(kbuf, createfn);
830 if (ret)
831 goto out;
832 }
833 ret = done;
834
835out:
836 kfree(kbuf);
837
838 return ret;
839}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 000000000000..933708677814
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,161 @@
1/*
2 * Common header file for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.h written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include <linux/seq_file.h>
26#include <linux/slab.h>
27#include <linux/smp.h>
28#include <linux/debugfs.h>
29#include <linux/types.h>
30#include <linux/string.h>
31#include <linux/ctype.h>
32#include <linux/ptrace.h>
33#include <linux/perf_event.h>
34#include <linux/kprobes.h>
35#include <linux/stringify.h>
36#include <linux/limits.h>
37#include <linux/uaccess.h>
38#include <asm/bitsperlong.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define MAX_TRACE_ARGS 128
44#define MAX_ARGSTR_LEN 63
45#define MAX_EVENT_NAME_LEN 64
46#define MAX_STRING_SIZE PATH_MAX
47
48/* Reserved field names */
49#define FIELD_STRING_IP "__probe_ip"
50#define FIELD_STRING_RETIP "__probe_ret_ip"
51#define FIELD_STRING_FUNC "__probe_func"
52
53#undef DEFINE_FIELD
54#define DEFINE_FIELD(type, item, name, is_signed) \
55 do { \
56 ret = trace_define_field(event_call, #type, name, \
57 offsetof(typeof(field), item), \
58 sizeof(field.item), is_signed, \
59 FILTER_OTHER); \
60 if (ret) \
61 return ret; \
62 } while (0)
63
64
65/* Flags for trace_probe */
66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70
71
72/* data_rloc: data relative location, compatible with u32 */
73#define make_data_rloc(len, roffs) \
74 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
75#define get_rloc_len(dl) ((u32)(dl) >> 16)
76#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
77
78/*
79 * Convert data_rloc to data_loc:
80 * data_rloc stores the offset from data_rloc itself, but data_loc
81 * stores the offset from event entry.
82 */
83#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
84
85/* Data fetch function type */
86typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
87/* Printing function type */
88typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
89
90/* Fetch types */
91enum {
92 FETCH_MTD_reg = 0,
93 FETCH_MTD_stack,
94 FETCH_MTD_retval,
95 FETCH_MTD_memory,
96 FETCH_MTD_symbol,
97 FETCH_MTD_deref,
98 FETCH_MTD_bitfield,
99 FETCH_MTD_END,
100};
101
/* Fetch type information table */
struct fetch_type {
	const char		*name;		/* Name of type */
	size_t			size;		/* Byte size of type */
	int			is_signed;	/* Signed flag */
	print_type_func_t	print;		/* Print functions */
	const char		*fmt;		/* Format string */
	const char		*fmttype;	/* Name in format file */
	/* Fetch functions, indexed by FETCH_MTD_* */
	fetch_func_t		fetch[FETCH_MTD_END];
};
113
114struct fetch_param {
115 fetch_func_t fn;
116 void *data;
117};
118
119struct probe_arg {
120 struct fetch_param fetch;
121 struct fetch_param fetch_size;
122 unsigned int offset; /* Offset from argument entry */
123 const char *name; /* Name of this argument */
124 const char *comm; /* Command of this argument */
125 const struct fetch_type *type; /* Type of this argument */
126};
127
128static inline __kprobes void call_fetch(struct fetch_param *fprm,
129 struct pt_regs *regs, void *dest)
130{
131 return fprm->fn(regs, fprm->data, dest);
132}
133
/*
 * Check the name is good for event/group/fields: it must start with a
 * letter or '_' and continue with letters, digits or '_' only.
 * Returns 1 if the name is acceptable, 0 otherwise (including "").
 */
static inline int is_good_name(const char *name)
{
	const char *p = name;

	/* First character: letter or underscore only */
	if (*p != '_' && !isalpha(*p))
		return 0;

	for (p++; *p != '\0'; p++) {
		if (*p == '_' || isalpha(*p) || isdigit(*p))
			continue;
		return 0;
	}

	return 1;
}
145
146extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
147 struct probe_arg *parg, bool is_return, bool is_kprobe);
148
149extern int traceprobe_conflict_field_name(const char *name,
150 struct probe_arg *args, int narg);
151
152extern void traceprobe_update_arg(struct probe_arg *arg);
153extern void traceprobe_free_probe_arg(struct probe_arg *arg);
154
155extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
156
157extern ssize_t traceprobe_probes_write(struct file *file,
158 const char __user *buffer, size_t count, loff_t *ppos,
159 int (*createfn)(int, char**));
160
161extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 000000000000..2b36ac68549e
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,788 @@
1/*
2 * uprobes-based tracing events
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * Copyright (C) IBM Corporation, 2010-2012
18 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/uprobes.h>
24#include <linux/namei.h>
25
26#include "trace_probe.h"
27
28#define UPROBE_EVENT_SYSTEM "uprobes"
29
30/*
31 * uprobe event core functions
32 */
33struct trace_uprobe;
34struct uprobe_trace_consumer {
35 struct uprobe_consumer cons;
36 struct trace_uprobe *tu;
37};
38
39struct trace_uprobe {
40 struct list_head list;
41 struct ftrace_event_class class;
42 struct ftrace_event_call call;
43 struct uprobe_trace_consumer *consumer;
44 struct inode *inode;
45 char *filename;
46 unsigned long offset;
47 unsigned long nhit;
48 unsigned int flags; /* For TP_FLAG_* */
49 ssize_t size; /* trace entry size */
50 unsigned int nr_args;
51 struct probe_arg args[];
52};
53
/* Bytes needed for a struct trace_uprobe carrying @n trailing probe_arg entries. */
#define SIZEOF_TRACE_UPROBE(n)				\
	(offsetof(struct trace_uprobe, args) +		\
	 ((n) * sizeof(struct probe_arg)))
57
58static int register_uprobe_event(struct trace_uprobe *tu);
59static void unregister_uprobe_event(struct trace_uprobe *tu);
60
61static DEFINE_MUTEX(uprobe_lock);
62static LIST_HEAD(uprobe_list);
63
64static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
65
66/*
67 * Allocate new trace_uprobe and initialize it (including uprobes).
68 */
69static struct trace_uprobe *
70alloc_trace_uprobe(const char *group, const char *event, int nargs)
71{
72 struct trace_uprobe *tu;
73
74 if (!event || !is_good_name(event))
75 return ERR_PTR(-EINVAL);
76
77 if (!group || !is_good_name(group))
78 return ERR_PTR(-EINVAL);
79
80 tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
81 if (!tu)
82 return ERR_PTR(-ENOMEM);
83
84 tu->call.class = &tu->class;
85 tu->call.name = kstrdup(event, GFP_KERNEL);
86 if (!tu->call.name)
87 goto error;
88
89 tu->class.system = kstrdup(group, GFP_KERNEL);
90 if (!tu->class.system)
91 goto error;
92
93 INIT_LIST_HEAD(&tu->list);
94 return tu;
95
96error:
97 kfree(tu->call.name);
98 kfree(tu);
99
100 return ERR_PTR(-ENOMEM);
101}
102
103static void free_trace_uprobe(struct trace_uprobe *tu)
104{
105 int i;
106
107 for (i = 0; i < tu->nr_args; i++)
108 traceprobe_free_probe_arg(&tu->args[i]);
109
110 iput(tu->inode);
111 kfree(tu->call.class->system);
112 kfree(tu->call.name);
113 kfree(tu->filename);
114 kfree(tu);
115}
116
117static struct trace_uprobe *find_probe_event(const char *event, const char *group)
118{
119 struct trace_uprobe *tu;
120
121 list_for_each_entry(tu, &uprobe_list, list)
122 if (strcmp(tu->call.name, event) == 0 &&
123 strcmp(tu->call.class->system, group) == 0)
124 return tu;
125
126 return NULL;
127}
128
129/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
130static void unregister_trace_uprobe(struct trace_uprobe *tu)
131{
132 list_del(&tu->list);
133 unregister_uprobe_event(tu);
134 free_trace_uprobe(tu);
135}
136
137/* Register a trace_uprobe and probe_event */
138static int register_trace_uprobe(struct trace_uprobe *tu)
139{
140 struct trace_uprobe *old_tp;
141 int ret;
142
143 mutex_lock(&uprobe_lock);
144
145 /* register as an event */
146 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
147 if (old_tp)
148 /* delete old event */
149 unregister_trace_uprobe(old_tp);
150
151 ret = register_uprobe_event(tu);
152 if (ret) {
153 pr_warning("Failed to register probe event(%d)\n", ret);
154 goto end;
155 }
156
157 list_add_tail(&tu->list, &uprobe_list);
158
159end:
160 mutex_unlock(&uprobe_lock);
161
162 return ret;
163}
164
165/*
166 * Argument syntax:
167 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
168 *
169 * - Remove uprobe: -:[GRP/]EVENT
170 */
171static int create_trace_uprobe(int argc, char **argv)
172{
173 struct trace_uprobe *tu;
174 struct inode *inode;
175 char *arg, *event, *group, *filename;
176 char buf[MAX_EVENT_NAME_LEN];
177 struct path path;
178 unsigned long offset;
179 bool is_delete;
180 int i, ret;
181
182 inode = NULL;
183 ret = 0;
184 is_delete = false;
185 event = NULL;
186 group = NULL;
187
188 /* argc must be >= 1 */
189 if (argv[0][0] == '-')
190 is_delete = true;
191 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n");
193 return -EINVAL;
194 }
195
196 if (argv[0][1] == ':') {
197 event = &argv[0][2];
198 arg = strchr(event, '/');
199
200 if (arg) {
201 group = event;
202 event = arg + 1;
203 event[-1] = '\0';
204
205 if (strlen(group) == 0) {
206 pr_info("Group name is not specified\n");
207 return -EINVAL;
208 }
209 }
210 if (strlen(event) == 0) {
211 pr_info("Event name is not specified\n");
212 return -EINVAL;
213 }
214 }
215 if (!group)
216 group = UPROBE_EVENT_SYSTEM;
217
218 if (is_delete) {
219 if (!event) {
220 pr_info("Delete command needs an event name.\n");
221 return -EINVAL;
222 }
223 mutex_lock(&uprobe_lock);
224 tu = find_probe_event(event, group);
225
226 if (!tu) {
227 mutex_unlock(&uprobe_lock);
228 pr_info("Event %s/%s doesn't exist.\n", group, event);
229 return -ENOENT;
230 }
231 /* delete an event */
232 unregister_trace_uprobe(tu);
233 mutex_unlock(&uprobe_lock);
234 return 0;
235 }
236
237 if (argc < 2) {
238 pr_info("Probe point is not specified.\n");
239 return -EINVAL;
240 }
241 if (isdigit(argv[1][0])) {
242 pr_info("probe point must be have a filename.\n");
243 return -EINVAL;
244 }
245 arg = strchr(argv[1], ':');
246 if (!arg)
247 goto fail_address_parse;
248
249 *arg++ = '\0';
250 filename = argv[1];
251 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
252 if (ret)
253 goto fail_address_parse;
254
255 ret = strict_strtoul(arg, 0, &offset);
256 if (ret)
257 goto fail_address_parse;
258
259 inode = igrab(path.dentry->d_inode);
260
261 argc -= 2;
262 argv += 2;
263
264 /* setup a probe */
265 if (!event) {
266 char *tail = strrchr(filename, '/');
267 char *ptr;
268
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL);
270 if (!ptr) {
271 ret = -ENOMEM;
272 goto fail_address_parse;
273 }
274
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_");
277 if (ptr)
278 *ptr = '\0';
279
280 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
281 event = buf;
282 kfree(tail);
283 }
284
285 tu = alloc_trace_uprobe(group, event, argc);
286 if (IS_ERR(tu)) {
287 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
288 ret = PTR_ERR(tu);
289 goto fail_address_parse;
290 }
291 tu->offset = offset;
292 tu->inode = inode;
293 tu->filename = kstrdup(filename, GFP_KERNEL);
294
295 if (!tu->filename) {
296 pr_info("Failed to allocate filename.\n");
297 ret = -ENOMEM;
298 goto error;
299 }
300
301 /* parse arguments */
302 ret = 0;
303 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
304 /* Increment count for freeing args in error case */
305 tu->nr_args++;
306
307 /* Parse argument name */
308 arg = strchr(argv[i], '=');
309 if (arg) {
310 *arg++ = '\0';
311 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
312 } else {
313 arg = argv[i];
314 /* If argument name is omitted, set "argN" */
315 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
316 tu->args[i].name = kstrdup(buf, GFP_KERNEL);
317 }
318
319 if (!tu->args[i].name) {
320 pr_info("Failed to allocate argument[%d] name.\n", i);
321 ret = -ENOMEM;
322 goto error;
323 }
324
325 if (!is_good_name(tu->args[i].name)) {
326 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
327 ret = -EINVAL;
328 goto error;
329 }
330
331 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
332 pr_info("Argument[%d] name '%s' conflicts with "
333 "another field.\n", i, argv[i]);
334 ret = -EINVAL;
335 goto error;
336 }
337
338 /* Parse fetch argument */
339 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
340 if (ret) {
341 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
342 goto error;
343 }
344 }
345
346 ret = register_trace_uprobe(tu);
347 if (ret)
348 goto error;
349 return 0;
350
351error:
352 free_trace_uprobe(tu);
353 return ret;
354
355fail_address_parse:
356 if (inode)
357 iput(inode);
358
359 pr_info("Failed to parse address.\n");
360
361 return ret;
362}
363
364static void cleanup_all_probes(void)
365{
366 struct trace_uprobe *tu;
367
368 mutex_lock(&uprobe_lock);
369 while (!list_empty(&uprobe_list)) {
370 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
371 unregister_trace_uprobe(tu);
372 }
373 mutex_unlock(&uprobe_lock);
374}
375
376/* Probes listing interfaces */
377static void *probes_seq_start(struct seq_file *m, loff_t *pos)
378{
379 mutex_lock(&uprobe_lock);
380 return seq_list_start(&uprobe_list, *pos);
381}
382
383static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
384{
385 return seq_list_next(v, &uprobe_list, pos);
386}
387
388static void probes_seq_stop(struct seq_file *m, void *v)
389{
390 mutex_unlock(&uprobe_lock);
391}
392
393static int probes_seq_show(struct seq_file *m, void *v)
394{
395 struct trace_uprobe *tu = v;
396 int i;
397
398 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
399 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
400
401 for (i = 0; i < tu->nr_args; i++)
402 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
403
404 seq_printf(m, "\n");
405 return 0;
406}
407
408static const struct seq_operations probes_seq_op = {
409 .start = probes_seq_start,
410 .next = probes_seq_next,
411 .stop = probes_seq_stop,
412 .show = probes_seq_show
413};
414
415static int probes_open(struct inode *inode, struct file *file)
416{
417 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
418 cleanup_all_probes();
419
420 return seq_open(file, &probes_seq_op);
421}
422
423static ssize_t probes_write(struct file *file, const char __user *buffer,
424 size_t count, loff_t *ppos)
425{
426 return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
427}
428
429static const struct file_operations uprobe_events_ops = {
430 .owner = THIS_MODULE,
431 .open = probes_open,
432 .read = seq_read,
433 .llseek = seq_lseek,
434 .release = seq_release,
435 .write = probes_write,
436};
437
438/* Probes profiling interfaces */
439static int probes_profile_seq_show(struct seq_file *m, void *v)
440{
441 struct trace_uprobe *tu = v;
442
443 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
444 return 0;
445}
446
447static const struct seq_operations profile_seq_op = {
448 .start = probes_seq_start,
449 .next = probes_seq_next,
450 .stop = probes_seq_stop,
451 .show = probes_profile_seq_show
452};
453
454static int profile_open(struct inode *inode, struct file *file)
455{
456 return seq_open(file, &profile_seq_op);
457}
458
459static const struct file_operations uprobe_profile_ops = {
460 .owner = THIS_MODULE,
461 .open = profile_open,
462 .read = seq_read,
463 .llseek = seq_lseek,
464 .release = seq_release,
465};
466
467/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{
470 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event;
472 struct ring_buffer *buffer;
473 u8 *data;
474 int size, i, pc;
475 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call;
477
478 tu->nhit++;
479
480 local_save_flags(irq_flags);
481 pc = preempt_count();
482
483 size = sizeof(*entry) + tu->size;
484
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc);
487 if (!event)
488 return;
489
490 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
492 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495
496 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
498}
499
500/* Event entry printers */
501static enum print_line_t
502print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
503{
504 struct uprobe_trace_entry_head *field;
505 struct trace_seq *s = &iter->seq;
506 struct trace_uprobe *tu;
507 u8 *data;
508 int i;
509
510 field = (struct uprobe_trace_entry_head *)iter->ent;
511 tu = container_of(event, struct trace_uprobe, call.event);
512
513 if (!trace_seq_printf(s, "%s: (", tu->call.name))
514 goto partial;
515
516 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
517 goto partial;
518
519 if (!trace_seq_puts(s, ")"))
520 goto partial;
521
522 data = (u8 *)&field[1];
523 for (i = 0; i < tu->nr_args; i++) {
524 if (!tu->args[i].type->print(s, tu->args[i].name,
525 data + tu->args[i].offset, field))
526 goto partial;
527 }
528
529 if (trace_seq_puts(s, "\n"))
530 return TRACE_TYPE_HANDLED;
531
532partial:
533 return TRACE_TYPE_PARTIAL_LINE;
534}
535
536static int probe_event_enable(struct trace_uprobe *tu, int flag)
537{
538 struct uprobe_trace_consumer *utc;
539 int ret = 0;
540
541 if (!tu->inode || tu->consumer)
542 return -EINTR;
543
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
545 if (!utc)
546 return -EINTR;
547
548 utc->cons.handler = uprobe_dispatcher;
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555
556 tu->flags |= flag;
557 utc->tu = tu;
558 tu->consumer = utc;
559
560 return 0;
561}
562
563static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{
565 if (!tu->inode || !tu->consumer)
566 return;
567
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
569 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572}
573
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
575{
576 int ret, i;
577 struct uprobe_trace_entry_head field;
578 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
579
580 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
581 /* Set argument names as fields */
582 for (i = 0; i < tu->nr_args; i++) {
583 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
584 tu->args[i].name,
585 sizeof(field) + tu->args[i].offset,
586 tu->args[i].type->size,
587 tu->args[i].type->is_signed,
588 FILTER_OTHER);
589
590 if (ret)
591 return ret;
592 }
593 return 0;
594}
595
596#define LEN_OR_ZERO (len ? len - pos : 0)
597static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
598{
599 const char *fmt, *arg;
600 int i;
601 int pos = 0;
602
603 fmt = "(%lx)";
604 arg = "REC->" FIELD_STRING_IP;
605
606 /* When len=0, we just calculate the needed length */
607
608 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
609
610 for (i = 0; i < tu->nr_args; i++) {
611 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
612 tu->args[i].name, tu->args[i].type->fmt);
613 }
614
615 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
616
617 for (i = 0; i < tu->nr_args; i++) {
618 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
619 tu->args[i].name);
620 }
621
622 return pos; /* return the length of print_fmt */
623}
624#undef LEN_OR_ZERO
625
626static int set_print_fmt(struct trace_uprobe *tu)
627{
628 char *print_fmt;
629 int len;
630
631 /* First: called with 0 length to calculate the needed length */
632 len = __set_print_fmt(tu, NULL, 0);
633 print_fmt = kmalloc(len + 1, GFP_KERNEL);
634 if (!print_fmt)
635 return -ENOMEM;
636
637 /* Second: actually write the @print_fmt */
638 __set_print_fmt(tu, print_fmt, len + 1);
639 tu->call.print_fmt = print_fmt;
640
641 return 0;
642}
643
#ifdef CONFIG_PERF_EVENTS
/*
 * uprobe profile handler.
 *
 * Computes the record size (entry head + tu->size, padded so that the size
 * plus a u32 is a multiple of sizeof(u64)), gives up with a one-time
 * warning when it exceeds PERF_MAX_TRACE_SIZE, and otherwise fills a perf
 * trace buffer with the probe address and the fetched arguments and
 * submits it. Preemption is disabled around the buffer use.
 */
static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
	struct ftrace_event_call *call = &tu->call;
	struct uprobe_trace_entry_head *entry;
	struct hlist_head *head;
	int raw_size, size;
	unsigned int n;
	int rctx;
	u8 *data;

	raw_size = sizeof(*entry) + tu->size;
	size = ALIGN(raw_size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
		return;

	preempt_disable();

	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
	if (entry) {
		entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));

		/* Arguments are stored right behind the entry head. */
		data = (u8 *)(entry + 1);
		for (n = 0; n < tu->nr_args; n++) {
			struct probe_arg *parg = &tu->args[n];

			call_fetch(&parg->fetch, regs, data + parg->offset);
		}

		head = this_cpu_ptr(call->perf_events);
		perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
	}

	preempt_enable();
}
#endif /* CONFIG_PERF_EVENTS */
679
680static
681int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
682{
683 struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
684
685 switch (type) {
686 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE);
688
689 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE);
691 return 0;
692
693#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE);
696
697 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0;
700#endif
701 default:
702 return 0;
703 }
704 return 0;
705}
706
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu;
711
712 utc = container_of(con, struct uprobe_trace_consumer, cons);
713 tu = utc->tu;
714 if (!tu || tu->consumer != utc)
715 return 0;
716
717 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs);
719
720#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs);
723#endif
724 return 0;
725}
726
727static struct trace_event_functions uprobe_funcs = {
728 .trace = print_uprobe_event
729};
730
731static int register_uprobe_event(struct trace_uprobe *tu)
732{
733 struct ftrace_event_call *call = &tu->call;
734 int ret;
735
736 /* Initialize ftrace_event_call */
737 INIT_LIST_HEAD(&call->class->fields);
738 call->event.funcs = &uprobe_funcs;
739 call->class->define_fields = uprobe_event_define_fields;
740
741 if (set_print_fmt(tu) < 0)
742 return -ENOMEM;
743
744 ret = register_ftrace_event(&call->event);
745 if (!ret) {
746 kfree(call->print_fmt);
747 return -ENODEV;
748 }
749 call->flags = 0;
750 call->class->reg = trace_uprobe_register;
751 call->data = tu;
752 ret = trace_add_event_call(call);
753
754 if (ret) {
755 pr_info("Failed to register uprobe event: %s\n", call->name);
756 kfree(call->print_fmt);
757 unregister_ftrace_event(&call->event);
758 }
759
760 return ret;
761}
762
763static void unregister_uprobe_event(struct trace_uprobe *tu)
764{
765 /* tu->event is unregistered in trace_remove_event_call() */
766 trace_remove_event_call(&tu->call);
767 kfree(tu->call.print_fmt);
768 tu->call.print_fmt = NULL;
769}
770
771/* Make a trace interface for controling probe points */
772static __init int init_uprobe_trace(void)
773{
774 struct dentry *d_tracer;
775
776 d_tracer = tracing_init_dentry();
777 if (!d_tracer)
778 return 0;
779
780 trace_create_file("uprobe_events", 0644, d_tracer,
781 NULL, &uprobe_events_ops);
782 /* Profile interface */
783 trace_create_file("uprobe_profile", 0444, d_tracer,
784 NULL, &uprobe_profile_ops);
785 return 0;
786}
787
788fs_initcall(init_uprobe_trace);
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..bf8b4035277d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1307,6 +1307,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1307 if (end <= vma->vm_start) 1307 if (end <= vma->vm_start)
1308 return; 1308 return;
1309 1309
1310 if (vma->vm_file)
1311 uprobe_munmap(vma, start, end);
1312
1310 if (vma->vm_flags & VM_ACCOUNT) 1313 if (vma->vm_flags & VM_ACCOUNT)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT; 1314 *nr_accounted += (end - start) >> PAGE_SHIFT;
1312 1315
diff --git a/mm/mmap.c b/mm/mmap.c
index 848ef52d9603..b8c4072dd9ca 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h>
33 34
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
@@ -546,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end);
546 547
547 if (file) { 548 if (file) {
548 mapping = file->f_mapping; 549 mapping = file->f_mapping;
549 if (!(vma->vm_flags & VM_NONLINEAR)) 550 if (!(vma->vm_flags & VM_NONLINEAR)) {
550 root = &mapping->i_mmap; 551 root = &mapping->i_mmap;
552 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
553
554 if (adjust_next)
555 uprobe_munmap(next, next->vm_start,
556 next->vm_end);
557 }
558
551 mutex_lock(&mapping->i_mmap_mutex); 559 mutex_lock(&mapping->i_mmap_mutex);
552 if (insert) { 560 if (insert) {
553 /* 561 /*
@@ -617,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end);
617 if (mapping) 625 if (mapping)
618 mutex_unlock(&mapping->i_mmap_mutex); 626 mutex_unlock(&mapping->i_mmap_mutex);
619 627
628 if (root) {
629 uprobe_mmap(vma);
630
631 if (adjust_next)
632 uprobe_mmap(next);
633 }
634
620 if (remove_next) { 635 if (remove_next) {
621 if (file) { 636 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end);
622 fput(file); 638 fput(file);
623 if (next->vm_flags & VM_EXECUTABLE) 639 if (next->vm_flags & VM_EXECUTABLE)
624 removed_exe_file_vma(mm); 640 removed_exe_file_vma(mm);
@@ -638,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end);
638 goto again; 654 goto again;
639 } 655 }
640 } 656 }
657 if (insert && file)
658 uprobe_mmap(insert);
641 659
642 validate_mm(mm); 660 validate_mm(mm);
643 661
@@ -1371,6 +1389,11 @@ out:
1371 mm->locked_vm += (len >> PAGE_SHIFT); 1389 mm->locked_vm += (len >> PAGE_SHIFT);
1372 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1390 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1373 make_pages_present(addr, addr + len); 1391 make_pages_present(addr, addr + len);
1392
1393 if (file && uprobe_mmap(vma))
1394 /* matching probes but cannot insert */
1395 goto unmap_and_free_vma;
1396
1374 return addr; 1397 return addr;
1375 1398
1376unmap_and_free_vma: 1399unmap_and_free_vma:
@@ -2352,6 +2375,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2352 if ((vma->vm_flags & VM_ACCOUNT) && 2375 if ((vma->vm_flags & VM_ACCOUNT) &&
2353 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2376 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2354 return -ENOMEM; 2377 return -ENOMEM;
2378
2379 if (vma->vm_file && uprobe_mmap(vma))
2380 return -EINVAL;
2381
2355 vma_link(mm, vma, prev, rb_link, rb_parent); 2382 vma_link(mm, vma, prev, rb_link, rb_parent);
2356 return 0; 2383 return 0;
2357} 2384}
@@ -2421,6 +2448,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2421 new_vma->vm_pgoff = pgoff; 2448 new_vma->vm_pgoff = pgoff;
2422 if (new_vma->vm_file) { 2449 if (new_vma->vm_file) {
2423 get_file(new_vma->vm_file); 2450 get_file(new_vma->vm_file);
2451
2452 if (uprobe_mmap(new_vma))
2453 goto out_free_mempol;
2454
2424 if (vma->vm_flags & VM_EXECUTABLE) 2455 if (vma->vm_flags & VM_EXECUTABLE)
2425 added_exe_file_vma(mm); 2456 added_exe_file_vma(mm);
2426 } 2457 }