 arch/Kconfig                   |  11
 arch/x86/Kconfig               |   5
 arch/x86/include/asm/uprobes.h |  42
 arch/x86/kernel/Makefile       |   1
 arch/x86/kernel/uprobes.c      | 412
 include/linux/uprobes.h        |  98
 kernel/Makefile                |   1
 kernel/uprobes.c               | 976
 mm/mmap.c                      |  23
 9 files changed, 1568 insertions(+), 1 deletion(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 4f55c736be11..284f5898f526 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -65,6 +65,17 @@ config OPTPROBES
 	depends on KPROBES && HAVE_OPTPROBES
 	depends on !PREEMPT
 
+config UPROBES
+	bool "User-space probes (EXPERIMENTAL)"
+	depends on ARCH_SUPPORTS_UPROBES
+	default n
+	help
+	  Uprobes enables kernel subsystems to establish probepoints
+	  in user applications and execute handler functions when
+	  the probepoints are hit.
+
+	  If in doubt, say "N".
+
 config HAVE_EFFICIENT_UNALIGNED_ACCESS
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..481dbfcf14ed 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,7 +84,7 @@ config X86
 	select GENERIC_IOMAP
 
 config INSTRUCTION_DECODER
-	def_bool (KPROBES || PERF_EVENTS)
+	def_bool (KPROBES || PERF_EVENTS || UPROBES)
 
 config OUTPUT_FORMAT
 	string
@@ -240,6 +240,9 @@ config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
 
+config ARCH_SUPPORTS_UPROBES
+	def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 000000000000..8208234391ff
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_UPROBES_H
+#define _ASM_UPROBES_H
+/*
+ * Userspace Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+typedef u8 uprobe_opcode_t;
+#define MAX_UINSN_BYTES 16
+#define UPROBES_XOL_SLOT_BYTES	128	/* to keep it cache aligned */
+
+#define UPROBES_BKPT_INSN 0xcc
+#define UPROBES_BKPT_INSN_SIZE 1
+
+struct uprobe_arch_info {
+	u16			fixups;
+#ifdef CONFIG_X86_64
+	unsigned long rip_rela_target_address;
+#endif
+};
+
+struct uprobe;
+extern int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe);
+#endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..8c8c365a3bc3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_OF)		+= devicetree.o
+obj-$(CONFIG_UPROBES)		+= uprobes.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..2a301bb91bdb
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,412 @@
+/*
+ * Userspace Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+
+#include <linux/kdebug.h>
+#include <asm/insn.h>
+
+/* Post-execution fixups. */
+
+/* No fixup needed */
+#define UPROBES_FIX_NONE	0x0
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBES_FIX_IP		0x1
+/* Adjust the return address of a call insn */
+#define UPROBES_FIX_CALL	0x2
+
+#define UPROBES_FIX_RIP_AX	0x8000
+#define UPROBES_FIX_RIP_CX	0x4000
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn)	((insn)->opcode.bytes[0])
+#define OPCODE2(insn)	((insn)->opcode.bytes[1])
+#define OPCODE3(insn)	((insn)->opcode.bytes[2])
+#define MODRM_REG(insn)	X86_MODRM_REG(insn->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+	 << (row % 32))
+
+#ifdef CONFIG_X86_64
+static volatile u32 good_insns_64[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
+	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+#endif
+
+/* Good-instruction tables for 32-bit apps */
+
+static volatile u32 good_insns_32[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+
+#undef W
+
+/*
+ * opcodes we'll probably never support:
+ * 6c-6d, e4-e5, ec-ed - in
+ * 6e-6f, e6-e7, ee-ef - out
+ * cc, cd - int3, int
+ * cf - iret
+ * d6 - illegal instruction
+ * f1 - int1/icebp
+ * f4 - hlt
+ * fa, fb - cli, sti
+ * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
+ *
+ * invalid opcodes in 64-bit mode:
+ * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
+ *
+ * 63 - we support this opcode in x86_64 but not in i386.
+ *
+ * opcodes we may need to refine support for:
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field.  On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes.  These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ *   in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+	int i;
+
+	for (i = 0; i < insn->prefixes.nbytes; i++) {
+		switch (insn->prefixes.bytes[i]) {
+		case 0x26:	/* INAT_PFX_ES */
+		case 0x2E:	/* INAT_PFX_CS */
+		case 0x36:	/* INAT_PFX_DS */
+		case 0x3E:	/* INAT_PFX_SS */
+		case 0xF0:	/* INAT_PFX_LOCK */
+			return true;
+		}
+	}
+	return false;
+}
+
+static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
+{
+	insn_init(insn, uprobe->insn, false);
+
+	/* Skip good instruction prefixes; reject "bad" ones. */
+	insn_get_opcode(insn);
+	if (is_prefix_bad(insn))
+		return -ENOTSUPP;
+	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+		return 0;
+	if (insn->opcode.nbytes == 2) {
+		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+			return 0;
+	}
+	return -ENOTSUPP;
+}
+
+/*
+ * Figure out which fixups post_xol() will need to perform, and annotate
+ * uprobe->arch_info.fixups accordingly.  To start with,
+ * uprobe->arch_info.fixups is either zero or it reflects rip-related
+ * fixups.
+ */
+static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
+{
+	bool fix_ip = true, fix_call = false;	/* defaults */
+	int reg;
+
+	insn_get_opcode(insn);	/* should be a nop */
+
+	switch (OPCODE1(insn)) {
+	case 0xc3:		/* ret/lret */
+	case 0xcb:
+	case 0xc2:
+	case 0xca:
+		/* ip is correct */
+		fix_ip = false;
+		break;
+	case 0xe8:		/* call relative - Fix return addr */
+		fix_call = true;
+		break;
+	case 0x9a:		/* call absolute - Fix return addr, not ip */
+		fix_call = true;
+		fix_ip = false;
+		break;
+	case 0xff:
+		insn_get_modrm(insn);
+		reg = MODRM_REG(insn);
+		if (reg == 2 || reg == 3) {
+			/* call or lcall, indirect */
+			/* Fix return addr; ip is correct. */
+			fix_call = true;
+			fix_ip = false;
+		} else if (reg == 4 || reg == 5) {
+			/* jmp or ljmp, indirect */
+			/* ip is correct. */
+			fix_ip = false;
+		}
+		break;
+	case 0xea:		/* jmp absolute -- ip is correct */
+		fix_ip = false;
+		break;
+	default:
+		break;
+	}
+	if (fix_ip)
+		uprobe->arch_info.fixups |= UPROBES_FIX_IP;
+	if (fix_call)
+		uprobe->arch_info.fixups |= UPROBES_FIX_CALL;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If uprobe->insn doesn't use rip-relative addressing, return
+ * immediately.  Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register.  Set
+ * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address
+ * accordingly.  (The contents of the scratch register will be saved
+ * before we single-step the modified instruction, and restored
+ * afterward.)
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area.  At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ * - There's always a modrm byte.
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ */
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	u8 *cursor;
+	u8 reg;
+
+	if (mm->context.ia32_compat)
+		return;
+
+	uprobe->arch_info.rip_rela_target_address = 0x0;
+	if (!insn_rip_relative(insn))
+		return;
+
+	/*
+	 * insn_rip_relative() would have decoded rex_prefix, modrm.
+	 * Clear REX.b bit (extension of MODRM.rm field):
+	 * we want to encode rax/rcx, not r8/r9.
+	 */
+	if (insn->rex_prefix.nbytes) {
+		cursor = uprobe->insn + insn_offset_rex_prefix(insn);
+		*cursor &= 0xfe;	/* Clearing REX.B bit */
+	}
+
+	/*
+	 * Point cursor at the modrm byte.  The next 4 bytes are the
+	 * displacement.  Beyond the displacement, for some instructions,
+	 * is the immediate operand.
+	 */
+	cursor = uprobe->insn + insn_offset_modrm(insn);
+	insn_get_length(insn);
+
+	/*
+	 * Convert from rip-relative addressing to indirect addressing
+	 * via a scratch register.  Change the r/m field from 0x5 (%rip)
+	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+	 */
+	reg = MODRM_REG(insn);
+	if (reg == 0) {
+		/*
+		 * The register operand (if any) is either the A register
+		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
+		 * REX prefix) %r8.  In any case, we know the C register
+		 * is NOT the register operand, so we use %rcx (register
+		 * #1) for the scratch register.
+		 */
+		uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX;
+		/* Change modrm from 00 000 101 to 00 000 001. */
+		*cursor = 0x1;
+	} else {
+		/* Use %rax (register #0) for the scratch register. */
+		uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX;
+		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
+		*cursor = (reg << 3);
+	}
+
+	/* Target address = address of next instruction + (signed) offset */
+	uprobe->arch_info.rip_rela_target_address = (long)insn->length
+					+ insn->displacement.value;
+	/* Displacement field is gone; slide immediate field (if any) over. */
+	if (insn->immediate.nbytes) {
+		cursor++;
+		memmove(cursor, cursor + insn->displacement.nbytes,
+						insn->immediate.nbytes);
+	}
+	return;
+}
+
+static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
+{
+	insn_init(insn, uprobe->insn, true);
+
+	/* Skip good instruction prefixes; reject "bad" ones. */
+	insn_get_opcode(insn);
+	if (is_prefix_bad(insn))
+		return -ENOTSUPP;
+	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
+		return 0;
+	if (insn->opcode.nbytes == 2) {
+		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+			return 0;
+	}
+	return -ENOTSUPP;
+}
+
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	if (mm->context.ia32_compat)
+		return validate_insn_32bits(uprobe, insn);
+	return validate_insn_64bits(uprobe, insn);
+}
+#else
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	return;
+}
+
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	return validate_insn_32bits(uprobe, insn);
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @uprobe: the probepoint information.
+ * Return 0 on success or a -ve number on error.
+ */
+int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
+{
+	int ret;
+	struct insn insn;
+
+	uprobe->arch_info.fixups = 0;
+	ret = validate_insn_bits(mm, uprobe, &insn);
+	if (ret != 0)
+		return ret;
+	handle_riprel_insn(mm, uprobe, &insn);
+	prepare_fixups(uprobe, &insn);
+	return 0;
+}
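
[Editor's note] The good_insns tables above are dense: each W() row contributes 16 validity bits, two rows are OR'd into each 32-bit word, and test_bit() then indexes the whole array directly by opcode byte. Below is a minimal standalone sketch of that packing and lookup, compiled as ordinary userspace C outside the kernel; the table excerpt and the opcode choices are illustrative only, not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* Same packing macro as the patch: bit b of row `row' lands at
 * bit (b + row % 32) of the containing 32-bit word. */
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << (row % 32))

/* First word of a good_insns_64-style map: rows 0x00 and 0x10 share it. */
static const uint32_t demo[1] = {
	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) |
	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) ,
};

/* Userspace stand-in for test_bit(opcode, (unsigned long *)table). */
static int insn_is_good(uint8_t opcode)
{
	return (demo[opcode / 32] >> (opcode % 32)) & 1;
}

int main(void)
{
	printf("0x05 (add eax,imm32): %d\n", insn_is_good(0x05));	/* 1 */
	printf("0x06 (push es):       %d\n", insn_is_good(0x06));	/* 0 */
	return 0;
}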
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
new file mode 100644
index 000000000000..f1d13fd140f2
--- /dev/null
+++ b/include/linux/uprobes.h
@@ -0,0 +1,98 @@
+#ifndef _LINUX_UPROBES_H
+#define _LINUX_UPROBES_H
+/*
+ * Userspace Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+
+struct vm_area_struct;
+#ifdef CONFIG_ARCH_SUPPORTS_UPROBES
+#include <asm/uprobes.h>
+#else
+
+typedef u8 uprobe_opcode_t;
+struct uprobe_arch_info {};
+
+#define MAX_UINSN_BYTES 4
+#endif
+
+#define uprobe_opcode_sz sizeof(uprobe_opcode_t)
+
+/* flags that denote/change uprobes behaviour */
+/* Have a copy of original instruction */
+#define UPROBES_COPY_INSN	0x1
+/* Don't run handlers when first register / last unregister is in progress */
+#define UPROBES_RUN_HANDLER	0x2
+
+struct uprobe_consumer {
+	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	/*
+	 * filter is optional; if a filter exists, the handler is run
+	 * if and only if the filter returns true.
+	 */
+	bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
+
+	struct uprobe_consumer *next;
+};
+
+struct uprobe {
+	struct rb_node		rb_node;	/* node in the rb tree */
+	atomic_t		ref;
+	struct rw_semaphore	consumer_rwsem;
+	struct list_head	pending_list;
+	struct uprobe_arch_info	arch_info;
+	struct uprobe_consumer	*consumers;
+	struct inode		*inode;		/* Also hold a ref to inode */
+	loff_t			offset;
+	int			flags;
+	u8			insn[MAX_UINSN_BYTES];
+};
+
+#ifdef CONFIG_UPROBES
+extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
+						unsigned long vaddr);
+extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
+					unsigned long vaddr, bool verify);
+extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
+extern int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer);
+extern void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer);
+extern int mmap_uprobe(struct vm_area_struct *vma);
+#else /* CONFIG_UPROBES is not defined */
+static inline int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	return -ENOSYS;
+}
+static inline void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+}
+static inline int mmap_uprobe(struct vm_area_struct *vma)
+{
+	return 0;
+}
+#endif /* CONFIG_UPROBES */
+#endif /* _LINUX_UPROBES_H */
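
[Editor's note] For orientation, this is how a kernel subsystem would consume the API declared above. It is a hedged sketch only: the demo_* names are invented here, and a real caller would derive the inode and file offset of the probed instruction from the binary itself (e.g., from its symbol table).

#include <linux/uprobes.h>

/* Invoked in the context of the probed task when the breakpoint hits. */
static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	return 0;
}

static struct uprobe_consumer demo_consumer = {
	.handler = demo_handler,
	/* .filter left NULL: run the handler for every task. */
};

/* A probepoint is named by an inode:offset pair, not by a virtual
 * address, so it applies to every mm that maps the file. */
static int demo_attach(struct inode *inode, loff_t offset)
{
	return register_uprobe(inode, offset, &demo_consumer);
}

static void demo_detach(struct inode *inode, loff_t offset)
{
	unregister_uprobe(inode, offset, &demo_consumer);
}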
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..8609dd3d875a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_UPROBES) += uprobes.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/uprobes.c b/kernel/uprobes.c
new file mode 100644
index 000000000000..72e8bb3b52cd
--- /dev/null
+++ b/kernel/uprobes.c
@@ -0,0 +1,976 @@
+/*
+ * Userspace Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>	/* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>		/* anon_vma_prepare */
+#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
+#include <linux/swap.h>		/* try_to_free_swap */
+#include <linux/uprobes.h>
+
+static struct rb_root uprobes_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
+
+#define UPROBES_HASH_SZ	13
+/* serialize (un)register */
+static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
+#define uprobes_hash(v)	(&uprobes_mutex[((unsigned long)(v)) %\
+						UPROBES_HASH_SZ])
+
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) %\
+						UPROBES_HASH_SZ])
+
+/*
+ * uprobe_events allows us to skip mmap_uprobe if there are no uprobe
+ * events active at this time.  Probably a fine-grained per-inode count
+ * would be better?
+ */
+static atomic_t uprobe_events = ATOMIC_INIT(0);
+
+/*
+ * Maintain a temporary per-vma info that can be used to search if a vma
+ * has already been handled.  This structure is introduced since extending
+ * vm_area_struct wasn't recommended.
+ */
+struct vma_info {
+	struct list_head	probe_list;
+	struct mm_struct	*mm;
+	loff_t			vaddr;
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma.
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after the breakpoint was inserted.
+ *	- is_register: indicates if we are in register context.
+ *	- Return true if the specified virtual address is in an
+ *	  executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+	if (!vma->vm_file)
+		return false;
+
+	if (!is_register)
+		return true;
+
+	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) ==
+						(VM_READ|VM_EXEC))
+		return true;
+
+	return false;
+}
+
+static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
+{
+	loff_t vaddr;
+
+	vaddr = vma->vm_start + offset;
+	vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+	return vaddr;
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma:      vma that holds the pte pointing to page
+ * @page:     the cowed page we are replacing by kpage
+ * @kpage:    the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, struct page *page,
+					struct page *kpage)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	unsigned long addr;
+	int err = -EFAULT;
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!ptep)
+		goto out;
+
+	get_page(kpage);
+	page_add_new_anon_rmap(kpage, vma, addr);
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush(vma, addr, ptep);
+	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+
+out:
+	return err;
+}
+
+/**
+ * is_bkpt_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_bkpt_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
+{
+	return (*insn == UPROBES_BKPT_INSN);
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture.  If an arch has variable-length instructions and the
+ * breakpoint instruction is not of the smallest length supported by that
+ * architecture, then read_opcode / write_opcode need to be modified
+ * accordingly.  This would never be a problem for archs that have
+ * fixed-length instructions.
+ */
+
+/*
+ * write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @uprobe: the probepoint information.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm).
+ *
+ * For mm @mm, write the opcode at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
+			unsigned long vaddr, uprobe_opcode_t opcode)
+{
+	struct page *old_page, *new_page;
+	struct address_space *mapping;
+	void *vaddr_old, *vaddr_new;
+	struct vm_area_struct *vma;
+	loff_t addr;
+	int ret;
+
+	/* Read the page with vaddr into memory */
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+	if (ret <= 0)
+		return ret;
+	ret = -EINVAL;
+
+	/*
+	 * We are interested in text pages only.  Our pages of interest
+	 * should be mapped for read and execute only.  We desist from
+	 * adding probes in write-mapped pages since the breakpoints
+	 * might end up in the file copy.
+	 */
+	if (!valid_vma(vma, is_bkpt_insn(&opcode)))
+		goto put_out;
+
+	mapping = uprobe->inode->i_mapping;
+	if (mapping != vma->vm_file->f_mapping)
+		goto put_out;
+
+	addr = vma_address(vma, uprobe->offset);
+	if (vaddr != (unsigned long)addr)
+		goto put_out;
+
+	ret = -ENOMEM;
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+	if (!new_page)
+		goto put_out;
+
+	__SetPageUptodate(new_page);
+
+	/*
+	 * lock page will serialize against do_wp_page()'s
+	 * PageAnon() handling
+	 */
+	lock_page(old_page);
+	/* copy the page now that we've got it stable */
+	vaddr_old = kmap_atomic(old_page);
+	vaddr_new = kmap_atomic(new_page);
+
+	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+	/* poke the new insn in, ASSUMES we don't cross page boundary */
+	vaddr &= ~PAGE_MASK;
+	BUG_ON(vaddr + uprobe_opcode_sz > PAGE_SIZE);
+	memcpy(vaddr_new + vaddr, &opcode, uprobe_opcode_sz);
+
+	kunmap_atomic(vaddr_new);
+	kunmap_atomic(vaddr_old);
+
+	ret = anon_vma_prepare(vma);
+	if (ret)
+		goto unlock_out;
+
+	lock_page(new_page);
+	ret = __replace_page(vma, old_page, new_page);
+	unlock_page(new_page);
+
+unlock_out:
+	unlock_page(old_page);
+	page_cache_release(new_page);
+
+put_out:
+	put_page(old_page);	/* we did a get_page in the beginning */
+	return ret;
+}
+
+/**
+ * read_opcode - read the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to read the opcode.
+ * @opcode: location to store the read opcode.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm).
+ *
+ * For mm @mm, read the opcode at @vaddr and store it in @opcode.
+ * Return 0 (success) or a negative errno.
+ */
+static int read_opcode(struct mm_struct *mm, unsigned long vaddr,
+						uprobe_opcode_t *opcode)
+{
+	struct page *page;
+	void *vaddr_new;
+	int ret;
+
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+	if (ret <= 0)
+		return ret;
+
+	lock_page(page);
+	vaddr_new = kmap_atomic(page);
+	vaddr &= ~PAGE_MASK;
+	memcpy(opcode, vaddr_new + vaddr, uprobe_opcode_sz);
+	kunmap_atomic(vaddr_new);
+	unlock_page(page);
+	put_page(page);		/* we did a get_user_pages in the beginning */
+	return 0;
+}
+
+static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t opcode;
+	int result = read_opcode(mm, vaddr, &opcode);
+
+	if (result)
+		return result;
+
+	if (is_bkpt_insn(&opcode))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * set_bkpt - store breakpoint at a given address.
+ * @mm: the probed process address space.
+ * @uprobe: the probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
+						unsigned long vaddr)
+{
+	int result = is_bkpt_at_addr(mm, vaddr);
+
+	if (result == 1)
+		return -EEXIST;
+
+	if (result)
+		return result;
+
+	return write_opcode(mm, uprobe, vaddr, UPROBES_BKPT_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @uprobe: the probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ * @verify: if true, verify existence of the breakpoint instruction.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
+					unsigned long vaddr, bool verify)
+{
+	if (verify) {
+		int result = is_bkpt_at_addr(mm, vaddr);
+
+		if (!result)
+			return -EINVAL;
+
+		if (result != 1)
+			return result;
+	}
+	return write_opcode(mm, uprobe, vaddr,
+			*(uprobe_opcode_t *)uprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+	if (l->inode < r->inode)
+		return -1;
+	if (l->inode > r->inode)
+		return 1;
+	else {
+		if (l->offset < r->offset)
+			return -1;
+
+		if (l->offset > r->offset)
+			return 1;
+	}
+
+	return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe u = { .inode = inode, .offset = offset };
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+		if (!match) {
+			atomic_inc(&uprobe->ref);
+			return uprobe;
+		}
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset.
+ * Acquires uprobes_treelock.
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	uprobe = __find_uprobe(inode, offset);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+	struct rb_node **p = &uprobes_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct uprobe *u;
+	int match;
+
+	while (*p) {
+		parent = *p;
+		u = rb_entry(parent, struct uprobe, rb_node);
+		match = match_uprobe(uprobe, u);
+		if (!match) {
+			atomic_inc(&u->ref);
+			return u;
+		}
+
+		if (match < 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+
+	}
+	u = NULL;
+	rb_link_node(&uprobe->rb_node, parent, p);
+	rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+	/* get access + creation ref */
+	atomic_set(&uprobe->ref, 2);
+	return u;
+}
+
+/*
+ * Acquires uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ *	increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ *	get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+	struct uprobe *u;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	u = __insert_uprobe(uprobe);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+	if (atomic_dec_and_test(&uprobe->ref))
+		kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe, *cur_uprobe;
+
+	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+	if (!uprobe)
+		return NULL;
+
+	uprobe->inode = igrab(inode);
+	uprobe->offset = offset;
+	init_rwsem(&uprobe->consumer_rwsem);
+	INIT_LIST_HEAD(&uprobe->pending_list);
+
+	/* add to uprobes_tree, sorted on inode:offset */
+	cur_uprobe = insert_uprobe(uprobe);
+
+	/* a uprobe exists for this inode:offset combination */
+	if (cur_uprobe) {
+		kfree(uprobe);
+		uprobe = cur_uprobe;
+		iput(inode);
+	} else
+		atomic_inc(&uprobe_events);
+	return uprobe;
+}
+
+/* Returns the previous consumer */
+static struct uprobe_consumer *add_consumer(struct uprobe *uprobe,
+					struct uprobe_consumer *consumer)
+{
+	down_write(&uprobe->consumer_rwsem);
+	consumer->next = uprobe->consumers;
+	uprobe->consumers = consumer;
+	up_write(&uprobe->consumer_rwsem);
+	return consumer->next;
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @consumer.
+ * Return true if the @consumer is deleted successfully,
+ * otherwise return false.
+ */
+static bool del_consumer(struct uprobe *uprobe,
+					struct uprobe_consumer *consumer)
+{
+	struct uprobe_consumer **con;
+	bool ret = false;
+
+	down_write(&uprobe->consumer_rwsem);
+	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+		if (*con == consumer) {
+			*con = consumer->next;
+			ret = true;
+			break;
+		}
+	}
+	up_write(&uprobe->consumer_rwsem);
+	return ret;
+}
+
+static int __copy_insn(struct address_space *mapping,
+			struct vm_area_struct *vma, char *insn,
+			unsigned long nbytes, unsigned long offset)
+{
+	struct file *filp = vma->vm_file;
+	struct page *page;
+	void *vaddr;
+	unsigned long off1;
+	unsigned long idx;
+
+	if (!filp)
+		return -EINVAL;
+
+	idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
+	off1 = offset &= ~PAGE_MASK;
+
+	/*
+	 * Ensure that the page that has the original instruction is
+	 * populated and in page-cache.
+	 */
+	page = read_mapping_page(mapping, idx, filp);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	vaddr = kmap_atomic(page);
+	memcpy(insn, vaddr + off1, nbytes);
+	kunmap_atomic(vaddr);
+	page_cache_release(page);
+	return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma,
+					unsigned long addr)
+{
+	struct address_space *mapping;
+	int bytes;
+	unsigned long nbytes;
+
+	addr &= ~PAGE_MASK;
+	nbytes = PAGE_SIZE - addr;
+	mapping = uprobe->inode->i_mapping;
+
+	/* Instruction at end of binary; copy only available bytes */
+	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
+		bytes = uprobe->inode->i_size - uprobe->offset;
+	else
+		bytes = MAX_UINSN_BYTES;
+
+	/* Instruction at the page-boundary; copy bytes in second page */
+	if (nbytes < bytes) {
+		if (__copy_insn(mapping, vma, uprobe->insn + nbytes,
+				bytes - nbytes, uprobe->offset + nbytes))
+			return -ENOMEM;
+
+		bytes = nbytes;
+	}
+	return __copy_insn(mapping, vma, uprobe->insn, bytes, uprobe->offset);
+}
+
+static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
+				struct vm_area_struct *vma, loff_t vaddr)
+{
+	unsigned long addr;
+	int ret;
+
+	/*
+	 * If the probe is being deleted, the unregistering thread could
+	 * already be done with its vma-rmap walk.  Adding a probe now can
+	 * be fatal since nobody will be able to clean up.  Also we could
+	 * be from the fork or mremap path, where the probe might have
+	 * already been inserted.  Hence behave as if the probe already
+	 * existed.
+	 */
+	if (!uprobe->consumers)
+		return -EEXIST;
+
+	addr = (unsigned long)vaddr;
+	if (!(uprobe->flags & UPROBES_COPY_INSN)) {
+		ret = copy_insn(uprobe, vma, addr);
+		if (ret)
+			return ret;
+
+		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn))
+			return -EEXIST;
+
+		ret = analyze_insn(mm, uprobe);
+		if (ret)
+			return ret;
+
+		uprobe->flags |= UPROBES_COPY_INSN;
+	}
+	ret = set_bkpt(mm, uprobe, addr);
+
+	return ret;
+}
+
+static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
+							loff_t vaddr)
+{
+	set_orig_insn(mm, uprobe, (unsigned long)vaddr, true);
+}
+
+static void delete_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	rb_erase(&uprobe->rb_node, &uprobes_tree);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	iput(uprobe->inode);
+	put_uprobe(uprobe);
+	atomic_dec(&uprobe_events);
+}
+
+static struct vma_info *__find_next_vma_info(struct list_head *head,
+			loff_t offset, struct address_space *mapping,
+			struct vma_info *vi, bool is_register)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+	struct vma_info *tmpvi;
+	loff_t vaddr;
+	unsigned long pgoff = offset >> PAGE_SHIFT;
+	int existing_vma;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (!valid_vma(vma, is_register))
+			continue;
+
+		existing_vma = 0;
+		vaddr = vma_address(vma, offset);
+		list_for_each_entry(tmpvi, head, probe_list) {
+			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
+				existing_vma = 1;
+				break;
+			}
+		}
+
+		/*
+		 * Another vma needs a probe to be installed.  However, skip
+		 * installing the probe if the vma is about to be unlinked.
+		 */
+		if (!existing_vma &&
+				atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
+			vi->mm = vma->vm_mm;
+			vi->vaddr = vaddr;
+			list_add(&vi->probe_list, head);
+			return vi;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Iterate in the rmap prio tree and find a vma where a probe has not
+ * yet been inserted.
+ */
+static struct vma_info *find_next_vma_info(struct list_head *head,
+			loff_t offset, struct address_space *mapping,
+			bool is_register)
+{
+	struct vma_info *vi, *retvi;
+	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	retvi = __find_next_vma_info(head, offset, mapping, vi, is_register);
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	if (!retvi)
+		kfree(vi);
+	return retvi;
+}
+
+static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+{
+	struct list_head try_list;
+	struct vm_area_struct *vma;
+	struct address_space *mapping;
+	struct vma_info *vi, *tmpvi;
+	struct mm_struct *mm;
+	loff_t vaddr;
+	int ret = 0;
+
+	mapping = uprobe->inode->i_mapping;
+	INIT_LIST_HEAD(&try_list);
+	while ((vi = find_next_vma_info(&try_list, uprobe->offset,
+					mapping, is_register)) != NULL) {
+		if (IS_ERR(vi)) {
+			ret = PTR_ERR(vi);
+			break;
+		}
+		mm = vi->mm;
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, (unsigned long)vi->vaddr);
+		if (!vma || !valid_vma(vma, is_register)) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+		vaddr = vma_address(vma, uprobe->offset);
+		if (vma->vm_file->f_mapping->host != uprobe->inode ||
+						vaddr != vi->vaddr) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+
+		if (is_register)
+			ret = install_breakpoint(mm, uprobe, vma, vi->vaddr);
+		else
+			remove_breakpoint(mm, uprobe, vi->vaddr);
+
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		if (is_register) {
+			if (ret && ret == -EEXIST)
+				ret = 0;
+			if (ret)
+				break;
+		}
+	}
+	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
+		list_del(&vi->probe_list);
+		kfree(vi);
+	}
+	return ret;
+}
+
+static int __register_uprobe(struct uprobe *uprobe)
+{
+	return register_for_each_vma(uprobe, true);
+}
+
+static void __unregister_uprobe(struct uprobe *uprobe)
+{
+	if (!register_for_each_vma(uprobe, false))
+		delete_uprobe(uprobe);
+
+	/* TODO : can't unregister? schedule a worker thread */
+}
+
+/*
+ * register_uprobe - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @consumer: information on how to handle the probe.
+ *
+ * Apart from the access refcount, register_uprobe() takes a creation
+ * refcount (through alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e., the first consumer for an @inode:@offset
+ * tuple).  The creation refcount stops unregister_uprobe from freeing the
+ * @uprobe even before the register operation is complete.  The creation
+ * refcount is released when the last @consumer for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successfully install probes,
+ * else return 0 (success).
+ */
+int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	struct uprobe *uprobe;
+	int ret = -EINVAL;
+
+	if (!inode || !consumer || consumer->next)
+		return ret;
+
+	if (offset > i_size_read(inode))
+		return ret;
+
+	ret = 0;
+	mutex_lock(uprobes_hash(inode));
+	uprobe = alloc_uprobe(inode, offset);
+	if (uprobe && !add_consumer(uprobe, consumer)) {
+		ret = __register_uprobe(uprobe);
+		if (ret) {
+			uprobe->consumers = NULL;
+			__unregister_uprobe(uprobe);
+		} else
+			uprobe->flags |= UPROBES_RUN_HANDLER;
+	}
+
+	mutex_unlock(uprobes_hash(inode));
+	put_uprobe(uprobe);
+
+	return ret;
+}
+
+/*
+ * unregister_uprobe - unregister an already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @consumer: identify which probe if multiple probes are colocated.
+ */
+void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	struct uprobe *uprobe = NULL;
+
+	if (!inode || !consumer)
+		return;
+
+	uprobe = find_uprobe(inode, offset);
+	if (!uprobe)
+		return;
+
+	mutex_lock(uprobes_hash(inode));
+	if (!del_consumer(uprobe, consumer))
+		goto unreg_out;
+
+	if (!uprobe->consumers) {
+		__unregister_uprobe(uprobe);
+		uprobe->flags &= ~UPROBES_RUN_HANDLER;
+	}
+
+unreg_out:
+	mutex_unlock(uprobes_hash(inode));
+	if (uprobe)
+		put_uprobe(uprobe);
+}
+
+/*
+ * Of all the nodes that correspond to the given inode, return the node
+ * with the least offset.
+ */
+static struct rb_node *find_least_offset_node(struct inode *inode)
+{
+	struct uprobe u = { .inode = inode, .offset = 0};
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct rb_node *close_node = NULL;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+		if (uprobe->inode == inode)
+			close_node = n;
+
+		if (!match)
+			return close_node;
+
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+	return close_node;
+}
+
+/*
+ * For a given inode, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode, struct list_head *head)
+{
+	struct uprobe *uprobe;
+	struct rb_node *n;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	n = find_least_offset_node(inode);
+	for (; n; n = rb_next(n)) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		if (uprobe->inode != inode)
+			break;
+
+		list_add(&uprobe->pending_list, head);
+		atomic_inc(&uprobe->ref);
+	}
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+}
+
+/*
+ * Called from mmap_region.
+ * Called with mm->mmap_sem acquired.
+ *
+ * Return a negative errno if we fail to insert probes and cannot
+ * bail out.
+ * Return 0 otherwise, i.e.:
+ *	- successful insertion of probes
+ *	- (or) no possible probes to be inserted.
+ *	- (or) insertion of probes failed but we can bail out.
+ */
+int mmap_uprobe(struct vm_area_struct *vma)
+{
+	struct list_head tmp_list;
+	struct uprobe *uprobe, *u;
+	struct inode *inode;
+	int ret = 0;
+
+	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+		return ret;	/* Bail out */
+
+	inode = vma->vm_file->f_mapping->host;
+	if (!inode)
+		return ret;
+
+	INIT_LIST_HEAD(&tmp_list);
+	mutex_lock(uprobes_mmap_hash(inode));
+	build_probe_list(inode, &tmp_list);
+	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+		loff_t vaddr;
+
+		list_del(&uprobe->pending_list);
+		if (!ret) {
+			vaddr = vma_address(vma, uprobe->offset);
+			if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
+				put_uprobe(uprobe);
+				continue;
+			}
+			ret = install_breakpoint(vma->vm_mm, uprobe, vma,
+									vaddr);
+			if (ret == -EEXIST)
+				ret = 0;
+		}
+		put_uprobe(uprobe);
+	}
+
+	mutex_unlock(uprobes_mmap_hash(inode));
+
+	return ret;
+}
+
+static int __init init_uprobes(void)
+{
+	int i;
+
+	for (i = 0; i < UPROBES_HASH_SZ; i++) {
+		mutex_init(&uprobes_mutex[i]);
+		mutex_init(&uprobes_mmap_mutex[i]);
+	}
+	return 0;
+}
+
+static void __exit exit_uprobes(void)
+{
+}
+
+module_init(init_uprobes);
+module_exit(exit_uprobes);
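
[Editor's note] Both write_opcode() and mmap_uprobe() above lean on vma_address() to translate the file-relative probe offset into a virtual address inside a particular mapping. Below is a standalone restatement of that arithmetic with made-up example numbers; it is illustrative only and not part of the patch.

#include <stdio.h>

#define PAGE_SHIFT 12

/* vma_address() from the patch: vm_start corresponds to file page
 * vm_pgoff, so back that base out before adding the file offset. */
static unsigned long demo_vma_address(unsigned long vm_start,
				      unsigned long vm_pgoff,
				      unsigned long offset)
{
	return vm_start + offset - (vm_pgoff << PAGE_SHIFT);
}

int main(void)
{
	/* Example: text mapped at 0x400000 starting at file page 4
	 * (file offset 0x4000); a probe at file offset 0x5340 lands
	 * at 0x400000 + 0x5340 - 0x4000 = 0x401340. */
	printf("vaddr = 0x%lx\n", demo_vma_address(0x400000UL, 4, 0x5340UL));
	return 0;
}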
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f758c7f4c81..1aed183636d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
 #include <linux/perf_event.h>
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
+#include <linux/uprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -616,6 +617,13 @@ again: remove_next = 1 + (end > next->vm_end);
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 
+	if (root) {
+		mmap_uprobe(vma);
+
+		if (adjust_next)
+			mmap_uprobe(next);
+	}
+
 	if (remove_next) {
 		if (file) {
 			fput(file);
@@ -637,6 +645,8 @@ again: remove_next = 1 + (end > next->vm_end);
 			goto again;
 		}
 	}
+	if (insert && file)
+		mmap_uprobe(insert);
 
 	validate_mm(mm);
 
@@ -1329,6 +1339,11 @@ out:
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
+
+	if (file && mmap_uprobe(vma))
+		/* matching probes but cannot insert */
+		goto unmap_and_free_vma;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2285,6 +2300,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if ((vma->vm_flags & VM_ACCOUNT) &&
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
 		return -ENOMEM;
+
+	if (vma->vm_file && mmap_uprobe(vma))
+		return -EINVAL;
+
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 	return 0;
 }
@@ -2354,6 +2373,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma->vm_pgoff = pgoff;
 		if (new_vma->vm_file) {
 			get_file(new_vma->vm_file);
+
+			if (mmap_uprobe(new_vma))
+				goto out_free_mempol;
+
 			if (vma->vm_flags & VM_EXECUTABLE)
 				added_exe_file_vma(mm);
 		}