diff options
Diffstat (limited to 'arch/x86_64/kernel/kprobes.c')
-rw-r--r-- | arch/x86_64/kernel/kprobes.c | 631 |
1 files changed, 631 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c new file mode 100644 index 000000000000..4f2a852299b6 --- /dev/null +++ b/arch/x86_64/kernel/kprobes.c | |||
@@ -0,0 +1,631 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/x86_64/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi | ||
27 | * <prasanna@in.ibm.com> adapted for x86_64 | ||
28 | * 2005-Mar Roland McGrath <roland@redhat.com> | ||
29 | * Fixed to handle %rip-relative addressing mode correctly. | ||
30 | */ | ||
31 | |||
32 | #include <linux/config.h> | ||
33 | #include <linux/kprobes.h> | ||
34 | #include <linux/ptrace.h> | ||
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/preempt.h> | ||
39 | #include <linux/moduleloader.h> | ||
40 | |||
41 | #include <asm/pgtable.h> | ||
42 | #include <asm/kdebug.h> | ||
43 | |||
44 | static DECLARE_MUTEX(kprobe_mutex); | ||
45 | |||
46 | /* kprobe_status settings */ | ||
47 | #define KPROBE_HIT_ACTIVE 0x00000001 | ||
48 | #define KPROBE_HIT_SS 0x00000002 | ||
49 | |||
50 | static struct kprobe *current_kprobe; | ||
51 | static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; | ||
52 | static struct pt_regs jprobe_saved_regs; | ||
53 | static long *jprobe_saved_rsp; | ||
54 | static kprobe_opcode_t *get_insn_slot(void); | ||
55 | static void free_insn_slot(kprobe_opcode_t *slot); | ||
56 | void jprobe_return_end(void); | ||
57 | |||
58 | /* copy of the kernel stack at the probe fire time */ | ||
59 | static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; | ||
60 | |||
61 | /* | ||
62 | * returns non-zero if opcode modifies the interrupt flag. | ||
63 | */ | ||
64 | static inline int is_IF_modifier(kprobe_opcode_t *insn) | ||
65 | { | ||
66 | switch (*insn) { | ||
67 | case 0xfa: /* cli */ | ||
68 | case 0xfb: /* sti */ | ||
69 | case 0xcf: /* iret/iretd */ | ||
70 | case 0x9d: /* popf/popfd */ | ||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) | ||
75 | return 1; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | int arch_prepare_kprobe(struct kprobe *p) | ||
80 | { | ||
81 | /* insn: must be on special executable page on x86_64. */ | ||
82 | up(&kprobe_mutex); | ||
83 | p->ainsn.insn = get_insn_slot(); | ||
84 | down(&kprobe_mutex); | ||
85 | if (!p->ainsn.insn) { | ||
86 | return -ENOMEM; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Determine if the instruction uses the %rip-relative addressing mode. | ||
93 | * If it does, return the address of the 32-bit displacement word. | ||
94 | * If not, return null. | ||
95 | */ | ||
96 | static inline s32 *is_riprel(u8 *insn) | ||
97 | { | ||
98 | #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ | ||
99 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
100 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
101 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
102 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
103 | << (row % 64)) | ||
104 | static const u64 onebyte_has_modrm[256 / 64] = { | ||
105 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
106 | /* ------------------------------- */ | ||
107 | W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ | ||
108 | W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ | ||
109 | W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ | ||
110 | W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ | ||
111 | W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ | ||
112 | W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ | ||
113 | W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ | ||
114 | W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ | ||
115 | W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ | ||
116 | W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ | ||
117 | W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ | ||
118 | W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ | ||
119 | W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ | ||
120 | W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ | ||
121 | W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ | ||
122 | W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ | ||
123 | /* ------------------------------- */ | ||
124 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
125 | }; | ||
126 | static const u64 twobyte_has_modrm[256 / 64] = { | ||
127 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
128 | /* ------------------------------- */ | ||
129 | W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ | ||
130 | W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ | ||
131 | W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ | ||
132 | W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ | ||
133 | W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ | ||
134 | W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ | ||
135 | W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ | ||
136 | W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ | ||
137 | W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ | ||
138 | W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ | ||
139 | W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ | ||
140 | W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ | ||
141 | W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ | ||
142 | W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ | ||
143 | W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ | ||
144 | W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ | ||
145 | /* ------------------------------- */ | ||
146 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
147 | }; | ||
148 | #undef W | ||
149 | int need_modrm; | ||
150 | |||
151 | /* Skip legacy instruction prefixes. */ | ||
152 | while (1) { | ||
153 | switch (*insn) { | ||
154 | case 0x66: | ||
155 | case 0x67: | ||
156 | case 0x2e: | ||
157 | case 0x3e: | ||
158 | case 0x26: | ||
159 | case 0x64: | ||
160 | case 0x65: | ||
161 | case 0x36: | ||
162 | case 0xf0: | ||
163 | case 0xf3: | ||
164 | case 0xf2: | ||
165 | ++insn; | ||
166 | continue; | ||
167 | } | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | /* Skip REX instruction prefix. */ | ||
172 | if ((*insn & 0xf0) == 0x40) | ||
173 | ++insn; | ||
174 | |||
175 | if (*insn == 0x0f) { /* Two-byte opcode. */ | ||
176 | ++insn; | ||
177 | need_modrm = test_bit(*insn, twobyte_has_modrm); | ||
178 | } else { /* One-byte opcode. */ | ||
179 | need_modrm = test_bit(*insn, onebyte_has_modrm); | ||
180 | } | ||
181 | |||
182 | if (need_modrm) { | ||
183 | u8 modrm = *++insn; | ||
184 | if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ | ||
185 | /* Displacement follows ModRM byte. */ | ||
186 | return (s32 *) ++insn; | ||
187 | } | ||
188 | } | ||
189 | |||
190 | /* No %rip-relative addressing mode here. */ | ||
191 | return NULL; | ||
192 | } | ||
193 | |||
194 | void arch_copy_kprobe(struct kprobe *p) | ||
195 | { | ||
196 | s32 *ripdisp; | ||
197 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); | ||
198 | ripdisp = is_riprel(p->ainsn.insn); | ||
199 | if (ripdisp) { | ||
200 | /* | ||
201 | * The copied instruction uses the %rip-relative | ||
202 | * addressing mode. Adjust the displacement for the | ||
203 | * difference between the original location of this | ||
204 | * instruction and the location of the copy that will | ||
205 | * actually be run. The tricky bit here is making sure | ||
206 | * that the sign extension happens correctly in this | ||
207 | * calculation, since we need a signed 32-bit result to | ||
208 | * be sign-extended to 64 bits when it's added to the | ||
209 | * %rip value and yield the same 64-bit result that the | ||
210 | * sign-extension of the original signed 32-bit | ||
211 | * displacement would have given. | ||
212 | */ | ||
213 | s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; | ||
214 | BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ | ||
215 | *ripdisp = disp; | ||
216 | } | ||
217 | } | ||
218 | |||
219 | void arch_remove_kprobe(struct kprobe *p) | ||
220 | { | ||
221 | up(&kprobe_mutex); | ||
222 | free_insn_slot(p->ainsn.insn); | ||
223 | down(&kprobe_mutex); | ||
224 | } | ||
225 | |||
226 | static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) | ||
227 | { | ||
228 | *p->addr = p->opcode; | ||
229 | regs->rip = (unsigned long)p->addr; | ||
230 | } | ||
231 | |||
232 | static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
233 | { | ||
234 | regs->eflags |= TF_MASK; | ||
235 | regs->eflags &= ~IF_MASK; | ||
236 | /*single step inline if the instruction is an int3*/ | ||
237 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
238 | regs->rip = (unsigned long)p->addr; | ||
239 | else | ||
240 | regs->rip = (unsigned long)p->ainsn.insn; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Interrupts are disabled on entry as trap3 is an interrupt gate and they | ||
245 | * remain disabled thorough out this function. | ||
246 | */ | ||
247 | int kprobe_handler(struct pt_regs *regs) | ||
248 | { | ||
249 | struct kprobe *p; | ||
250 | int ret = 0; | ||
251 | kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); | ||
252 | |||
253 | /* We're in an interrupt, but this is clear and BUG()-safe. */ | ||
254 | preempt_disable(); | ||
255 | |||
256 | /* Check we're not actually recursing */ | ||
257 | if (kprobe_running()) { | ||
258 | /* We *are* holding lock here, so this is safe. | ||
259 | Disarm the probe we just hit, and ignore it. */ | ||
260 | p = get_kprobe(addr); | ||
261 | if (p) { | ||
262 | if (kprobe_status == KPROBE_HIT_SS) { | ||
263 | regs->eflags &= ~TF_MASK; | ||
264 | regs->eflags |= kprobe_saved_rflags; | ||
265 | unlock_kprobes(); | ||
266 | goto no_kprobe; | ||
267 | } | ||
268 | disarm_kprobe(p, regs); | ||
269 | ret = 1; | ||
270 | } else { | ||
271 | p = current_kprobe; | ||
272 | if (p->break_handler && p->break_handler(p, regs)) { | ||
273 | goto ss_probe; | ||
274 | } | ||
275 | } | ||
276 | /* If it's not ours, can't be delete race, (we hold lock). */ | ||
277 | goto no_kprobe; | ||
278 | } | ||
279 | |||
280 | lock_kprobes(); | ||
281 | p = get_kprobe(addr); | ||
282 | if (!p) { | ||
283 | unlock_kprobes(); | ||
284 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
285 | /* | ||
286 | * The breakpoint instruction was removed right | ||
287 | * after we hit it. Another cpu has removed | ||
288 | * either a probepoint or a debugger breakpoint | ||
289 | * at this address. In either case, no further | ||
290 | * handling of this interrupt is appropriate. | ||
291 | */ | ||
292 | ret = 1; | ||
293 | } | ||
294 | /* Not one of ours: let kernel handle it */ | ||
295 | goto no_kprobe; | ||
296 | } | ||
297 | |||
298 | kprobe_status = KPROBE_HIT_ACTIVE; | ||
299 | current_kprobe = p; | ||
300 | kprobe_saved_rflags = kprobe_old_rflags | ||
301 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
302 | if (is_IF_modifier(p->ainsn.insn)) | ||
303 | kprobe_saved_rflags &= ~IF_MASK; | ||
304 | |||
305 | if (p->pre_handler && p->pre_handler(p, regs)) | ||
306 | /* handler has already set things up, so skip ss setup */ | ||
307 | return 1; | ||
308 | |||
309 | ss_probe: | ||
310 | prepare_singlestep(p, regs); | ||
311 | kprobe_status = KPROBE_HIT_SS; | ||
312 | return 1; | ||
313 | |||
314 | no_kprobe: | ||
315 | preempt_enable_no_resched(); | ||
316 | return ret; | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Called after single-stepping. p->addr is the address of the | ||
321 | * instruction whose first byte has been replaced by the "int 3" | ||
322 | * instruction. To avoid the SMP problems that can occur when we | ||
323 | * temporarily put back the original opcode to single-step, we | ||
324 | * single-stepped a copy of the instruction. The address of this | ||
325 | * copy is p->ainsn.insn. | ||
326 | * | ||
327 | * This function prepares to return from the post-single-step | ||
328 | * interrupt. We have to fix up the stack as follows: | ||
329 | * | ||
330 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
331 | * the new rip is relative to the copied instruction. We need to make | ||
332 | * it relative to the original instruction. | ||
333 | * | ||
334 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
335 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
336 | * | ||
337 | * 2) If the single-stepped instruction was a call, the return address | ||
338 | * that is atop the stack is the address following the copied instruction. | ||
339 | * We need to make it the address following the original instruction. | ||
340 | */ | ||
341 | static void resume_execution(struct kprobe *p, struct pt_regs *regs) | ||
342 | { | ||
343 | unsigned long *tos = (unsigned long *)regs->rsp; | ||
344 | unsigned long next_rip = 0; | ||
345 | unsigned long copy_rip = (unsigned long)p->ainsn.insn; | ||
346 | unsigned long orig_rip = (unsigned long)p->addr; | ||
347 | kprobe_opcode_t *insn = p->ainsn.insn; | ||
348 | |||
349 | /*skip the REX prefix*/ | ||
350 | if (*insn >= 0x40 && *insn <= 0x4f) | ||
351 | insn++; | ||
352 | |||
353 | switch (*insn) { | ||
354 | case 0x9c: /* pushfl */ | ||
355 | *tos &= ~(TF_MASK | IF_MASK); | ||
356 | *tos |= kprobe_old_rflags; | ||
357 | break; | ||
358 | case 0xe8: /* call relative - Fix return addr */ | ||
359 | *tos = orig_rip + (*tos - copy_rip); | ||
360 | break; | ||
361 | case 0xff: | ||
362 | if ((*insn & 0x30) == 0x10) { | ||
363 | /* call absolute, indirect */ | ||
364 | /* Fix return addr; rip is correct. */ | ||
365 | next_rip = regs->rip; | ||
366 | *tos = orig_rip + (*tos - copy_rip); | ||
367 | } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
368 | ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
369 | /* rip is correct. */ | ||
370 | next_rip = regs->rip; | ||
371 | } | ||
372 | break; | ||
373 | case 0xea: /* jmp absolute -- rip is correct */ | ||
374 | next_rip = regs->rip; | ||
375 | break; | ||
376 | default: | ||
377 | break; | ||
378 | } | ||
379 | |||
380 | regs->eflags &= ~TF_MASK; | ||
381 | if (next_rip) { | ||
382 | regs->rip = next_rip; | ||
383 | } else { | ||
384 | regs->rip = orig_rip + (regs->rip - copy_rip); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Interrupts are disabled on entry as trap1 is an interrupt gate and they | ||
390 | * remain disabled thoroughout this function. And we hold kprobe lock. | ||
391 | */ | ||
392 | int post_kprobe_handler(struct pt_regs *regs) | ||
393 | { | ||
394 | if (!kprobe_running()) | ||
395 | return 0; | ||
396 | |||
397 | if (current_kprobe->post_handler) | ||
398 | current_kprobe->post_handler(current_kprobe, regs, 0); | ||
399 | |||
400 | resume_execution(current_kprobe, regs); | ||
401 | regs->eflags |= kprobe_saved_rflags; | ||
402 | |||
403 | unlock_kprobes(); | ||
404 | preempt_enable_no_resched(); | ||
405 | |||
406 | /* | ||
407 | * if somebody else is singlestepping across a probe point, eflags | ||
408 | * will have TF set, in which case, continue the remaining processing | ||
409 | * of do_debug, as if this is not a probe hit. | ||
410 | */ | ||
411 | if (regs->eflags & TF_MASK) | ||
412 | return 0; | ||
413 | |||
414 | return 1; | ||
415 | } | ||
416 | |||
417 | /* Interrupts disabled, kprobe_lock held. */ | ||
418 | int kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
419 | { | ||
420 | if (current_kprobe->fault_handler | ||
421 | && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) | ||
422 | return 1; | ||
423 | |||
424 | if (kprobe_status & KPROBE_HIT_SS) { | ||
425 | resume_execution(current_kprobe, regs); | ||
426 | regs->eflags |= kprobe_old_rflags; | ||
427 | |||
428 | unlock_kprobes(); | ||
429 | preempt_enable_no_resched(); | ||
430 | } | ||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Wrapper routine for handling exceptions. | ||
436 | */ | ||
437 | int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | ||
438 | void *data) | ||
439 | { | ||
440 | struct die_args *args = (struct die_args *)data; | ||
441 | switch (val) { | ||
442 | case DIE_INT3: | ||
443 | if (kprobe_handler(args->regs)) | ||
444 | return NOTIFY_STOP; | ||
445 | break; | ||
446 | case DIE_DEBUG: | ||
447 | if (post_kprobe_handler(args->regs)) | ||
448 | return NOTIFY_STOP; | ||
449 | break; | ||
450 | case DIE_GPF: | ||
451 | if (kprobe_running() && | ||
452 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
453 | return NOTIFY_STOP; | ||
454 | break; | ||
455 | case DIE_PAGE_FAULT: | ||
456 | if (kprobe_running() && | ||
457 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
458 | return NOTIFY_STOP; | ||
459 | break; | ||
460 | default: | ||
461 | break; | ||
462 | } | ||
463 | return NOTIFY_DONE; | ||
464 | } | ||
465 | |||
466 | int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
467 | { | ||
468 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
469 | unsigned long addr; | ||
470 | |||
471 | jprobe_saved_regs = *regs; | ||
472 | jprobe_saved_rsp = (long *) regs->rsp; | ||
473 | addr = (unsigned long)jprobe_saved_rsp; | ||
474 | /* | ||
475 | * As Linus pointed out, gcc assumes that the callee | ||
476 | * owns the argument space and could overwrite it, e.g. | ||
477 | * tailcall optimization. So, to be absolutely safe | ||
478 | * we also save and restore enough stack bytes to cover | ||
479 | * the argument area. | ||
480 | */ | ||
481 | memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); | ||
482 | regs->eflags &= ~IF_MASK; | ||
483 | regs->rip = (unsigned long)(jp->entry); | ||
484 | return 1; | ||
485 | } | ||
486 | |||
487 | void jprobe_return(void) | ||
488 | { | ||
489 | preempt_enable_no_resched(); | ||
490 | asm volatile (" xchg %%rbx,%%rsp \n" | ||
491 | " int3 \n" | ||
492 | " .globl jprobe_return_end \n" | ||
493 | " jprobe_return_end: \n" | ||
494 | " nop \n"::"b" | ||
495 | (jprobe_saved_rsp):"memory"); | ||
496 | } | ||
497 | |||
498 | int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
499 | { | ||
500 | u8 *addr = (u8 *) (regs->rip - 1); | ||
501 | unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; | ||
502 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
503 | |||
504 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
505 | if ((long *)regs->rsp != jprobe_saved_rsp) { | ||
506 | struct pt_regs *saved_regs = | ||
507 | container_of(jprobe_saved_rsp, struct pt_regs, rsp); | ||
508 | printk("current rsp %p does not match saved rsp %p\n", | ||
509 | (long *)regs->rsp, jprobe_saved_rsp); | ||
510 | printk("Saved registers for jprobe %p\n", jp); | ||
511 | show_registers(saved_regs); | ||
512 | printk("Current registers\n"); | ||
513 | show_registers(regs); | ||
514 | BUG(); | ||
515 | } | ||
516 | *regs = jprobe_saved_regs; | ||
517 | memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, | ||
518 | MIN_STACK_SIZE(stack_addr)); | ||
519 | return 1; | ||
520 | } | ||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. | ||
526 | * By default on x86_64, pages we get from kmalloc or vmalloc are not | ||
527 | * executable. Single-stepping an instruction on such a page yields an | ||
528 | * oops. So instead of storing the instruction copies in their respective | ||
529 | * kprobe objects, we allocate a page, map it executable, and store all the | ||
530 | * instruction copies there. (We can allocate additional pages if somebody | ||
531 | * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE | ||
532 | * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) | ||
533 | * bytes. | ||
534 | */ | ||
535 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) | ||
536 | struct kprobe_insn_page { | ||
537 | struct hlist_node hlist; | ||
538 | kprobe_opcode_t *insns; /* page of instruction slots */ | ||
539 | char slot_used[INSNS_PER_PAGE]; | ||
540 | int nused; | ||
541 | }; | ||
542 | |||
543 | static struct hlist_head kprobe_insn_pages; | ||
544 | |||
545 | /** | ||
546 | * get_insn_slot() - Find a slot on an executable page for an instruction. | ||
547 | * We allocate an executable page if there's no room on existing ones. | ||
548 | */ | ||
549 | static kprobe_opcode_t *get_insn_slot(void) | ||
550 | { | ||
551 | struct kprobe_insn_page *kip; | ||
552 | struct hlist_node *pos; | ||
553 | |||
554 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
555 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
556 | if (kip->nused < INSNS_PER_PAGE) { | ||
557 | int i; | ||
558 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
559 | if (!kip->slot_used[i]) { | ||
560 | kip->slot_used[i] = 1; | ||
561 | kip->nused++; | ||
562 | return kip->insns + (i*MAX_INSN_SIZE); | ||
563 | } | ||
564 | } | ||
565 | /* Surprise! No unused slots. Fix kip->nused. */ | ||
566 | kip->nused = INSNS_PER_PAGE; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | ||
571 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | ||
572 | if (!kip) { | ||
573 | return NULL; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * For the %rip-relative displacement fixups to be doable, we | ||
578 | * need our instruction copy to be within +/- 2GB of any data it | ||
579 | * might access via %rip. That is, within 2GB of where the | ||
580 | * kernel image and loaded module images reside. So we allocate | ||
581 | * a page in the module loading area. | ||
582 | */ | ||
583 | kip->insns = module_alloc(PAGE_SIZE); | ||
584 | if (!kip->insns) { | ||
585 | kfree(kip); | ||
586 | return NULL; | ||
587 | } | ||
588 | INIT_HLIST_NODE(&kip->hlist); | ||
589 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); | ||
590 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | ||
591 | kip->slot_used[0] = 1; | ||
592 | kip->nused = 1; | ||
593 | return kip->insns; | ||
594 | } | ||
595 | |||
596 | /** | ||
597 | * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). | ||
598 | */ | ||
599 | static void free_insn_slot(kprobe_opcode_t *slot) | ||
600 | { | ||
601 | struct kprobe_insn_page *kip; | ||
602 | struct hlist_node *pos; | ||
603 | |||
604 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
605 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
606 | if (kip->insns <= slot | ||
607 | && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { | ||
608 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
609 | kip->slot_used[i] = 0; | ||
610 | kip->nused--; | ||
611 | if (kip->nused == 0) { | ||
612 | /* | ||
613 | * Page is no longer in use. Free it unless | ||
614 | * it's the last one. We keep the last one | ||
615 | * so as not to have to set it up again the | ||
616 | * next time somebody inserts a probe. | ||
617 | */ | ||
618 | hlist_del(&kip->hlist); | ||
619 | if (hlist_empty(&kprobe_insn_pages)) { | ||
620 | INIT_HLIST_NODE(&kip->hlist); | ||
621 | hlist_add_head(&kip->hlist, | ||
622 | &kprobe_insn_pages); | ||
623 | } else { | ||
624 | module_free(NULL, kip->insns); | ||
625 | kfree(kip); | ||
626 | } | ||
627 | } | ||
628 | return; | ||
629 | } | ||
630 | } | ||
631 | } | ||