author    Rusty Russell <rusty@rustcorp.com.au>    2006-12-06 20:14:08 -0500
committer Andi Kleen <andi@basil.nowhere.org>      2006-12-06 20:14:08 -0500
commit    139ec7c416248b9ea227d21839235344edfee1e0 (patch)
tree      54c396848b08367c0352c77f4633be6380a8eb16 /arch
parent    d3561b7fa0fb0fc583bab0eeda32bec9e4c4056d (diff)
[PATCH] paravirt: Patch inline replacements for paravirt intercepts
It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops.  These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns a
number of instructions (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

And: Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT.

And: AK: Fix warnings in x86-64 alternative.c build

And: AK: Fix compilation with defconfig

And: From: Andrew Morton <akpm@osdl.org>
Some binutils versions still like to emit references to
__stop_parainstructions and __start_parainstructions.

And: AK: Fix warnings about unused variables when PARAVIRT is disabled.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
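[Editorial sketch, not part of the commit: a minimal reduction of the scheme the message describes. The field names mirror struct paravirt_patch as used in the diff below (instr, instrtype, len, clobbers); patch_site, patch_fn, patch_all and nop_fill are invented stand-ins.]

    /* Illustrative reduction only; the real types and hooks live in the
     * i386 paravirt code.  One record per patchable call site, collected
     * into a dedicated ELF section by the call-site macros.
     */
    struct patch_site {
    	void *instr;		/* start of the patchable byte range */
    	unsigned char type;	/* which operation lives here */
    	unsigned char len;	/* room available at the site */
    	unsigned short clobbers; /* registers the site allows us to trash */
    };

    /* Backend hook: writes up to 'len' bytes at 'insns' and returns how
     * many it actually used; the caller NOP-pads the remainder.
     */
    typedef unsigned (*patch_fn)(unsigned char type, unsigned short clobbers,
    			     void *insns, unsigned len);

    static void patch_all(struct patch_site *start, struct patch_site *end,
    		      patch_fn patch, void (*nop_fill)(void *, unsigned))
    {
    	struct patch_site *p;

    	for (p = start; p < end; p++) {
    		unsigned used = patch(p->type, p->clobbers, p->instr, p->len);

    		/* whatever the backend didn't fill becomes NOPs */
    		nop_fill((char *)p->instr + used, p->len - used);
    	}
    }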
Diffstat (limited to 'arch')
-rw-r--r--	arch/i386/Kconfig.debug		10
-rw-r--r--	arch/i386/kernel/alternative.c	63
-rw-r--r--	arch/i386/kernel/entry.S	39
-rw-r--r--	arch/i386/kernel/module.c	11
-rw-r--r--	arch/i386/kernel/paravirt.c	44
-rw-r--r--	arch/i386/kernel/vmlinux.lds.S	6
6 files changed, 148 insertions(+), 25 deletions(-)
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c0802e1cc..f68cc6f215f8 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
 	  option saves about 4k and might cause you much additional grey
 	  hair.
 
+config DEBUG_PARAVIRT
+	bool "Enable some paravirtualization debugging"
+	default y
+	depends on PARAVIRT && DEBUG_KERNEL
+	help
+	  Currently deliberately clobbers regs which are allowed to be
+	  clobbered in inlined paravirt hooks, even in native mode.
+	  If turning this off solves a problem, then DISABLE_INTERRUPTS() or
+	  ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
+
 endmenu
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 535f9794fba1..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void)
 
 #endif /* CONFIG_X86_64 */
 
+static void nop_out(void *insns, unsigned int len)
+{
+	unsigned char **noptable = find_nop_table();
+
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, noptable[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
 
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
-	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
 	u8 *instr;
-	int diff, i, k;
+	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
@@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 #endif
 		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			memcpy(a->instr + i, noptable[k], k);
-		}
+		nop_out(instr + a->replacementlen, diff);
 	}
 }
 
@@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
-	unsigned char **noptable = find_nop_table();
 	u8 **ptr;
 
 	for (ptr = start; ptr < end; ptr++) {
@@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 			continue;
 		if (*ptr > text_end)
 			continue;
-		**ptr = noptable[1][0];
+		nop_out(*ptr, 1);
 	};
 }
 
@@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp)
 
 #endif
 
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+	struct paravirt_patch *p;
+
+	for (p = start; p < end; p++) {
+		unsigned int used;
+
+		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+					  p->len);
+#ifdef CONFIG_DEBUG_PARAVIRT
+		{
+			int i;
+			/* Deliberately clobber regs using "not %reg" to find bugs. */
+			for (i = 0; i < 3; i++) {
+				if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+					memcpy(p->instr + used, "\xf7\xd0", 2);
+					p->instr[used+1] |= i;
+					used += 2;
+				}
+			}
+		}
+#endif
+		/* Pad the rest with nops */
+		nop_out(p->instr + used, p->len - used);
+	}
+
+	/* Sync to be conservative, in case we patched following instructions */
+	sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+#endif	/* CONFIG_PARAVIRT */
+
 void __init alternative_instructions(void)
 {
 	unsigned long flags;
@@ -390,5 +430,6 @@ void __init alternative_instructions(void)
 		alternatives_smp_switch(0);
 	}
 #endif
+	apply_paravirt(__start_parainstructions, __stop_parainstructions);
 	local_irq_restore(flags);
 }
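[Editorial note on the two magic bytes in the CONFIG_DEBUG_PARAVIRT block above: 0xF7 with ModRM reg field /2 is the x86 NOT r/m32 opcode, and ModRM byte 0xD0 encodes "not %eax", with the low three r/m bits selecting the register. A hypothetical helper restating the same encoding in isolation:]

    /* Illustrative only: restates what the debug block above writes.
     * For i = 0, 1, 2 this yields "not %eax", "not %ecx", "not %edx",
     * matching the clobber bits tested via (p->clobbers & (1 << i)).
     * Running native code with an allowed register deliberately trashed
     * flushes out call sites that understate their clobbers.
     */
    static inline void emit_not_reg(unsigned char *at, int reg)
    {
    	at[0] = 0xf7;		/* NOT r/m32 (opcode group, reg field /2) */
    	at[1] = 0xd0 | reg;	/* ModRM: mod=11, /2, r/m=reg (0=eax,1=ecx,2=edx) */
    }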
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index d274612e05cd..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -53,6 +53,19 @@
 #include <asm/dwarf2.h>
 #include "irq_vectors.h"
 
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
 #define nr_syscalls ((syscall_table_size)/4)
 
 CF_MASK		= 0x00000001
@@ -63,9 +76,9 @@ NT_MASK = 0x00004000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
-#define preempt_stop
+#define preempt_stop(clobbers)
 #define resume_kernel		restore_nocheck
 #endif
 
@@ -226,7 +239,7 @@ ENTRY(ret_from_fork)
 	ALIGN
 	RING0_PTREGS_FRAME
 ret_from_exception:
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
@@ -237,7 +250,7 @@ check_userspace:
 	jb resume_kernel		# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -248,7 +261,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -277,7 +290,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -322,7 +335,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -364,7 +377,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -393,7 +406,7 @@ restore_nocheck_notrace:
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -436,7 +449,7 @@ ldt_ss:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_EAX)
 	TRACE_IRQS_OFF
 	lss (%esp), %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -451,7 +464,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -509,7 +522,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -693,7 +706,7 @@ ENTRY(device_not_available)
 	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 	call math_state_restore
 	jmp ret_from_exception
 device_not_available_emulate:
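[Editorial sketch: the comment block added at the top of entry.S documents the calling convention, but what these macros expand to lives in asm/paravirt.h, which this patch does not show. PARA_SITE below is an invented name and a rough, hypothetical shape for such a call-site macro; what is grounded in this patch is the record it emits, whose fields mirror struct paravirt_patch (instr, instrtype, len, clobbers) as consumed by apply_paravirt().]

    /* Hypothetical sketch, not the kernel's actual asm/paravirt.h.
     * Emit the default instruction sequence, then record the site in
     * the .parainstructions section so apply_paravirt() can later
     * patch it inline and NOP-pad whatever room is left over.
     */
    #define PARA_SITE(insn_string, typenum, clobber)		\
    	"771:\n\t" insn_string "\n772:\n"			\
    	".pushsection .parainstructions, \"a\"\n"		\
    	"  .long 771b\n"		/* -> p->instr     */	\
    	"  .byte " #typenum "\n"	/* -> p->instrtype */	\
    	"  .byte 772b-771b\n"		/* -> p->len       */	\
    	"  .short " #clobber "\n"	/* -> p->clobbers  */	\
    	".popsection"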
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..d7d9c8b23f72 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 		   const Elf_Shdr *sechdrs,
 		   struct module *me)
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 
 	if (alt) {
@@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr,
 				      lseg, lseg + locks->sh_size,
 				      tseg, tseg + text->sh_size);
 	}
+
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
+
 	return 0;
 }
 
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 478192cd4b90..d46460426446 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -45,6 +45,49 @@ char *memory_setup(void)
 	return paravirt_ops.memory_setup();
 }
 
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code)					\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+	const char *start, *end;
+} native_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+	[PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+		return len;
+
+	insn_len = native_insns[type].end - native_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, native_insns[type].start, insn_len);
+	return insn_len;
+}
+
 static fastcall unsigned long native_get_debugreg(int regno)
 {
 	unsigned long val = 0; 	/* Damn you, gcc! */
@@ -349,6 +392,7 @@ struct paravirt_ops paravirt_ops = {
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 
+	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = native_nop,
 	.memory_setup = machine_specific_memory_setup,
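[Editorial note: to make the macro above concrete, DEF_NATIVE(cli, "cli") expands mechanically, per its definition in this patch, to:]

    /* DEF_NATIVE(cli, "cli") after the preprocessor: */
    extern const char start_cli[], end_cli[];
    asm("start_cli: cli; end_cli:");

[start_cli/end_cli bracket the native instruction bytes in .text, so end_cli - start_cli is the replacement length native_patch() compares against the room available at each call site before memcpy()ing the bytes inline.]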
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6860f20aa579..5c69cf0e5944 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -165,6 +165,12 @@ SECTIONS
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
+  . = ALIGN(4);
+  __start_parainstructions = .;
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+	*(.parainstructions)
+  }
+  __stop_parainstructions = .;
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }