aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/lguest/x86
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/lguest/x86')
-rw-r--r--drivers/lguest/x86/core.c120
-rw-r--r--drivers/lguest/x86/switcher_32.S71
2 files changed, 113 insertions, 78 deletions
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 09d9207420dc..482aec2a9631 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -63,7 +63,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
63static DEFINE_PER_CPU(struct lguest *, last_guest); 63static DEFINE_PER_CPU(struct lguest *, last_guest);
64 64
65/*S:010 65/*S:010
66 * We are getting close to the Switcher. 66 * We approach the Switcher.
67 * 67 *
68 * Remember that each CPU has two pages which are visible to the Guest when it 68 * Remember that each CPU has two pages which are visible to the Guest when it
69 * runs on that CPU. This has to contain the state for that Guest: we copy the 69 * runs on that CPU. This has to contain the state for that Guest: we copy the
@@ -134,7 +134,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
134 * 134 *
135 * The lcall also pushes the old code segment (KERNEL_CS) onto the 135 * The lcall also pushes the old code segment (KERNEL_CS) onto the
136 * stack, then the address of this call. This stack layout happens to 136 * stack, then the address of this call. This stack layout happens to
137 * exactly match the stack of an interrupt... */ 137 * exactly match the stack layout created by an interrupt... */
138 asm volatile("pushf; lcall *lguest_entry" 138 asm volatile("pushf; lcall *lguest_entry"
139 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 139 /* This is how we tell GCC that %eax ("a") and %ebx ("b")
140 * are changed by this routine. The "=" means output. */ 140 * are changed by this routine. The "=" means output. */
@@ -151,40 +151,46 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
151} 151}
152/*:*/ 152/*:*/
153 153
154/*M:002 There are hooks in the scheduler which we can register to tell when we
155 * get kicked off the CPU (preempt_notifier_register()). This would allow us
156 * to lazily disable SYSENTER which would regain some performance, and should
157 * also simplify copy_in_guest_info(). Note that we'd still need to restore
158 * things when we exit to Launcher userspace, but that's fairly easy.
159 *
160 * The hooks were designed for KVM, but we can also put them to good use. :*/
161
154/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
155 * are disabled: we own the CPU. */ 163 * are disabled: we own the CPU. */
156void lguest_arch_run_guest(struct lguest *lg) 164void lguest_arch_run_guest(struct lguest *lg)
157{ 165{
158 /* Remember the awfully-named TS bit? If the Guest has asked 166 /* Remember the awfully-named TS bit? If the Guest has asked to set it
159 * to set it we set it now, so we can trap and pass that trap 167 * we set it now, so we can trap and pass that trap to the Guest if it
160 * to the Guest if it uses the FPU. */ 168 * uses the FPU. */
161 if (lg->ts) 169 if (lg->ts)
162 lguest_set_ts(); 170 lguest_set_ts();
163 171
164 /* SYSENTER is an optimized way of doing system calls. We 172 /* SYSENTER is an optimized way of doing system calls. We can't allow
165 * can't allow it because it always jumps to privilege level 0. 173 * it because it always jumps to privilege level 0. A normal Guest
166 * A normal Guest won't try it because we don't advertise it in 174 * won't try it because we don't advertise it in CPUID, but a malicious
167 * CPUID, but a malicious Guest (or malicious Guest userspace 175 * Guest (or malicious Guest userspace program) could, so we tell the
168 * program) could, so we tell the CPU to disable it before 176 * CPU to disable it before running the Guest. */
169 * running the Guest. */
170 if (boot_cpu_has(X86_FEATURE_SEP)) 177 if (boot_cpu_has(X86_FEATURE_SEP))
171 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 178 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
172 179
173 /* Now we actually run the Guest. It will pop back out when 180 /* Now we actually run the Guest. It will return when something
174 * something interesting happens, and we can examine its 181 * interesting happens, and we can examine its registers to see what it
175 * registers to see what it was doing. */ 182 * was doing. */
176 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 183 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
177 184
178 /* The "regs" pointer contains two extra entries which are not 185 /* Note that the "regs" pointer contains two extra entries which are
179 * really registers: a trap number which says what interrupt or 186 * not really registers: a trap number which says what interrupt or
180 * trap made the switcher code come back, and an error code 187 * trap made the switcher code come back, and an error code which some
181 * which some traps set. */ 188 * traps set. */
182 189
183 /* If the Guest page faulted, then the cr2 register will tell 190 /* If the Guest page faulted, then the cr2 register will tell us the
184 * us the bad virtual address. We have to grab this now, 191 * bad virtual address. We have to grab this now, because once we
185 * because once we re-enable interrupts an interrupt could 192 * re-enable interrupts an interrupt could fault and thus overwrite
186 * fault and thus overwrite cr2, or we could even move off to a 193 * cr2, or we could even move off to a different CPU. */
187 * different CPU. */
188 if (lg->regs->trapnum == 14) 194 if (lg->regs->trapnum == 14)
189 lg->arch.last_pagefault = read_cr2(); 195 lg->arch.last_pagefault = read_cr2();
190 /* Similarly, if we took a trap because the Guest used the FPU, 196 /* Similarly, if we took a trap because the Guest used the FPU,
@@ -197,14 +203,15 @@ void lguest_arch_run_guest(struct lguest *lg)
197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 203 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
198} 204}
199 205
200/*H:130 Our Guest is usually so well behaved; it never tries to do things it 206/*H:130 Now we've examined the hypercall code; our Guest can make requests.
201 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 207 * Our Guest is usually so well behaved; it never tries to do things it isn't
202 * quite complete, because it doesn't contain replacements for the Intel I/O 208 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
203 * instructions. As a result, the Guest sometimes fumbles across one during 209 * infrastructure isn't quite complete, because it doesn't contain replacements
204 * the boot process as it probes for various things which are usually attached 210 * for the Intel I/O instructions. As a result, the Guest sometimes fumbles
205 * to a PC. 211 * across one during the boot process as it probes for various things which are
212 * usually attached to a PC.
206 * 213 *
207 * When the Guest uses one of these instructions, we get trap #13 (General 214 * When the Guest uses one of these instructions, we get a trap (General
208 * Protection Fault) and come here. We see if it's one of those troublesome 215 * Protection Fault) and come here. We see if it's one of those troublesome
209 * instructions and skip over it. We return true if we did. */ 216 * instructions and skip over it. We return true if we did. */
210static int emulate_insn(struct lguest *lg) 217static int emulate_insn(struct lguest *lg)
@@ -275,43 +282,43 @@ static int emulate_insn(struct lguest *lg)
275void lguest_arch_handle_trap(struct lguest *lg) 282void lguest_arch_handle_trap(struct lguest *lg)
276{ 283{
277 switch (lg->regs->trapnum) { 284 switch (lg->regs->trapnum) {
278 case 13: /* We've intercepted a GPF. */ 285 case 13: /* We've intercepted a General Protection Fault. */
279 /* Check if this was one of those annoying IN or OUT 286 /* Check if this was one of those annoying IN or OUT
280 * instructions which we need to emulate. If so, we 287 * instructions which we need to emulate. If so, we just go
281 * just go back into the Guest after we've done it. */ 288 * back into the Guest after we've done it. */
282 if (lg->regs->errcode == 0) { 289 if (lg->regs->errcode == 0) {
283 if (emulate_insn(lg)) 290 if (emulate_insn(lg))
284 return; 291 return;
285 } 292 }
286 break; 293 break;
287 case 14: /* We've intercepted a page fault. */ 294 case 14: /* We've intercepted a Page Fault. */
288 /* The Guest accessed a virtual address that wasn't 295 /* The Guest accessed a virtual address that wasn't mapped.
289 * mapped. This happens a lot: we don't actually set 296 * This happens a lot: we don't actually set up most of the
290 * up most of the page tables for the Guest at all when 297 * page tables for the Guest at all when we start: as it runs
291 * we start: as it runs it asks for more and more, and 298 * it asks for more and more, and we set them up as
292 * we set them up as required. In this case, we don't 299 * required. In this case, we don't even tell the Guest that
293 * even tell the Guest that the fault happened. 300 * the fault happened.
294 * 301 *
295 * The errcode tells whether this was a read or a 302 * The errcode tells whether this was a read or a write, and
296 * write, and whether kernel or userspace code. */ 303 * whether kernel or userspace code. */
297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 304 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
298 return; 305 return;
299 306
300 /* OK, it's really not there (or not OK): the Guest 307 /* OK, it's really not there (or not OK): the Guest needs to
301 * needs to know. We write out the cr2 value so it 308 * know. We write out the cr2 value so it knows where the
302 * knows where the fault occurred. 309 * fault occurred.
303 * 310 *
304 * Note that if the Guest were really messed up, this 311 * Note that if the Guest were really messed up, this could
305 * could happen before it's done the INITIALIZE 312 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
306 * hypercall, so lg->lguest_data will be NULL */ 313 * lg->lguest_data could be NULL */
307 if (lg->lguest_data && 314 if (lg->lguest_data &&
308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 315 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
309 kill_guest(lg, "Writing cr2"); 316 kill_guest(lg, "Writing cr2");
310 break; 317 break;
311 case 7: /* We've intercepted a Device Not Available fault. */ 318 case 7: /* We've intercepted a Device Not Available fault. */
312 /* If the Guest doesn't want to know, we already 319 /* If the Guest doesn't want to know, we already restored the
313 * restored the Floating Point Unit, so we just 320 * Floating Point Unit, so we just continue without telling
314 * continue without telling it. */ 321 * it. */
315 if (!lg->ts) 322 if (!lg->ts)
316 return; 323 return;
317 break; 324 break;
@@ -536,9 +543,6 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
536 543
537 return 0; 544 return 0;
538} 545}
539/* Now we've examined the hypercall code; our Guest can make requests. There
540 * is one other way we can do things for the Guest, as we see in
541 * emulate_insn(). :*/
542 546
543/*L:030 lguest_arch_setup_regs() 547/*L:030 lguest_arch_setup_regs()
544 * 548 *
@@ -570,8 +574,8 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
570 574
571 /* %esi points to our boot information, at physical address 0, so don't 575 /* %esi points to our boot information, at physical address 0, so don't
572 * touch it. */ 576 * touch it. */
577
573 /* There are a couple of GDT entries the Guest expects when first 578 /* There are a couple of GDT entries the Guest expects when first
574 * booting. */ 579 * booting. */
575
576 setup_guest_gdt(lg); 580 setup_guest_gdt(lg);
577} 581}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 1010b90b11fc..0af8baaa0d4a 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -6,6 +6,37 @@
6 * are feeling invigorated and refreshed then the next, more challenging stage 6 * are feeling invigorated and refreshed then the next, more challenging stage
7 * can be found in "make Guest". :*/ 7 * can be found in "make Guest". :*/
8 8
9/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
10 * gain at least 1% more performance. Since neither LOC nor performance can be
11 * measured beforehand, it generally means implementing a feature then deciding
12 * if it's worth it. And once it's implemented, who can say no?
13 *
14 * This is why I haven't implemented this idea myself. I want to, but I
15 * haven't. You could, though.
16 *
17 * The main place where lguest performance sucks is Guest page faulting. When
18 * a Guest userspace process hits an unmapped page we switch back to the Host,
19 * walk the page tables, find it's not mapped, switch back to the Guest page
20 * fault handler, which calls a hypercall to set the page table entry, then
21 * finally returns to userspace. That's two round-trips.
22 *
23 * If we had a small walker in the Switcher, we could quickly check the Guest
24 * page table and if the page isn't mapped, immediately reflect the fault back
25 * into the Guest. This means the Switcher would have to know the top of the
26 * Guest page table and the page fault handler address.
27 *
28 * For simplicity, the Guest should only handle the case where the privilege
29 * level of the fault is 3 and probably only not present or write faults. It
30 * should also detect recursive faults, and hand the original fault to the
31 * Host (which is actually really easy).
32 *
33 * Two questions remain. Would the performance gain outweigh the complexity?
34 * And who would write the verse documenting it? :*/
35
36/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their
37 * code). It's worth doing though, since it would let us use oprofile in the
38 * Host when a Guest is running. :*/
39
9/*S:100 40/*S:100
10 * Welcome to the Switcher itself! 41 * Welcome to the Switcher itself!
11 * 42 *
@@ -88,7 +119,7 @@ ENTRY(switch_to_guest)
88 119
89 // All saved and there's now five steps before us: 120 // All saved and there's now five steps before us:
90 // Stack, GDT, IDT, TSS 121 // Stack, GDT, IDT, TSS
91 // And last of all the page tables are flipped. 122 // Then last of all the page tables are flipped.
92 123
93 // Yet beware that our stack pointer must be 124 // Yet beware that our stack pointer must be
94 // Always valid lest an NMI hits 125 // Always valid lest an NMI hits
@@ -103,25 +134,25 @@ ENTRY(switch_to_guest)
103 lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 134 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
104 135
105 // The Guest's IDT we did partially 136 // The Guest's IDT we did partially
106 // Move to the "struct lguest_pages" as well. 137 // Copy to "struct lguest_pages" as well.
107 lidt LGUEST_PAGES_guest_idt_desc(%eax) 138 lidt LGUEST_PAGES_guest_idt_desc(%eax)
108 139
109 // The TSS entry which controls traps 140 // The TSS entry which controls traps
110 // Must be loaded up with "ltr" now: 141 // Must be loaded up with "ltr" now:
142 // The GDT entry that TSS uses
143 // Changes type when we load it: damn Intel!
111 // For after we switch over our page tables 144 // For after we switch over our page tables
112 // It (as the rest) will be writable no more. 145 // That entry will be read-only: we'd crash.
113 // (The GDT entry TSS needs
114 // Changes type when we load it: damn Intel!)
115 movl $(GDT_ENTRY_TSS*8), %edx 146 movl $(GDT_ENTRY_TSS*8), %edx
116 ltr %dx 147 ltr %dx
117 148
118 // Look back now, before we take this last step! 149 // Look back now, before we take this last step!
119 // The Host's TSS entry was also marked used; 150 // The Host's TSS entry was also marked used;
120 // Let's clear it again, ere we return. 151 // Let's clear it again for our return.
121 // The GDT descriptor of the Host 152 // The GDT descriptor of the Host
122 // Points to the table after two "size" bytes 153 // Points to the table after two "size" bytes
123 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 154 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
124 // Clear the type field of "used" (byte 5, bit 2) 155 // Clear "used" from type field (byte 5, bit 2)
125 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 156 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
126 157
127 // Once our page table's switched, the Guest is live! 158 // Once our page table's switched, the Guest is live!
@@ -131,7 +162,7 @@ ENTRY(switch_to_guest)
131 162
132 // The page table change did one tricky thing: 163 // The page table change did one tricky thing:
133 // The Guest's register page has been mapped 164 // The Guest's register page has been mapped
134 // Writable onto our %esp (stack) -- 165 // Writable under our %esp (stack) --
135 // We can simply pop off all Guest regs. 166 // We can simply pop off all Guest regs.
136 popl %eax 167 popl %eax
137 popl %ebx 168 popl %ebx
@@ -152,16 +183,15 @@ ENTRY(switch_to_guest)
152 addl $8, %esp 183 addl $8, %esp
153 184
154 // The last five stack slots hold return address 185 // The last five stack slots hold return address
155 // And everything needed to change privilege 186 // And everything needed to switch privilege
156 // Into the Guest privilege level of 1, 187 // From Switcher's level 0 to Guest's 1,
157 // And the stack where the Guest had last left it. 188 // And the stack where the Guest had last left it.
158 // Interrupts are turned back on: we are Guest. 189 // Interrupts are turned back on: we are Guest.
159 iret 190 iret
160 191
161// There are two paths where we switch to the Host 192// We treat two paths to switch back to the Host
193// Yet both must save Guest state and restore Host
162// So we put the routine in a macro. 194// So we put the routine in a macro.
163// We are on our way home, back to the Host
164// Interrupted out of the Guest, we come here.
165#define SWITCH_TO_HOST \ 195#define SWITCH_TO_HOST \
166 /* We save the Guest state: all registers first \ 196 /* We save the Guest state: all registers first \
167 * Laid out just as "struct lguest_regs" defines */ \ 197 * Laid out just as "struct lguest_regs" defines */ \
@@ -194,7 +224,7 @@ ENTRY(switch_to_guest)
194 movl %esp, %eax; \ 224 movl %esp, %eax; \
195 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 225 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
196 /* Save our trap number: the switch will obscure it \ 226 /* Save our trap number: the switch will obscure it \
197 * (The Guest regs are not mapped here in the Host) \ 227 * (In the Host the Guest regs are not mapped here) \
198 * %ebx holds it safe for deliver_to_host */ \ 228 * %ebx holds it safe for deliver_to_host */ \
199 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 229 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
200 /* The Host GDT, IDT and stack! \ 230 /* The Host GDT, IDT and stack! \
@@ -210,9 +240,9 @@ ENTRY(switch_to_guest)
210 /* Switch to Host's GDT, IDT. */ \ 240 /* Switch to Host's GDT, IDT. */ \
211 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 241 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
212 lidt LGUEST_PAGES_host_idt_desc(%eax); \ 242 lidt LGUEST_PAGES_host_idt_desc(%eax); \
213 /* Restore the Host's stack where it's saved regs lie */ \ 243 /* Restore the Host's stack where its saved regs lie */ \
214 movl LGUEST_PAGES_host_sp(%eax), %esp; \ 244 movl LGUEST_PAGES_host_sp(%eax), %esp; \
215 /* Last the TSS: our Host is complete */ \ 245 /* Last the TSS: our Host is returned */ \
216 movl $(GDT_ENTRY_TSS*8), %edx; \ 246 movl $(GDT_ENTRY_TSS*8), %edx; \
217 ltr %dx; \ 247 ltr %dx; \
218 /* Restore now the regs saved right at the first. */ \ 248 /* Restore now the regs saved right at the first. */ \
@@ -222,14 +252,15 @@ ENTRY(switch_to_guest)
222 popl %ds; \ 252 popl %ds; \
223 popl %es 253 popl %es
224 254
225// Here's where we come when the Guest has just trapped: 255// The first path is trod when the Guest has trapped:
226// (Which trap we'll see has been pushed on the stack). 256// (Which trap it was has been pushed on the stack).
227// We need only switch back, and the Host will decode 257// We need only switch back, and the Host will decode
228// Why we came home, and what needs to be done. 258// Why we came home, and what needs to be done.
229return_to_host: 259return_to_host:
230 SWITCH_TO_HOST 260 SWITCH_TO_HOST
231 iret 261 iret
232 262
263// We are lead to the second path like so:
233// An interrupt, with some cause external 264// An interrupt, with some cause external
234// Has ajerked us rudely from the Guest's code 265// Has ajerked us rudely from the Guest's code
235// Again we must return home to the Host 266// Again we must return home to the Host
@@ -238,7 +269,7 @@ deliver_to_host:
238 // But now we must go home via that place 269 // But now we must go home via that place
239 // Where that interrupt was supposed to go 270 // Where that interrupt was supposed to go
240 // Had we not been ensconced, running the Guest. 271 // Had we not been ensconced, running the Guest.
241 // Here we see the cleverness of our stack: 272 // Here we see the trickness of run_guest_once():
242 // The Host stack is formed like an interrupt 273 // The Host stack is formed like an interrupt
243 // With EIP, CS and EFLAGS layered. 274 // With EIP, CS and EFLAGS layered.
244 // Interrupt handlers end with "iret" 275 // Interrupt handlers end with "iret"
@@ -263,7 +294,7 @@ deliver_to_host:
263 xorw %ax, %ax 294 xorw %ax, %ax
264 orl %eax, %edx 295 orl %eax, %edx
265 // Now the address of the handler's in %edx 296 // Now the address of the handler's in %edx
266 // We call it now: its "iret" takes us home. 297 // We call it now: its "iret" drops us home.
267 jmp *%edx 298 jmp *%edx
268 299
269// Every interrupt can come to us here 300// Every interrupt can come to us here