aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/lguest/x86/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/lguest/x86/core.c')
-rw-r--r--drivers/lguest/x86/core.c372
1 files changed, 245 insertions, 127 deletions
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index eaf722fe309a..96f7d88ec7f8 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -17,13 +17,15 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20/*P:450 This file contains the x86-specific lguest code. It used to be all 20/*P:450
21 * This file contains the x86-specific lguest code. It used to be all
21 * mixed in with drivers/lguest/core.c but several foolhardy code slashers 22 * mixed in with drivers/lguest/core.c but several foolhardy code slashers
22 * wrestled most of the dependencies out to here in preparation for porting 23 * wrestled most of the dependencies out to here in preparation for porting
23 * lguest to other architectures (see what I mean by foolhardy?). 24 * lguest to other architectures (see what I mean by foolhardy?).
24 * 25 *
25 * This also contains a couple of non-obvious setup and teardown pieces which 26 * This also contains a couple of non-obvious setup and teardown pieces which
26 * were implemented after days of debugging pain. :*/ 27 * were implemented after days of debugging pain.
28:*/
27#include <linux/kernel.h> 29#include <linux/kernel.h>
28#include <linux/start_kernel.h> 30#include <linux/start_kernel.h>
29#include <linux/string.h> 31#include <linux/string.h>
@@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
82 */ 84 */
83static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 85static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
84{ 86{
85 /* Copying all this data can be quite expensive. We usually run the 87 /*
88 * Copying all this data can be quite expensive. We usually run the
86 * same Guest we ran last time (and that Guest hasn't run anywhere else 89 * same Guest we ran last time (and that Guest hasn't run anywhere else
87 * meanwhile). If that's not the case, we pretend everything in the 90 * meanwhile). If that's not the case, we pretend everything in the
88 * Guest has changed. */ 91 * Guest has changed.
92 */
89 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { 93 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
90 __get_cpu_var(last_cpu) = cpu; 94 __get_cpu_var(last_cpu) = cpu;
91 cpu->last_pages = pages; 95 cpu->last_pages = pages;
92 cpu->changed = CHANGED_ALL; 96 cpu->changed = CHANGED_ALL;
93 } 97 }
94 98
95 /* These copies are pretty cheap, so we do them unconditionally: */ 99 /*
96 /* Save the current Host top-level page directory. */ 100 * These copies are pretty cheap, so we do them unconditionally: */
101 /* Save the current Host top-level page directory.
102 */
97 pages->state.host_cr3 = __pa(current->mm->pgd); 103 pages->state.host_cr3 = __pa(current->mm->pgd);
98 /* Set up the Guest's page tables to see this CPU's pages (and no 104 /*
99 * other CPU's pages). */ 105 * Set up the Guest's page tables to see this CPU's pages (and no
106 * other CPU's pages).
107 */
100 map_switcher_in_guest(cpu, pages); 108 map_switcher_in_guest(cpu, pages);
101 /* Set up the two "TSS" members which tell the CPU what stack to use 109 /*
110 * Set up the two "TSS" members which tell the CPU what stack to use
102 * for traps which go directly into the Guest (ie. traps at privilege 111 * for traps which go directly into the Guest (ie. traps at privilege
103 * level 1). */ 112 * level 1).
113 */
104 pages->state.guest_tss.sp1 = cpu->esp1; 114 pages->state.guest_tss.sp1 = cpu->esp1;
105 pages->state.guest_tss.ss1 = cpu->ss1; 115 pages->state.guest_tss.ss1 = cpu->ss1;
106 116
@@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
125 /* This is a dummy value we need for GCC's sake. */ 135 /* This is a dummy value we need for GCC's sake. */
126 unsigned int clobber; 136 unsigned int clobber;
127 137
128 /* Copy the guest-specific information into this CPU's "struct 138 /*
129 * lguest_pages". */ 139 * Copy the guest-specific information into this CPU's "struct
140 * lguest_pages".
141 */
130 copy_in_guest_info(cpu, pages); 142 copy_in_guest_info(cpu, pages);
131 143
132 /* Set the trap number to 256 (impossible value). If we fault while 144 /*
145 * Set the trap number to 256 (impossible value). If we fault while
133 * switching to the Guest (bad segment registers or bug), this will 146 * switching to the Guest (bad segment registers or bug), this will
134 * cause us to abort the Guest. */ 147 * cause us to abort the Guest.
148 */
135 cpu->regs->trapnum = 256; 149 cpu->regs->trapnum = 256;
136 150
137 /* Now: we push the "eflags" register on the stack, then do an "lcall". 151 /*
152 * Now: we push the "eflags" register on the stack, then do an "lcall".
138 * This is how we change from using the kernel code segment to using 153 * This is how we change from using the kernel code segment to using
139 * the dedicated lguest code segment, as well as jumping into the 154 * the dedicated lguest code segment, as well as jumping into the
140 * Switcher. 155 * Switcher.
141 * 156 *
142 * The lcall also pushes the old code segment (KERNEL_CS) onto the 157 * The lcall also pushes the old code segment (KERNEL_CS) onto the
143 * stack, then the address of this call. This stack layout happens to 158 * stack, then the address of this call. This stack layout happens to
144 * exactly match the stack layout created by an interrupt... */ 159 * exactly match the stack layout created by an interrupt...
160 */
145 asm volatile("pushf; lcall *lguest_entry" 161 asm volatile("pushf; lcall *lguest_entry"
146 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 162 /*
147 * are changed by this routine. The "=" means output. */ 163 * This is how we tell GCC that %eax ("a") and %ebx ("b")
164 * are changed by this routine. The "=" means output.
165 */
148 : "=a"(clobber), "=b"(clobber) 166 : "=a"(clobber), "=b"(clobber)
149 /* %eax contains the pages pointer. ("0" refers to the 167 /*
168 * %eax contains the pages pointer. ("0" refers to the
150 * 0-th argument above, ie "a"). %ebx contains the 169 * 0-th argument above, ie "a"). %ebx contains the
151 * physical address of the Guest's top-level page 170 * physical address of the Guest's top-level page
152 * directory. */ 171 * directory.
172 */
153 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) 173 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
154 /* We tell gcc that all these registers could change, 174 /*
175 * We tell gcc that all these registers could change,
155 * which means we don't have to save and restore them in 176 * which means we don't have to save and restore them in
156 * the Switcher. */ 177 * the Switcher.
178 */
157 : "memory", "%edx", "%ecx", "%edi", "%esi"); 179 : "memory", "%edx", "%ecx", "%edi", "%esi");
158} 180}
159/*:*/ 181/*:*/
160 182
161/*M:002 There are hooks in the scheduler which we can register to tell when we 183/*M:002
184 * There are hooks in the scheduler which we can register to tell when we
162 * get kicked off the CPU (preempt_notifier_register()). This would allow us 185 * get kicked off the CPU (preempt_notifier_register()). This would allow us
163 * to lazily disable SYSENTER which would regain some performance, and should 186 * to lazily disable SYSENTER which would regain some performance, and should
164 * also simplify copy_in_guest_info(). Note that we'd still need to restore 187 * also simplify copy_in_guest_info(). Note that we'd still need to restore
@@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
166 * 189 *
167 * We could also try using these hooks for PGE, but that might be too expensive. 190 * We could also try using these hooks for PGE, but that might be too expensive.
168 * 191 *
169 * The hooks were designed for KVM, but we can also put them to good use. :*/ 192 * The hooks were designed for KVM, but we can also put them to good use.
193:*/
170 194
171/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 195/*H:040
172 * are disabled: we own the CPU. */ 196 * This is the i386-specific code to setup and run the Guest. Interrupts
197 * are disabled: we own the CPU.
198 */
173void lguest_arch_run_guest(struct lg_cpu *cpu) 199void lguest_arch_run_guest(struct lg_cpu *cpu)
174{ 200{
175 /* Remember the awfully-named TS bit? If the Guest has asked to set it 201 /*
202 * Remember the awfully-named TS bit? If the Guest has asked to set it
176 * we set it now, so we can trap and pass that trap to the Guest if it 203 * we set it now, so we can trap and pass that trap to the Guest if it
177 * uses the FPU. */ 204 * uses the FPU.
205 */
178 if (cpu->ts) 206 if (cpu->ts)
179 unlazy_fpu(current); 207 unlazy_fpu(current);
180 208
181 /* SYSENTER is an optimized way of doing system calls. We can't allow 209 /*
210 * SYSENTER is an optimized way of doing system calls. We can't allow
182 * it because it always jumps to privilege level 0. A normal Guest 211 * it because it always jumps to privilege level 0. A normal Guest
183 * won't try it because we don't advertise it in CPUID, but a malicious 212 * won't try it because we don't advertise it in CPUID, but a malicious
184 * Guest (or malicious Guest userspace program) could, so we tell the 213 * Guest (or malicious Guest userspace program) could, so we tell the
185 * CPU to disable it before running the Guest. */ 214 * CPU to disable it before running the Guest.
215 */
186 if (boot_cpu_has(X86_FEATURE_SEP)) 216 if (boot_cpu_has(X86_FEATURE_SEP))
187 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 217 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
188 218
189 /* Now we actually run the Guest. It will return when something 219 /*
220 * Now we actually run the Guest. It will return when something
190 * interesting happens, and we can examine its registers to see what it 221 * interesting happens, and we can examine its registers to see what it
191 * was doing. */ 222 * was doing.
223 */
192 run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 224 run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
193 225
194 /* Note that the "regs" structure contains two extra entries which are 226 /*
227 * Note that the "regs" structure contains two extra entries which are
195 * not really registers: a trap number which says what interrupt or 228 * not really registers: a trap number which says what interrupt or
196 * trap made the switcher code come back, and an error code which some 229 * trap made the switcher code come back, and an error code which some
197 * traps set. */ 230 * traps set.
231 */
198 232
199 /* Restore SYSENTER if it's supposed to be on. */ 233 /* Restore SYSENTER if it's supposed to be on. */
200 if (boot_cpu_has(X86_FEATURE_SEP)) 234 if (boot_cpu_has(X86_FEATURE_SEP))
201 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 235 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
202 236
203 /* If the Guest page faulted, then the cr2 register will tell us the 237 /*
238 * If the Guest page faulted, then the cr2 register will tell us the
204 * bad virtual address. We have to grab this now, because once we 239 * bad virtual address. We have to grab this now, because once we
205 * re-enable interrupts an interrupt could fault and thus overwrite 240 * re-enable interrupts an interrupt could fault and thus overwrite
206 * cr2, or we could even move off to a different CPU. */ 241 * cr2, or we could even move off to a different CPU.
242 */
207 if (cpu->regs->trapnum == 14) 243 if (cpu->regs->trapnum == 14)
208 cpu->arch.last_pagefault = read_cr2(); 244 cpu->arch.last_pagefault = read_cr2();
209 /* Similarly, if we took a trap because the Guest used the FPU, 245 /*
246 * Similarly, if we took a trap because the Guest used the FPU,
210 * we have to restore the FPU it expects to see. 247 * we have to restore the FPU it expects to see.
211 * math_state_restore() may sleep and we may even move off to 248 * math_state_restore() may sleep and we may even move off to
212 * a different CPU. So all the critical stuff should be done 249 * a different CPU. So all the critical stuff should be done
213 * before this. */ 250 * before this.
251 */
214 else if (cpu->regs->trapnum == 7) 252 else if (cpu->regs->trapnum == 7)
215 math_state_restore(); 253 math_state_restore();
216} 254}
217 255
218/*H:130 Now we've examined the hypercall code; our Guest can make requests. 256/*H:130
257 * Now we've examined the hypercall code; our Guest can make requests.
219 * Our Guest is usually so well behaved; it never tries to do things it isn't 258 * Our Guest is usually so well behaved; it never tries to do things it isn't
220 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 259 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
221 * infrastructure isn't quite complete, because it doesn't contain replacements 260 * infrastructure isn't quite complete, because it doesn't contain replacements
@@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
225 * 264 *
226 * When the Guest uses one of these instructions, we get a trap (General 265 * When the Guest uses one of these instructions, we get a trap (General
227 * Protection Fault) and come here. We see if it's one of those troublesome 266 * Protection Fault) and come here. We see if it's one of those troublesome
228 * instructions and skip over it. We return true if we did. */ 267 * instructions and skip over it. We return true if we did.
268 */
229static int emulate_insn(struct lg_cpu *cpu) 269static int emulate_insn(struct lg_cpu *cpu)
230{ 270{
231 u8 insn; 271 u8 insn;
232 unsigned int insnlen = 0, in = 0, shift = 0; 272 unsigned int insnlen = 0, in = 0, shift = 0;
233 /* The eip contains the *virtual* address of the Guest's instruction: 273 /*
234 * guest_pa just subtracts the Guest's page_offset. */ 274 * The eip contains the *virtual* address of the Guest's instruction:
275 * guest_pa just subtracts the Guest's page_offset.
276 */
235 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); 277 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
236 278
237 /* This must be the Guest kernel trying to do something, not userspace! 279 /*
280 * This must be the Guest kernel trying to do something, not userspace!
238 * The bottom two bits of the CS segment register are the privilege 281 * The bottom two bits of the CS segment register are the privilege
239 * level. */ 282 * level.
283 */
240 if ((cpu->regs->cs & 3) != GUEST_PL) 284 if ((cpu->regs->cs & 3) != GUEST_PL)
241 return 0; 285 return 0;
242 286
243 /* Decoding x86 instructions is icky. */ 287 /* Decoding x86 instructions is icky. */
244 insn = lgread(cpu, physaddr, u8); 288 insn = lgread(cpu, physaddr, u8);
245 289
246 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 290 /*
247 of the eax register. */ 291 * 0x66 is an "operand prefix". It means it's using the upper 16 bits
292 * of the eax register.
293 */
248 if (insn == 0x66) { 294 if (insn == 0x66) {
249 shift = 16; 295 shift = 16;
250 /* The instruction is 1 byte so far, read the next byte. */ 296 /* The instruction is 1 byte so far, read the next byte. */
@@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu)
252 insn = lgread(cpu, physaddr + insnlen, u8); 298 insn = lgread(cpu, physaddr + insnlen, u8);
253 } 299 }
254 300
255 /* We can ignore the lower bit for the moment and decode the 4 opcodes 301 /*
256 * we need to emulate. */ 302 * We can ignore the lower bit for the moment and decode the 4 opcodes
303 * we need to emulate.
304 */
257 switch (insn & 0xFE) { 305 switch (insn & 0xFE) {
258 case 0xE4: /* in <next byte>,%al */ 306 case 0xE4: /* in <next byte>,%al */
259 insnlen += 2; 307 insnlen += 2;
@@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu)
274 return 0; 322 return 0;
275 } 323 }
276 324
277 /* If it was an "IN" instruction, they expect the result to be read 325 /*
326 * If it was an "IN" instruction, they expect the result to be read
278 * into %eax, so we change %eax. We always return all-ones, which 327 * into %eax, so we change %eax. We always return all-ones, which
279 * traditionally means "there's nothing there". */ 328 * traditionally means "there's nothing there".
329 */
280 if (in) { 330 if (in) {
281 /* Lower bit tells us whether it's a 16 or 32 bit access */ 331 /* Lower bit tells us whether it's a 16 or 32 bit access */
282 if (insn & 0x1) 332 if (insn & 0x1)
@@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu)
290 return 1; 340 return 1;
291} 341}
292 342
293/* Our hypercalls mechanism used to be based on direct software interrupts. 343/*
344 * Our hypercalls mechanism used to be based on direct software interrupts.
294 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to 345 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
295 * change over to using kvm hypercalls. 346 * change over to using kvm hypercalls.
296 * 347 *
@@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu)
318 */ 369 */
319static void rewrite_hypercall(struct lg_cpu *cpu) 370static void rewrite_hypercall(struct lg_cpu *cpu)
320{ 371{
321 /* These are the opcodes we use to patch the Guest. The opcode for "int 372 /*
373 * These are the opcodes we use to patch the Guest. The opcode for "int
322 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we 374 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
323 * complete the sequence with a NOP (0x90). */ 375 * complete the sequence with a NOP (0x90).
376 */
324 u8 insn[3] = {0xcd, 0x1f, 0x90}; 377 u8 insn[3] = {0xcd, 0x1f, 0x90};
325 378
326 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); 379 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
327 /* The above write might have caused a copy of that page to be made 380 /*
381 * The above write might have caused a copy of that page to be made
328 * (if it was read-only). We need to make sure the Guest has 382 * (if it was read-only). We need to make sure the Guest has
329 * up-to-date pagetables. As this doesn't happen often, we can just 383 * up-to-date pagetables. As this doesn't happen often, we can just
330 * drop them all. */ 384 * drop them all.
385 */
331 guest_pagetable_clear_all(cpu); 386 guest_pagetable_clear_all(cpu);
332} 387}
333 388
@@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu)
335{ 390{
336 u8 insn[3]; 391 u8 insn[3];
337 392
338 /* This must be the Guest kernel trying to do something. 393 /*
394 * This must be the Guest kernel trying to do something.
339 * The bottom two bits of the CS segment register are the privilege 395 * The bottom two bits of the CS segment register are the privilege
340 * level. */ 396 * level.
397 */
341 if ((cpu->regs->cs & 3) != GUEST_PL) 398 if ((cpu->regs->cs & 3) != GUEST_PL)
342 return false; 399 return false;
343 400
@@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
351{ 408{
352 switch (cpu->regs->trapnum) { 409 switch (cpu->regs->trapnum) {
353 case 13: /* We've intercepted a General Protection Fault. */ 410 case 13: /* We've intercepted a General Protection Fault. */
354 /* Check if this was one of those annoying IN or OUT 411 /*
412 * Check if this was one of those annoying IN or OUT
355 * instructions which we need to emulate. If so, we just go 413 * instructions which we need to emulate. If so, we just go
356 * back into the Guest after we've done it. */ 414 * back into the Guest after we've done it.
415 */
357 if (cpu->regs->errcode == 0) { 416 if (cpu->regs->errcode == 0) {
358 if (emulate_insn(cpu)) 417 if (emulate_insn(cpu))
359 return; 418 return;
360 } 419 }
361 /* If KVM is active, the vmcall instruction triggers a 420 /*
362 * General Protection Fault. Normally it triggers an 421 * If KVM is active, the vmcall instruction triggers a General
363 * invalid opcode fault (6): */ 422 * Protection Fault. Normally it triggers an invalid opcode
423 * fault (6):
424 */
364 case 6: 425 case 6:
365 /* We need to check if ring == GUEST_PL and 426 /*
366 * faulting instruction == vmcall. */ 427 * We need to check if ring == GUEST_PL and faulting
428 * instruction == vmcall.
429 */
367 if (is_hypercall(cpu)) { 430 if (is_hypercall(cpu)) {
368 rewrite_hypercall(cpu); 431 rewrite_hypercall(cpu);
369 return; 432 return;
370 } 433 }
371 break; 434 break;
372 case 14: /* We've intercepted a Page Fault. */ 435 case 14: /* We've intercepted a Page Fault. */
373 /* The Guest accessed a virtual address that wasn't mapped. 436 /*
437 * The Guest accessed a virtual address that wasn't mapped.
374 * This happens a lot: we don't actually set up most of the page 438 * This happens a lot: we don't actually set up most of the page
375 * tables for the Guest at all when we start: as it runs it asks 439 * tables for the Guest at all when we start: as it runs it asks
376 * for more and more, and we set them up as required. In this 440 * for more and more, and we set them up as required. In this
377 * case, we don't even tell the Guest that the fault happened. 441 * case, we don't even tell the Guest that the fault happened.
378 * 442 *
379 * The errcode tells whether this was a read or a write, and 443 * The errcode tells whether this was a read or a write, and
380 * whether kernel or userspace code. */ 444 * whether kernel or userspace code.
445 */
381 if (demand_page(cpu, cpu->arch.last_pagefault, 446 if (demand_page(cpu, cpu->arch.last_pagefault,
382 cpu->regs->errcode)) 447 cpu->regs->errcode))
383 return; 448 return;
384 449
385 /* OK, it's really not there (or not OK): the Guest needs to 450 /*
451 * OK, it's really not there (or not OK): the Guest needs to
386 * know. We write out the cr2 value so it knows where the 452 * know. We write out the cr2 value so it knows where the
387 * fault occurred. 453 * fault occurred.
388 * 454 *
389 * Note that if the Guest were really messed up, this could 455 * Note that if the Guest were really messed up, this could
390 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 456 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
391 * lg->lguest_data could be NULL */ 457 * lg->lguest_data could be NULL
458 */
392 if (cpu->lg->lguest_data && 459 if (cpu->lg->lguest_data &&
393 put_user(cpu->arch.last_pagefault, 460 put_user(cpu->arch.last_pagefault,
394 &cpu->lg->lguest_data->cr2)) 461 &cpu->lg->lguest_data->cr2))
395 kill_guest(cpu, "Writing cr2"); 462 kill_guest(cpu, "Writing cr2");
396 break; 463 break;
397 case 7: /* We've intercepted a Device Not Available fault. */ 464 case 7: /* We've intercepted a Device Not Available fault. */
398 /* If the Guest doesn't want to know, we already restored the 465 /*
399 * Floating Point Unit, so we just continue without telling 466 * If the Guest doesn't want to know, we already restored the
400 * it. */ 467 * Floating Point Unit, so we just continue without telling it.
468 */
401 if (!cpu->ts) 469 if (!cpu->ts)
402 return; 470 return;
403 break; 471 break;
404 case 32 ... 255: 472 case 32 ... 255:
405 /* These values mean a real interrupt occurred, in which case 473 /*
474 * These values mean a real interrupt occurred, in which case
406 * the Host handler has already been run. We just do a 475 * the Host handler has already been run. We just do a
407 * friendly check if another process should now be run, then 476 * friendly check if another process should now be run, then
408 * return to run the Guest again */ 477 * return to run the Guest again
478 */
409 cond_resched(); 479 cond_resched();
410 return; 480 return;
411 case LGUEST_TRAP_ENTRY: 481 case LGUEST_TRAP_ENTRY:
412 /* Our 'struct hcall_args' maps directly over our regs: we set 482 /*
413 * up the pointer now to indicate a hypercall is pending. */ 483 * Our 'struct hcall_args' maps directly over our regs: we set
484 * up the pointer now to indicate a hypercall is pending.
485 */
414 cpu->hcall = (struct hcall_args *)cpu->regs; 486 cpu->hcall = (struct hcall_args *)cpu->regs;
415 return; 487 return;
416 } 488 }
417 489
418 /* We didn't handle the trap, so it needs to go to the Guest. */ 490 /* We didn't handle the trap, so it needs to go to the Guest. */
419 if (!deliver_trap(cpu, cpu->regs->trapnum)) 491 if (!deliver_trap(cpu, cpu->regs->trapnum))
420 /* If the Guest doesn't have a handler (either it hasn't 492 /*
493 * If the Guest doesn't have a handler (either it hasn't
421 * registered any yet, or it's one of the faults we don't let 494 * registered any yet, or it's one of the faults we don't let
422 * it handle), it dies with this cryptic error message. */ 495 * it handle), it dies with this cryptic error message.
496 */
423 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 497 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
424 cpu->regs->trapnum, cpu->regs->eip, 498 cpu->regs->trapnum, cpu->regs->eip,
425 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 499 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
426 : cpu->regs->errcode); 500 : cpu->regs->errcode);
427} 501}
428 502
429/* Now we can look at each of the routines this calls, in increasing order of 503/*
504 * Now we can look at each of the routines this calls, in increasing order of
430 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 505 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
431 * deliver_trap() and demand_page(). After all those, we'll be ready to 506 * deliver_trap() and demand_page(). After all those, we'll be ready to
432 * examine the Switcher, and our philosophical understanding of the Host/Guest 507 * examine the Switcher, and our philosophical understanding of the Host/Guest
433 * duality will be complete. :*/ 508 * duality will be complete.
509:*/
434static void adjust_pge(void *on) 510static void adjust_pge(void *on)
435{ 511{
436 if (on) 512 if (on)
@@ -439,13 +515,16 @@ static void adjust_pge(void *on)
439 write_cr4(read_cr4() & ~X86_CR4_PGE); 515 write_cr4(read_cr4() & ~X86_CR4_PGE);
440} 516}
441 517
442/*H:020 Now the Switcher is mapped and everything else is ready, we need to do 518/*H:020
443 * some more i386-specific initialization. */ 519 * Now the Switcher is mapped and everything else is ready, we need to do
520 * some more i386-specific initialization.
521 */
444void __init lguest_arch_host_init(void) 522void __init lguest_arch_host_init(void)
445{ 523{
446 int i; 524 int i;
447 525
448 /* Most of the i386/switcher.S doesn't care that it's been moved; on 526 /*
527 * Most of the i386/switcher.S doesn't care that it's been moved; on
449 * Intel, jumps are relative, and it doesn't access any references to 528 * Intel, jumps are relative, and it doesn't access any references to
450 * external code or data. 529 * external code or data.
451 * 530 *
@@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void)
453 * addresses are placed in a table (default_idt_entries), so we need to 532 * addresses are placed in a table (default_idt_entries), so we need to
454 * update the table with the new addresses. switcher_offset() is a 533 * update the table with the new addresses. switcher_offset() is a
455 * convenience function which returns the distance between the 534 * convenience function which returns the distance between the
456 * compiled-in switcher code and the high-mapped copy we just made. */ 535 * compiled-in switcher code and the high-mapped copy we just made.
536 */
457 for (i = 0; i < IDT_ENTRIES; i++) 537 for (i = 0; i < IDT_ENTRIES; i++)
458 default_idt_entries[i] += switcher_offset(); 538 default_idt_entries[i] += switcher_offset();
459 539
@@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void)
468 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
469 /* lguest_pages() returns this CPU's two pages. */ 549 /* lguest_pages() returns this CPU's two pages. */
470 struct lguest_pages *pages = lguest_pages(i); 550 struct lguest_pages *pages = lguest_pages(i);
471 /* This is a convenience pointer to make the code fit one 551 /* This is a convenience pointer to make the code neater. */
472 * statement to a line. */
473 struct lguest_ro_state *state = &pages->state; 552 struct lguest_ro_state *state = &pages->state;
474 553
475 /* The Global Descriptor Table: the Host has a different one 554 /*
555 * The Global Descriptor Table: the Host has a different one
476 * for each CPU. We keep a descriptor for the GDT which says 556 * for each CPU. We keep a descriptor for the GDT which says
477 * where it is and how big it is (the size is actually the last 557 * where it is and how big it is (the size is actually the last
478 * byte, not the size, hence the "-1"). */ 558 * byte, not the size, hence the "-1").
559 */
479 state->host_gdt_desc.size = GDT_SIZE-1; 560 state->host_gdt_desc.size = GDT_SIZE-1;
480 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 561 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
481 562
482 /* All CPUs on the Host use the same Interrupt Descriptor 563 /*
564 * All CPUs on the Host use the same Interrupt Descriptor
483 * Table, so we just use store_idt(), which gets this CPU's IDT 565 * Table, so we just use store_idt(), which gets this CPU's IDT
484 * descriptor. */ 566 * descriptor.
567 */
485 store_idt(&state->host_idt_desc); 568 store_idt(&state->host_idt_desc);
486 569
487 /* The descriptors for the Guest's GDT and IDT can be filled 570 /*
571 * The descriptors for the Guest's GDT and IDT can be filled
488 * out now, too. We copy the GDT & IDT into ->guest_gdt and 572 * out now, too. We copy the GDT & IDT into ->guest_gdt and
489 * ->guest_idt before actually running the Guest. */ 573 * ->guest_idt before actually running the Guest.
574 */
490 state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 575 state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
491 state->guest_idt_desc.address = (long)&state->guest_idt; 576 state->guest_idt_desc.address = (long)&state->guest_idt;
492 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 577 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
493 state->guest_gdt_desc.address = (long)&state->guest_gdt; 578 state->guest_gdt_desc.address = (long)&state->guest_gdt;
494 579
495 /* We know where we want the stack to be when the Guest enters 580 /*
581 * We know where we want the stack to be when the Guest enters
496 * the Switcher: in pages->regs. The stack grows downwards, so 582 * the Switcher: in pages->regs. The stack grows downwards, so
497 * we start it at the end of that structure. */ 583 * we start it at the end of that structure.
584 */
498 state->guest_tss.sp0 = (long)(&pages->regs + 1); 585 state->guest_tss.sp0 = (long)(&pages->regs + 1);
499 /* And this is the GDT entry to use for the stack: we keep a 586 /*
500 * couple of special LGUEST entries. */ 587 * And this is the GDT entry to use for the stack: we keep a
588 * couple of special LGUEST entries.
589 */
501 state->guest_tss.ss0 = LGUEST_DS; 590 state->guest_tss.ss0 = LGUEST_DS;
502 591
503 /* x86 can have a finegrained bitmap which indicates what I/O 592 /*
593 * x86 can have a finegrained bitmap which indicates what I/O
504 * ports the process can use. We set it to the end of our 594 * ports the process can use. We set it to the end of our
505 * structure, meaning "none". */ 595 * structure, meaning "none".
596 */
506 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 597 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
507 598
508 /* Some GDT entries are the same across all Guests, so we can 599 /*
509 * set them up now. */ 600 * Some GDT entries are the same across all Guests, so we can
601 * set them up now.
602 */
510 setup_default_gdt_entries(state); 603 setup_default_gdt_entries(state);
511 /* Most IDT entries are the same for all Guests, too.*/ 604 /* Most IDT entries are the same for all Guests, too.*/
512 setup_default_idt_entries(state, default_idt_entries); 605 setup_default_idt_entries(state, default_idt_entries);
513 606
514 /* The Host needs to be able to use the LGUEST segments on this 607 /*
515 * CPU, too, so put them in the Host GDT. */ 608 * The Host needs to be able to use the LGUEST segments on this
609 * CPU, too, so put them in the Host GDT.
610 */
516 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 611 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
517 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 612 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
518 } 613 }
519 614
520 /* In the Switcher, we want the %cs segment register to use the 615 /*
616 * In the Switcher, we want the %cs segment register to use the
521 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 617 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
522 * it will be undisturbed when we switch. To change %cs and jump we 618 * it will be undisturbed when we switch. To change %cs and jump we
523 * need this structure to feed to Intel's "lcall" instruction. */ 619 * need this structure to feed to Intel's "lcall" instruction.
620 */
524 lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 621 lguest_entry.offset = (long)switch_to_guest + switcher_offset();
525 lguest_entry.segment = LGUEST_CS; 622 lguest_entry.segment = LGUEST_CS;
526 623
527 /* Finally, we need to turn off "Page Global Enable". PGE is an 624 /*
625 * Finally, we need to turn off "Page Global Enable". PGE is an
528 * optimization where page table entries are specially marked to show 626 * optimization where page table entries are specially marked to show
529 * they never change. The Host kernel marks all the kernel pages this 627 * they never change. The Host kernel marks all the kernel pages this
530 * way because it's always present, even when userspace is running. 628 * way because it's always present, even when userspace is running.
@@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void)
534 * you'll get really weird bugs that you'll chase for two days. 632 * you'll get really weird bugs that you'll chase for two days.
535 * 633 *
536 * I used to turn PGE off every time we switched to the Guest and back 634 * I used to turn PGE off every time we switched to the Guest and back
537 * on when we return, but that slowed the Switcher down noticeably. */ 635 * on when we return, but that slowed the Switcher down noticeably.
636 */
538 637
539 /* We don't need the complexity of CPUs coming and going while we're 638 /*
540 * doing this. */ 639 * We don't need the complexity of CPUs coming and going while we're
640 * doing this.
641 */
541 get_online_cpus(); 642 get_online_cpus();
542 if (cpu_has_pge) { /* We have a broader idea of "global". */ 643 if (cpu_has_pge) { /* We have a broader idea of "global". */
543 /* Remember that this was originally set (for cleanup). */ 644 /* Remember that this was originally set (for cleanup). */
544 cpu_had_pge = 1; 645 cpu_had_pge = 1;
545 /* adjust_pge is a helper function which sets or unsets the PGE 646 /*
546 * bit on its CPU, depending on the argument (0 == unset). */ 647 * adjust_pge is a helper function which sets or unsets the PGE
648 * bit on its CPU, depending on the argument (0 == unset).
649 */
547 on_each_cpu(adjust_pge, (void *)0, 1); 650 on_each_cpu(adjust_pge, (void *)0, 1);
548 /* Turn off the feature in the global feature set. */ 651 /* Turn off the feature in the global feature set. */
549 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 652 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
@@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
590{ 693{
591 u32 tsc_speed; 694 u32 tsc_speed;
592 695
593 /* The pointer to the Guest's "struct lguest_data" is the only argument. 696 /*
594 * We check that address now. */ 697 * The pointer to the Guest's "struct lguest_data" is the only argument.
698 * We check that address now.
699 */
595 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 700 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
596 sizeof(*cpu->lg->lguest_data))) 701 sizeof(*cpu->lg->lguest_data)))
597 return -EFAULT; 702 return -EFAULT;
598 703
599 /* Having checked it, we simply set lg->lguest_data to point straight 704 /*
705 * Having checked it, we simply set lg->lguest_data to point straight
600 * into the Launcher's memory at the right place and then use 706 * into the Launcher's memory at the right place and then use
601 * copy_to_user/from_user from now on, instead of lgread/write. I put 707 * copy_to_user/from_user from now on, instead of lgread/write. I put
602 * this in to show that I'm not immune to writing stupid 708 * this in to show that I'm not immune to writing stupid
603 * optimizations. */ 709 * optimizations.
710 */
604 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 711 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
605 712
606 /* We insist that the Time Stamp Counter exist and doesn't change with 713 /*
714 * We insist that the Time Stamp Counter exist and doesn't change with
607 * cpu frequency. Some devious chip manufacturers decided that TSC 715 * cpu frequency. Some devious chip manufacturers decided that TSC
608 * changes could be handled in software. I decided that time going 716 * changes could be handled in software. I decided that time going
609 * backwards might be good for benchmarks, but it's bad for users. 717 * backwards might be good for benchmarks, but it's bad for users.
610 * 718 *
611 * We also insist that the TSC be stable: the kernel detects unreliable 719 * We also insist that the TSC be stable: the kernel detects unreliable
612 * TSCs for its own purposes, and we use that here. */ 720 * TSCs for its own purposes, and we use that here.
721 */
613 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 722 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
614 tsc_speed = tsc_khz; 723 tsc_speed = tsc_khz;
615 else 724 else
@@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
625} 734}
626/*:*/ 735/*:*/
627 736
628/*L:030 lguest_arch_setup_regs() 737/*L:030
738 * lguest_arch_setup_regs()
629 * 739 *
630 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 740 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
631 * allocate the structure, so they will be 0. */ 741 * allocate the structure, so they will be 0.
742 */
632void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 743void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
633{ 744{
634 struct lguest_regs *regs = cpu->regs; 745 struct lguest_regs *regs = cpu->regs;
635 746
636 /* There are four "segment" registers which the Guest needs to boot: 747 /*
748 * There are four "segment" registers which the Guest needs to boot:
637 * The "code segment" register (cs) refers to the kernel code segment 749 * The "code segment" register (cs) refers to the kernel code segment
638 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 750 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
639 * refer to the kernel data segment __KERNEL_DS. 751 * refer to the kernel data segment __KERNEL_DS.
640 * 752 *
641 * The privilege level is packed into the lower bits. The Guest runs 753 * The privilege level is packed into the lower bits. The Guest runs
642 * at privilege level 1 (GUEST_PL).*/ 754 * at privilege level 1 (GUEST_PL).
755 */
643 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 756 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
644 regs->cs = __KERNEL_CS|GUEST_PL; 757 regs->cs = __KERNEL_CS|GUEST_PL;
645 758
646 /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 759 /*
760 * The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
647 * is supposed to always be "1". Bit 9 (0x200) controls whether 761 * is supposed to always be "1". Bit 9 (0x200) controls whether
648 * interrupts are enabled. We always leave interrupts enabled while 762 * interrupts are enabled. We always leave interrupts enabled while
649 * running the Guest. */ 763 * running the Guest.
764 */
650 regs->eflags = X86_EFLAGS_IF | 0x2; 765 regs->eflags = X86_EFLAGS_IF | 0x2;
651 766
652 /* The "Extended Instruction Pointer" register says where the Guest is 767 /*
653 * running. */ 768 * The "Extended Instruction Pointer" register says where the Guest is
769 * running.
770 */
654 regs->eip = start; 771 regs->eip = start;
655 772
656 /* %esi points to our boot information, at physical address 0, so don't 773 /*
657 * touch it. */ 774 * %esi points to our boot information, at physical address 0, so don't
775 * touch it.
776 */
658 777
659 /* There are a couple of GDT entries the Guest expects when first 778 /* There are a couple of GDT entries the Guest expects at boot. */
660 * booting. */
661 setup_guest_gdt(cpu); 779 setup_guest_gdt(cpu);
662} 780}