diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-07-26 13:41:04 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-26 14:35:17 -0400 |
commit | bff672e630a015d5b54c8bfb16160b7edc39a57c (patch) | |
tree | 3af06baacb76809234a3e71033d14b7ed769dbd8 /drivers | |
parent | dde797899ac17ebb812b7566044124d785e98dc7 (diff) |
lguest: documentation V: Host
Documentation: The Host
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/lguest/core.c | 273 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 118 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 176 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 19 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 314 | ||||
-rw-r--r-- | drivers/lguest/segments.c | 109 |
6 files changed, 924 insertions, 85 deletions
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 1eb05f9a56b6..c0f50b4dd2f1 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -64,11 +64,33 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) | |||
64 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | 64 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); |
65 | } | 65 | } |
66 | 66 | ||
67 | /*H:010 We need to set up the Switcher at a high virtual address. Remember the | ||
68 | * Switcher is a few hundred bytes of assembler code which actually changes the | ||
69 | * CPU to run the Guest, and then changes back to the Host when a trap or | ||
70 | * interrupt happens. | ||
71 | * | ||
72 | * The Switcher code must be at the same virtual address in the Guest as the | ||
73 | * Host since it will be running as the switchover occurs. | ||
74 | * | ||
75 | * Trying to map memory at a particular address is an unusual thing to do, so | ||
76 | * it's not a simple one-liner. We also set up the per-cpu parts of the | ||
77 | * Switcher here. | ||
78 | */ | ||
67 | static __init int map_switcher(void) | 79 | static __init int map_switcher(void) |
68 | { | 80 | { |
69 | int i, err; | 81 | int i, err; |
70 | struct page **pagep; | 82 | struct page **pagep; |
71 | 83 | ||
84 | /* | ||
85 | * Map the Switcher in to high memory. | ||
86 | * | ||
87 | * It turns out that if we choose the address 0xFFC00000 (4MB under the | ||
88 | * top virtual address), it makes setting up the page tables really | ||
89 | * easy. | ||
90 | */ | ||
91 | |||
92 | /* We allocate an array of "struct page"s. map_vm_area() wants the | ||
93 | * pages in this form, rather than just an array of pointers. */ | ||
72 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | 94 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, |
73 | GFP_KERNEL); | 95 | GFP_KERNEL); |
74 | if (!switcher_page) { | 96 | if (!switcher_page) { |
@@ -76,6 +98,8 @@ static __init int map_switcher(void) | |||
76 | goto out; | 98 | goto out; |
77 | } | 99 | } |
78 | 100 | ||
101 | /* Now we actually allocate the pages. The Guest will see these pages, | ||
102 | * so we make sure they're zeroed. */ | ||
79 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | 103 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { |
80 | unsigned long addr = get_zeroed_page(GFP_KERNEL); | 104 | unsigned long addr = get_zeroed_page(GFP_KERNEL); |
81 | if (!addr) { | 105 | if (!addr) { |
@@ -85,6 +109,9 @@ static __init int map_switcher(void) | |||
85 | switcher_page[i] = virt_to_page(addr); | 109 | switcher_page[i] = virt_to_page(addr); |
86 | } | 110 | } |
87 | 111 | ||
112 | /* Now we reserve the "virtual memory area" we want: 0xFFC00000 | ||
113 | * (SWITCHER_ADDR). We might not get it in theory, but in practice | ||
114 | * it's worked so far. */ | ||
88 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | 115 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, |
89 | VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); | 116 | VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); |
90 | if (!switcher_vma) { | 117 | if (!switcher_vma) { |
@@ -93,49 +120,105 @@ static __init int map_switcher(void) | |||
93 | goto free_pages; | 120 | goto free_pages; |
94 | } | 121 | } |
95 | 122 | ||
123 | /* This code actually sets up the pages we've allocated to appear at | ||
124 | * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the | ||
125 | * kind of pages we're mapping (kernel pages), and a pointer to our | ||
126 | * array of struct pages. It increments that pointer, but we don't | ||
127 | * care. */ | ||
96 | pagep = switcher_page; | 128 | pagep = switcher_page; |
97 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); | 129 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); |
98 | if (err) { | 130 | if (err) { |
99 | printk("lguest: map_vm_area failed: %i\n", err); | 131 | printk("lguest: map_vm_area failed: %i\n", err); |
100 | goto free_vma; | 132 | goto free_vma; |
101 | } | 133 | } |
134 | |||
135 | /* Now the switcher is mapped at the right address, we can't fail! | ||
136 | * Copy in the compiled-in Switcher code (from switcher.S). */ | ||
102 | memcpy(switcher_vma->addr, start_switcher_text, | 137 | memcpy(switcher_vma->addr, start_switcher_text, |
103 | end_switcher_text - start_switcher_text); | 138 | end_switcher_text - start_switcher_text); |
104 | 139 | ||
105 | /* Fix up IDT entries to point into copied text. */ | 140 | /* Most of the switcher.S doesn't care that it's been moved; on Intel, |
141 | * jumps are relative, and it doesn't access any references to external | ||
142 | * code or data. | ||
143 | * | ||
144 | * The only exception is the interrupt handlers in switcher.S: their | ||
145 | * addresses are placed in a table (default_idt_entries), so we need to | ||
146 | * update the table with the new addresses. switcher_offset() is a | ||
147 | * convenience function which returns the distance between the builtin | ||
148 | * switcher code and the high-mapped copy we just made. */ | ||
106 | for (i = 0; i < IDT_ENTRIES; i++) | 149 | for (i = 0; i < IDT_ENTRIES; i++) |
107 | default_idt_entries[i] += switcher_offset(); | 150 | default_idt_entries[i] += switcher_offset(); |
108 | 151 | ||
152 | /* | ||
153 | * Set up the Switcher's per-cpu areas. | ||
154 | * | ||
155 | * Each CPU gets two pages of its own within the high-mapped region | ||
156 | * (aka. "struct lguest_pages"). Much of this can be initialized now, | ||
157 | * but some depends on what Guest we are running (which is set up in | ||
158 | * copy_in_guest_info()). | ||
159 | */ | ||
109 | for_each_possible_cpu(i) { | 160 | for_each_possible_cpu(i) { |
161 | /* lguest_pages() returns this CPU's two pages. */ | ||
110 | struct lguest_pages *pages = lguest_pages(i); | 162 | struct lguest_pages *pages = lguest_pages(i); |
163 | /* This is a convenience pointer to make the code fit one | ||
164 | * statement to a line. */ | ||
111 | struct lguest_ro_state *state = &pages->state; | 165 | struct lguest_ro_state *state = &pages->state; |
112 | 166 | ||
113 | /* These fields are static: rest done in copy_in_guest_info */ | 167 | /* The Global Descriptor Table: the Host has a different one |
168 | * for each CPU. We keep a descriptor for the GDT which says | ||
169 | * where it is and how big it is (the size is actually the last | ||
170 | * byte, not the size, hence the "-1"). */ | ||
114 | state->host_gdt_desc.size = GDT_SIZE-1; | 171 | state->host_gdt_desc.size = GDT_SIZE-1; |
115 | state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); | 172 | state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); |
173 | |||
174 | /* All CPUs on the Host use the same Interrupt Descriptor | ||
175 | * Table, so we just use store_idt(), which gets this CPU's IDT | ||
176 | * descriptor. */ | ||
116 | store_idt(&state->host_idt_desc); | 177 | store_idt(&state->host_idt_desc); |
178 | |||
179 | /* The descriptors for the Guest's GDT and IDT can be filled | ||
180 | * out now, too. We copy the GDT & IDT into ->guest_gdt and | ||
181 | * ->guest_idt before actually running the Guest. */ | ||
117 | state->guest_idt_desc.size = sizeof(state->guest_idt)-1; | 182 | state->guest_idt_desc.size = sizeof(state->guest_idt)-1; |
118 | state->guest_idt_desc.address = (long)&state->guest_idt; | 183 | state->guest_idt_desc.address = (long)&state->guest_idt; |
119 | state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; | 184 | state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; |
120 | state->guest_gdt_desc.address = (long)&state->guest_gdt; | 185 | state->guest_gdt_desc.address = (long)&state->guest_gdt; |
186 | |||
187 | /* We know where we want the stack to be when the Guest enters | ||
188 | * the switcher: in pages->regs. The stack grows downwards, so | ||
189 | * we start it at the end of that structure. */ | ||
121 | state->guest_tss.esp0 = (long)(&pages->regs + 1); | 190 | state->guest_tss.esp0 = (long)(&pages->regs + 1); |
191 | /* And this is the GDT entry to use for the stack: we keep a | ||
192 | * couple of special LGUEST entries. */ | ||
122 | state->guest_tss.ss0 = LGUEST_DS; | 193 | state->guest_tss.ss0 = LGUEST_DS; |
123 | /* No I/O for you! */ | 194 | |
195 | /* x86 can have a finegrained bitmap which indicates what I/O | ||
196 | * ports the process can use. We set it to the end of our | ||
197 | * structure, meaning "none". */ | ||
124 | state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); | 198 | state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); |
199 | |||
200 | /* Some GDT entries are the same across all Guests, so we can | ||
201 | * set them up now. */ | ||
125 | setup_default_gdt_entries(state); | 202 | setup_default_gdt_entries(state); |
203 | /* Most IDT entries are the same for all Guests, too.*/ | ||
126 | setup_default_idt_entries(state, default_idt_entries); | 204 | setup_default_idt_entries(state, default_idt_entries); |
127 | 205 | ||
128 | /* Setup LGUEST segments on all cpus */ | 206 | /* The Host needs to be able to use the LGUEST segments on this |
207 | * CPU, too, so put them in the Host GDT. */ | ||
129 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | 208 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; |
130 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | 209 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; |
131 | } | 210 | } |
132 | 211 | ||
133 | /* Initialize entry point into switcher. */ | 212 | /* In the Switcher, we want the %cs segment register to use the |
213 | * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so | ||
214 | * it will be undisturbed when we switch. To change %cs and jump we | ||
215 | * need this structure to feed to Intel's "lcall" instruction. */ | ||
134 | lguest_entry.offset = (long)switch_to_guest + switcher_offset(); | 216 | lguest_entry.offset = (long)switch_to_guest + switcher_offset(); |
135 | lguest_entry.segment = LGUEST_CS; | 217 | lguest_entry.segment = LGUEST_CS; |
136 | 218 | ||
137 | printk(KERN_INFO "lguest: mapped switcher at %p\n", | 219 | printk(KERN_INFO "lguest: mapped switcher at %p\n", |
138 | switcher_vma->addr); | 220 | switcher_vma->addr); |
221 | /* And we succeeded... */ | ||
139 | return 0; | 222 | return 0; |
140 | 223 | ||
141 | free_vma: | 224 | free_vma: |
@@ -149,35 +232,58 @@ free_some_pages: | |||
149 | out: | 232 | out: |
150 | return err; | 233 | return err; |
151 | } | 234 | } |
235 | /*:*/ | ||
152 | 236 | ||
237 | /* Cleaning up the mapping when the module is unloaded is almost... | ||
238 | * too easy. */ | ||
153 | static void unmap_switcher(void) | 239 | static void unmap_switcher(void) |
154 | { | 240 | { |
155 | unsigned int i; | 241 | unsigned int i; |
156 | 242 | ||
243 | /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */ | ||
157 | vunmap(switcher_vma->addr); | 244 | vunmap(switcher_vma->addr); |
245 | /* Now we just need to free the pages we copied the switcher into */ | ||
158 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | 246 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) |
159 | __free_pages(switcher_page[i], 0); | 247 | __free_pages(switcher_page[i], 0); |
160 | } | 248 | } |
161 | 249 | ||
162 | /* IN/OUT insns: enough to get us past boot-time probing. */ | 250 | /*H:130 Our Guest is usually so well behaved; it never tries to do things it |
251 | * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite | ||
252 | * complete, because it doesn't contain replacements for the Intel I/O | ||
253 | * instructions. As a result, the Guest sometimes fumbles across one during | ||
254 | * the boot process as it probes for various things which are usually attached | ||
255 | * to a PC. | ||
256 | * | ||
257 | * When the Guest uses one of these instructions, we get trap #13 (General | ||
258 | * Protection Fault) and come here. We see if it's one of those troublesome | ||
259 | * instructions and skip over it. We return true if we did. */ | ||
163 | static int emulate_insn(struct lguest *lg) | 260 | static int emulate_insn(struct lguest *lg) |
164 | { | 261 | { |
165 | u8 insn; | 262 | u8 insn; |
166 | unsigned int insnlen = 0, in = 0, shift = 0; | 263 | unsigned int insnlen = 0, in = 0, shift = 0; |
264 | /* The eip contains the *virtual* address of the Guest's instruction: | ||
265 | * guest_pa just subtracts the Guest's page_offset. */ | ||
167 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); | 266 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); |
168 | 267 | ||
169 | /* This only works for addresses in linear mapping... */ | 268 | /* The guest_pa() function only works for Guest kernel addresses, but |
269 | * that's all we're trying to do anyway. */ | ||
170 | if (lg->regs->eip < lg->page_offset) | 270 | if (lg->regs->eip < lg->page_offset) |
171 | return 0; | 271 | return 0; |
272 | |||
273 | /* Decoding x86 instructions is icky. */ | ||
172 | lgread(lg, &insn, physaddr, 1); | 274 | lgread(lg, &insn, physaddr, 1); |
173 | 275 | ||
174 | /* Operand size prefix means it's actually for ax. */ | 276 | /* 0x66 is an "operand prefix". It means it's using the upper 16 bits |
277 | of the eax register. */ | ||
175 | if (insn == 0x66) { | 278 | if (insn == 0x66) { |
176 | shift = 16; | 279 | shift = 16; |
280 | /* The instruction is 1 byte so far, read the next byte. */ | ||
177 | insnlen = 1; | 281 | insnlen = 1; |
178 | lgread(lg, &insn, physaddr + insnlen, 1); | 282 | lgread(lg, &insn, physaddr + insnlen, 1); |
179 | } | 283 | } |
180 | 284 | ||
285 | /* We can ignore the lower bit for the moment and decode the 4 opcodes | ||
286 | * we need to emulate. */ | ||
181 | switch (insn & 0xFE) { | 287 | switch (insn & 0xFE) { |
182 | case 0xE4: /* in <next byte>,%al */ | 288 | case 0xE4: /* in <next byte>,%al */ |
183 | insnlen += 2; | 289 | insnlen += 2; |
@@ -194,9 +300,13 @@ static int emulate_insn(struct lguest *lg) | |||
194 | insnlen += 1; | 300 | insnlen += 1; |
195 | break; | 301 | break; |
196 | default: | 302 | default: |
303 | /* OK, we don't know what this is, can't emulate. */ | ||
197 | return 0; | 304 | return 0; |
198 | } | 305 | } |
199 | 306 | ||
307 | /* If it was an "IN" instruction, they expect the result to be read | ||
308 | * into %eax, so we change %eax. We always return all-ones, which | ||
309 | * traditionally means "there's nothing there". */ | ||
200 | if (in) { | 310 | if (in) { |
201 | /* Lower bit tells us whether it's a 16 or 32 bit access */ | 311 | /* Lower bit tells us whether it's a 16 or 32 bit access */ |
202 | if (insn & 0x1) | 312 | if (insn & 0x1) |
@@ -204,9 +314,12 @@ static int emulate_insn(struct lguest *lg) | |||
204 | else | 314 | else |
205 | lg->regs->eax |= (0xFFFF << shift); | 315 | lg->regs->eax |= (0xFFFF << shift); |
206 | } | 316 | } |
317 | /* Finally, we've "done" the instruction, so move past it. */ | ||
207 | lg->regs->eip += insnlen; | 318 | lg->regs->eip += insnlen; |
319 | /* Success! */ | ||
208 | return 1; | 320 | return 1; |
209 | } | 321 | } |
322 | /*:*/ | ||
210 | 323 | ||
211 | /*L:305 | 324 | /*L:305 |
212 | * Dealing With Guest Memory. | 325 | * Dealing With Guest Memory. |
@@ -321,13 +434,24 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | |||
321 | : "memory", "%edx", "%ecx", "%edi", "%esi"); | 434 | : "memory", "%edx", "%ecx", "%edi", "%esi"); |
322 | } | 435 | } |
323 | 436 | ||
437 | /*H:030 Let's jump straight to the main loop which runs the Guest. | ||
438 | * Remember, this is called by the Launcher reading /dev/lguest, and we keep | ||
439 | * going around and around until something interesting happens. */ | ||
324 | int run_guest(struct lguest *lg, unsigned long __user *user) | 440 | int run_guest(struct lguest *lg, unsigned long __user *user) |
325 | { | 441 | { |
442 | /* We stop running once the Guest is dead. */ | ||
326 | while (!lg->dead) { | 443 | while (!lg->dead) { |
444 | /* We need to initialize this, otherwise gcc complains. It's | ||
445 | * not (yet) clever enough to see that it's initialized when we | ||
446 | * need it. */ | ||
327 | unsigned int cr2 = 0; /* Damn gcc */ | 447 | unsigned int cr2 = 0; /* Damn gcc */ |
328 | 448 | ||
329 | /* Hypercalls first: we might have been out to userspace */ | 449 | /* First we run any hypercalls the Guest wants done: either in |
450 | * the hypercall ring in "struct lguest_data", or directly by | ||
451 | * using int 31 (LGUEST_TRAP_ENTRY). */ | ||
330 | do_hypercalls(lg); | 452 | do_hypercalls(lg); |
453 | /* It's possible the Guest did a SEND_DMA hypercall to the | ||
454 | * Launcher, in which case we return from the read() now. */ | ||
331 | if (lg->dma_is_pending) { | 455 | if (lg->dma_is_pending) { |
332 | if (put_user(lg->pending_dma, user) || | 456 | if (put_user(lg->pending_dma, user) || |
333 | put_user(lg->pending_key, user+1)) | 457 | put_user(lg->pending_key, user+1)) |
@@ -335,6 +459,7 @@ int run_guest(struct lguest *lg, unsigned long __user *user) | |||
335 | return sizeof(unsigned long)*2; | 459 | return sizeof(unsigned long)*2; |
336 | } | 460 | } |
337 | 461 | ||
462 | /* Check for signals */ | ||
338 | if (signal_pending(current)) | 463 | if (signal_pending(current)) |
339 | return -ERESTARTSYS; | 464 | return -ERESTARTSYS; |
340 | 465 | ||
@@ -342,77 +467,154 @@ int run_guest(struct lguest *lg, unsigned long __user *user) | |||
342 | if (lg->break_out) | 467 | if (lg->break_out) |
343 | return -EAGAIN; | 468 | return -EAGAIN; |
344 | 469 | ||
470 | /* Check if there are any interrupts which can be delivered | ||
471 | * now: if so, this sets up the handler to be executed when we | ||
472 | * next run the Guest. */ | ||
345 | maybe_do_interrupt(lg); | 473 | maybe_do_interrupt(lg); |
346 | 474 | ||
475 | /* All long-lived kernel loops need to check with this horrible | ||
476 | * thing called the freezer. If the Host is trying to suspend, | ||
477 | * it stops us. */ | ||
347 | try_to_freeze(); | 478 | try_to_freeze(); |
348 | 479 | ||
480 | /* Just make absolutely sure the Guest is still alive. One of | ||
481 | * those hypercalls could have been fatal, for example. */ | ||
349 | if (lg->dead) | 482 | if (lg->dead) |
350 | break; | 483 | break; |
351 | 484 | ||
485 | /* If the Guest asked to be stopped, we sleep. The Guest's | ||
486 | * clock timer or LHCALL_BREAK from the Waker will wake us. */ | ||
352 | if (lg->halted) { | 487 | if (lg->halted) { |
353 | set_current_state(TASK_INTERRUPTIBLE); | 488 | set_current_state(TASK_INTERRUPTIBLE); |
354 | schedule(); | 489 | schedule(); |
355 | continue; | 490 | continue; |
356 | } | 491 | } |
357 | 492 | ||
493 | /* OK, now we're ready to jump into the Guest. First we put up | ||
494 | * the "Do Not Disturb" sign: */ | ||
358 | local_irq_disable(); | 495 | local_irq_disable(); |
359 | 496 | ||
360 | /* Even if *we* don't want FPU trap, guest might... */ | 497 | /* Remember the awfully-named TS bit? If the Guest has asked |
498 | * to set it we set it now, so we can trap and pass that trap | ||
499 | * to the Guest if it uses the FPU. */ | ||
361 | if (lg->ts) | 500 | if (lg->ts) |
362 | set_ts(); | 501 | set_ts(); |
363 | 502 | ||
364 | /* Don't let Guest do SYSENTER: we can't handle it. */ | 503 | /* SYSENTER is an optimized way of doing system calls. We |
504 | * can't allow it because it always jumps to privilege level 0. | ||
505 | * A normal Guest won't try it because we don't advertise it in | ||
506 | * CPUID, but a malicious Guest (or malicious Guest userspace | ||
507 | * program) could, so we tell the CPU to disable it before | ||
508 | * running the Guest. */ | ||
365 | if (boot_cpu_has(X86_FEATURE_SEP)) | 509 | if (boot_cpu_has(X86_FEATURE_SEP)) |
366 | wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); | 510 | wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); |
367 | 511 | ||
512 | /* Now we actually run the Guest. It will pop back out when | ||
513 | * something interesting happens, and we can examine its | ||
514 | * registers to see what it was doing. */ | ||
368 | run_guest_once(lg, lguest_pages(raw_smp_processor_id())); | 515 | run_guest_once(lg, lguest_pages(raw_smp_processor_id())); |
369 | 516 | ||
370 | /* Save cr2 now if we page-faulted. */ | 517 | /* The "regs" pointer contains two extra entries which are not |
518 | * really registers: a trap number which says what interrupt or | ||
519 | * trap made the switcher code come back, and an error code | ||
520 | * which some traps set. */ | ||
521 | |||
522 | /* If the Guest page faulted, then the cr2 register will tell | ||
523 | * us the bad virtual address. We have to grab this now, | ||
524 | * because once we re-enable interrupts an interrupt could | ||
525 | * fault and thus overwrite cr2, or we could even move off to a | ||
526 | * different CPU. */ | ||
371 | if (lg->regs->trapnum == 14) | 527 | if (lg->regs->trapnum == 14) |
372 | cr2 = read_cr2(); | 528 | cr2 = read_cr2(); |
529 | /* Similarly, if we took a trap because the Guest used the FPU, | ||
530 | * we have to restore the FPU it expects to see. */ | ||
373 | else if (lg->regs->trapnum == 7) | 531 | else if (lg->regs->trapnum == 7) |
374 | math_state_restore(); | 532 | math_state_restore(); |
375 | 533 | ||
534 | /* Restore SYSENTER if it's supposed to be on. */ | ||
376 | if (boot_cpu_has(X86_FEATURE_SEP)) | 535 | if (boot_cpu_has(X86_FEATURE_SEP)) |
377 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | 536 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); |
537 | |||
538 | /* Now we're ready to be interrupted or moved to other CPUs */ | ||
378 | local_irq_enable(); | 539 | local_irq_enable(); |
379 | 540 | ||
541 | /* OK, so what happened? */ | ||
380 | switch (lg->regs->trapnum) { | 542 | switch (lg->regs->trapnum) { |
381 | case 13: /* We've intercepted a GPF. */ | 543 | case 13: /* We've intercepted a GPF. */ |
544 | /* Check if this was one of those annoying IN or OUT | ||
545 | * instructions which we need to emulate. If so, we | ||
546 | * just go back into the Guest after we've done it. */ | ||
382 | if (lg->regs->errcode == 0) { | 547 | if (lg->regs->errcode == 0) { |
383 | if (emulate_insn(lg)) | 548 | if (emulate_insn(lg)) |
384 | continue; | 549 | continue; |
385 | } | 550 | } |
386 | break; | 551 | break; |
387 | case 14: /* We've intercepted a page fault. */ | 552 | case 14: /* We've intercepted a page fault. */ |
553 | /* The Guest accessed a virtual address that wasn't | ||
554 | * mapped. This happens a lot: we don't actually set | ||
555 | * up most of the page tables for the Guest at all when | ||
556 | * we start: as it runs it asks for more and more, and | ||
557 | * we set them up as required. In this case, we don't | ||
558 | * even tell the Guest that the fault happened. | ||
559 | * | ||
560 | * The errcode tells whether this was a read or a | ||
561 | * write, and whether kernel or userspace code. */ | ||
388 | if (demand_page(lg, cr2, lg->regs->errcode)) | 562 | if (demand_page(lg, cr2, lg->regs->errcode)) |
389 | continue; | 563 | continue; |
390 | 564 | ||
391 | /* If lguest_data is NULL, this won't hurt. */ | 565 | /* OK, it's really not there (or not OK): the Guest |
566 | * needs to know. We write out the cr2 value so it | ||
567 | * knows where the fault occurred. | ||
568 | * | ||
569 | * Note that if the Guest were really messed up, this | ||
570 | * could happen before it's done the INITIALIZE | ||
571 | * hypercall, so lg->lguest_data will be NULL, so | ||
572 | * &lg->lguest_data->cr2 will be address 8. Writing | ||
573 | * into that address won't hurt the Host at all, | ||
574 | * though. */ | ||
392 | if (put_user(cr2, &lg->lguest_data->cr2)) | 575 | if (put_user(cr2, &lg->lguest_data->cr2)) |
393 | kill_guest(lg, "Writing cr2"); | 576 | kill_guest(lg, "Writing cr2"); |
394 | break; | 577 | break; |
395 | case 7: /* We've intercepted a Device Not Available fault. */ | 578 | case 7: /* We've intercepted a Device Not Available fault. */ |
396 | /* If they don't want to know, just absorb it. */ | 579 | /* If the Guest doesn't want to know, we already |
580 | * restored the Floating Point Unit, so we just | ||
581 | * continue without telling it. */ | ||
397 | if (!lg->ts) | 582 | if (!lg->ts) |
398 | continue; | 583 | continue; |
399 | break; | 584 | break; |
400 | case 32 ... 255: /* Real interrupt, fall thru */ | 585 | case 32 ... 255: |
586 | /* These values mean a real interrupt occurred, in | ||
587 | * which case the Host handler has already been run. | ||
588 | * We just do a friendly check if another process | ||
589 | * should now be run, then fall through to loop | ||
590 | * around: */ | ||
401 | cond_resched(); | 591 | cond_resched(); |
402 | case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ | 592 | case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ |
403 | continue; | 593 | continue; |
404 | } | 594 | } |
405 | 595 | ||
596 | /* If we get here, it's a trap the Guest wants to know | ||
597 | * about. */ | ||
406 | if (deliver_trap(lg, lg->regs->trapnum)) | 598 | if (deliver_trap(lg, lg->regs->trapnum)) |
407 | continue; | 599 | continue; |
408 | 600 | ||
601 | /* If the Guest doesn't have a handler (either it hasn't | ||
602 | * registered any yet, or it's one of the faults we don't let | ||
603 | * it handle), it dies with a cryptic error message. */ | ||
409 | kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", | 604 | kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", |
410 | lg->regs->trapnum, lg->regs->eip, | 605 | lg->regs->trapnum, lg->regs->eip, |
411 | lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); | 606 | lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); |
412 | } | 607 | } |
608 | /* The Guest is dead => "No such file or directory" */ | ||
413 | return -ENOENT; | 609 | return -ENOENT; |
414 | } | 610 | } |
415 | 611 | ||
612 | /* Now we can look at each of the routines this calls, in increasing order of | ||
613 | * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), | ||
614 | * deliver_trap() and demand_page(). After all those, we'll be ready to | ||
615 | * examine the Switcher, and our philosophical understanding of the Host/Guest | ||
616 | * duality will be complete. :*/ | ||
617 | |||
416 | int find_free_guest(void) | 618 | int find_free_guest(void) |
417 | { | 619 | { |
418 | unsigned int i; | 620 | unsigned int i; |
@@ -430,55 +632,96 @@ static void adjust_pge(void *on) | |||
430 | write_cr4(read_cr4() & ~X86_CR4_PGE); | 632 | write_cr4(read_cr4() & ~X86_CR4_PGE); |
431 | } | 633 | } |
432 | 634 | ||
635 | /*H:000 | ||
636 | * Welcome to the Host! | ||
637 | * | ||
638 | * By this point your brain has been tickled by the Guest code and numbed by | ||
639 | * the Launcher code; prepare for it to be stretched by the Host code. This is | ||
640 | * the heart. Let's begin at the initialization routine for the Host's lg | ||
641 | * module. | ||
642 | */ | ||
433 | static int __init init(void) | 643 | static int __init init(void) |
434 | { | 644 | { |
435 | int err; | 645 | int err; |
436 | 646 | ||
647 | /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ | ||
437 | if (paravirt_enabled()) { | 648 | if (paravirt_enabled()) { |
438 | printk("lguest is afraid of %s\n", paravirt_ops.name); | 649 | printk("lguest is afraid of %s\n", paravirt_ops.name); |
439 | return -EPERM; | 650 | return -EPERM; |
440 | } | 651 | } |
441 | 652 | ||
653 | /* First we put the Switcher up in very high virtual memory. */ | ||
442 | err = map_switcher(); | 654 | err = map_switcher(); |
443 | if (err) | 655 | if (err) |
444 | return err; | 656 | return err; |
445 | 657 | ||
658 | /* Now we set up the pagetable implementation for the Guests. */ | ||
446 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | 659 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); |
447 | if (err) { | 660 | if (err) { |
448 | unmap_switcher(); | 661 | unmap_switcher(); |
449 | return err; | 662 | return err; |
450 | } | 663 | } |
664 | |||
665 | /* The I/O subsystem needs some things initialized. */ | ||
451 | lguest_io_init(); | 666 | lguest_io_init(); |
452 | 667 | ||
668 | /* /dev/lguest needs to be registered. */ | ||
453 | err = lguest_device_init(); | 669 | err = lguest_device_init(); |
454 | if (err) { | 670 | if (err) { |
455 | free_pagetables(); | 671 | free_pagetables(); |
456 | unmap_switcher(); | 672 | unmap_switcher(); |
457 | return err; | 673 | return err; |
458 | } | 674 | } |
675 | |||
676 | /* Finally, we need to turn off "Page Global Enable". PGE is an | ||
677 | * optimization where page table entries are specially marked to show | ||
678 | * they never change. The Host kernel marks all the kernel pages this | ||
679 | * way because it's always present, even when userspace is running. | ||
680 | * | ||
681 | * Lguest breaks this: unbeknownst to the rest of the Host kernel, we | ||
682 | * switch to the Guest kernel. If you don't disable this on all CPUs, | ||
683 | * you'll get really weird bugs that you'll chase for two days. | ||
684 | * | ||
685 | * I used to turn PGE off every time we switched to the Guest and back | ||
686 | * on when we return, but that slowed the Switcher down noticeably. */ | ||
687 | |||
688 | /* We don't need the complexity of CPUs coming and going while we're | ||
689 | * doing this. */ | ||
459 | lock_cpu_hotplug(); | 690 | lock_cpu_hotplug(); |
460 | if (cpu_has_pge) { /* We have a broader idea of "global". */ | 691 | if (cpu_has_pge) { /* We have a broader idea of "global". */ |
692 | /* Remember that this was originally set (for cleanup). */ | ||
461 | cpu_had_pge = 1; | 693 | cpu_had_pge = 1; |
694 | /* adjust_pge is a helper function which sets or unsets the PGE | ||
695 | * bit on its CPU, depending on the argument (0 == unset). */ | ||
462 | on_each_cpu(adjust_pge, (void *)0, 0, 1); | 696 | on_each_cpu(adjust_pge, (void *)0, 0, 1); |
697 | /* Turn off the feature in the global feature set. */ | ||
463 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 698 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); |
464 | } | 699 | } |
465 | unlock_cpu_hotplug(); | 700 | unlock_cpu_hotplug(); |
701 | |||
702 | /* All good! */ | ||
466 | return 0; | 703 | return 0; |
467 | } | 704 | } |
468 | 705 | ||
706 | /* Cleaning up is just the same code, backwards. With a little French. */ | ||
469 | static void __exit fini(void) | 707 | static void __exit fini(void) |
470 | { | 708 | { |
471 | lguest_device_remove(); | 709 | lguest_device_remove(); |
472 | free_pagetables(); | 710 | free_pagetables(); |
473 | unmap_switcher(); | 711 | unmap_switcher(); |
712 | |||
713 | /* If we had PGE before we started, turn it back on now. */ | ||
474 | lock_cpu_hotplug(); | 714 | lock_cpu_hotplug(); |
475 | if (cpu_had_pge) { | 715 | if (cpu_had_pge) { |
476 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 716 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); |
717 | /* adjust_pge's argument "1" means set PGE. */ | ||
477 | on_each_cpu(adjust_pge, (void *)1, 0, 1); | 718 | on_each_cpu(adjust_pge, (void *)1, 0, 1); |
478 | } | 719 | } |
479 | unlock_cpu_hotplug(); | 720 | unlock_cpu_hotplug(); |
480 | } | 721 | } |
481 | 722 | ||
723 | /* The Host side of lguest can be a module. This is a nice way for people to | ||
724 | * play with it. */ | ||
482 | module_init(init); | 725 | module_init(init); |
483 | module_exit(fini); | 726 | module_exit(fini); |
484 | MODULE_LICENSE("GPL"); | 727 | MODULE_LICENSE("GPL"); |
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index fb546b046445..7a5299f9679d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -28,37 +28,63 @@ | |||
28 | #include <irq_vectors.h> | 28 | #include <irq_vectors.h> |
29 | #include "lg.h" | 29 | #include "lg.h" |
30 | 30 | ||
31 | /*H:120 This is the core hypercall routine: where the Guest gets what it | ||
32 | * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both. | ||
33 | * | ||
34 | * Remember from the Guest: %eax == which call to make, and the arguments are | ||
35 | * packed into %edx, %ebx and %ecx if needed. */ | ||
31 | static void do_hcall(struct lguest *lg, struct lguest_regs *regs) | 36 | static void do_hcall(struct lguest *lg, struct lguest_regs *regs) |
32 | { | 37 | { |
33 | switch (regs->eax) { | 38 | switch (regs->eax) { |
34 | case LHCALL_FLUSH_ASYNC: | 39 | case LHCALL_FLUSH_ASYNC: |
40 | /* This call does nothing, except by breaking out of the Guest | ||
41 | * it makes us process all the asynchronous hypercalls. */ | ||
35 | break; | 42 | break; |
36 | case LHCALL_LGUEST_INIT: | 43 | case LHCALL_LGUEST_INIT: |
44 | /* You can't get here unless you're already initialized. Don't | ||
45 | * do that. */ | ||
37 | kill_guest(lg, "already have lguest_data"); | 46 | kill_guest(lg, "already have lguest_data"); |
38 | break; | 47 | break; |
39 | case LHCALL_CRASH: { | 48 | case LHCALL_CRASH: { |
49 | /* Crash is such a trivial hypercall that we do it in four | ||
50 | * lines right here. */ | ||
40 | char msg[128]; | 51 | char msg[128]; |
52 | /* If the lgread fails, it will call kill_guest() itself; the | ||
53 | * kill_guest() with the message will be ignored. */ | ||
41 | lgread(lg, msg, regs->edx, sizeof(msg)); | 54 | lgread(lg, msg, regs->edx, sizeof(msg)); |
42 | msg[sizeof(msg)-1] = '\0'; | 55 | msg[sizeof(msg)-1] = '\0'; |
43 | kill_guest(lg, "CRASH: %s", msg); | 56 | kill_guest(lg, "CRASH: %s", msg); |
44 | break; | 57 | break; |
45 | } | 58 | } |
46 | case LHCALL_FLUSH_TLB: | 59 | case LHCALL_FLUSH_TLB: |
60 | /* FLUSH_TLB comes in two flavors, depending on the | ||
61 | * argument: */ | ||
47 | if (regs->edx) | 62 | if (regs->edx) |
48 | guest_pagetable_clear_all(lg); | 63 | guest_pagetable_clear_all(lg); |
49 | else | 64 | else |
50 | guest_pagetable_flush_user(lg); | 65 | guest_pagetable_flush_user(lg); |
51 | break; | 66 | break; |
52 | case LHCALL_GET_WALLCLOCK: { | 67 | case LHCALL_GET_WALLCLOCK: { |
68 | /* The Guest wants to know the real time in seconds since 1970, | ||
69 | * in good Unix tradition. */ | ||
53 | struct timespec ts; | 70 | struct timespec ts; |
54 | ktime_get_real_ts(&ts); | 71 | ktime_get_real_ts(&ts); |
55 | regs->eax = ts.tv_sec; | 72 | regs->eax = ts.tv_sec; |
56 | break; | 73 | break; |
57 | } | 74 | } |
58 | case LHCALL_BIND_DMA: | 75 | case LHCALL_BIND_DMA: |
76 | /* BIND_DMA really wants four arguments, but it's the only call | ||
77 | * which does. So the Guest packs the number of buffers and | ||
78 | * the interrupt number into the final argument, and we decode | ||
79 | * it here. This can legitimately fail, since we currently | ||
80 | * place a limit on the number of DMA pools a Guest can have. | ||
81 | * So we return true or false from this call. */ | ||
59 | regs->eax = bind_dma(lg, regs->edx, regs->ebx, | 82 | regs->eax = bind_dma(lg, regs->edx, regs->ebx, |
60 | regs->ecx >> 8, regs->ecx & 0xFF); | 83 | regs->ecx >> 8, regs->ecx & 0xFF); |
61 | break; | 84 | break; |
85 | |||
86 | /* All these calls simply pass the arguments through to the right | ||
87 | * routines. */ | ||
62 | case LHCALL_SEND_DMA: | 88 | case LHCALL_SEND_DMA: |
63 | send_dma(lg, regs->edx, regs->ebx); | 89 | send_dma(lg, regs->edx, regs->ebx); |
64 | break; | 90 | break; |
@@ -86,10 +112,13 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs) | |||
86 | case LHCALL_SET_CLOCKEVENT: | 112 | case LHCALL_SET_CLOCKEVENT: |
87 | guest_set_clockevent(lg, regs->edx); | 113 | guest_set_clockevent(lg, regs->edx); |
88 | break; | 114 | break; |
115 | |||
89 | case LHCALL_TS: | 116 | case LHCALL_TS: |
117 | /* This sets the TS flag, as we saw used in run_guest(). */ | ||
90 | lg->ts = regs->edx; | 118 | lg->ts = regs->edx; |
91 | break; | 119 | break; |
92 | case LHCALL_HALT: | 120 | case LHCALL_HALT: |
121 | /* Similarly, this sets the halted flag for run_guest(). */ | ||
93 | lg->halted = 1; | 122 | lg->halted = 1; |
94 | break; | 123 | break; |
95 | default: | 124 | default: |
@@ -97,25 +126,42 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs) | |||
97 | } | 126 | } |
98 | } | 127 | } |
99 | 128 | ||
100 | /* We always do queued calls before actual hypercall. */ | 129 | /* Asynchronous hypercalls are easy: we just look in the array in the Guest's |
130 | * "struct lguest_data" and see if there are any new ones marked "ready". | ||
131 | * | ||
132 | * We are careful to do these in order: obviously we respect the order the | ||
133 | * Guest put them in the ring, but we also promise the Guest that they will | ||
134 | * happen before any normal hypercall (which is why we check this before | ||
135 | * checking for a normal hcall). */ | ||
101 | static void do_async_hcalls(struct lguest *lg) | 136 | static void do_async_hcalls(struct lguest *lg) |
102 | { | 137 | { |
103 | unsigned int i; | 138 | unsigned int i; |
104 | u8 st[LHCALL_RING_SIZE]; | 139 | u8 st[LHCALL_RING_SIZE]; |
105 | 140 | ||
141 | /* For simplicity, we copy the entire call status array in at once. */ | ||
106 | if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) | 142 | if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) |
107 | return; | 143 | return; |
108 | 144 | ||
145 | |||
146 | /* We process "struct lguest_data"s hcalls[] ring once. */ | ||
109 | for (i = 0; i < ARRAY_SIZE(st); i++) { | 147 | for (i = 0; i < ARRAY_SIZE(st); i++) { |
110 | struct lguest_regs regs; | 148 | struct lguest_regs regs; |
149 | /* We remember where we were up to from last time. This makes | ||
150 | * sure that the hypercalls are done in the order the Guest | ||
151 | * places them in the ring. */ | ||
111 | unsigned int n = lg->next_hcall; | 152 | unsigned int n = lg->next_hcall; |
112 | 153 | ||
154 | /* 0xFF means there's no call here (yet). */ | ||
113 | if (st[n] == 0xFF) | 155 | if (st[n] == 0xFF) |
114 | break; | 156 | break; |
115 | 157 | ||
158 | /* OK, we have a hypercall. Increment the "next_hcall" cursor, | ||
159 | * and wrap back to 0 if we reach the end. */ | ||
116 | if (++lg->next_hcall == LHCALL_RING_SIZE) | 160 | if (++lg->next_hcall == LHCALL_RING_SIZE) |
117 | lg->next_hcall = 0; | 161 | lg->next_hcall = 0; |
118 | 162 | ||
163 | /* We copy the hypercall arguments into a fake register | ||
164 | * structure. This makes life simple for do_hcall(). */ | ||
119 | if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) | 165 | if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) |
120 | || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) | 166 | || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) |
121 | || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) | 167 | || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) |
@@ -124,74 +170,126 @@ static void do_async_hcalls(struct lguest *lg) | |||
124 | break; | 170 | break; |
125 | } | 171 | } |
126 | 172 | ||
173 | /* Do the hypercall, same as a normal one. */ | ||
128 | do_hcall(lg, &regs); | 174 | do_hcall(lg, &regs); |
175 | |||
176 | /* Mark the hypercall done. */ | ||
128 | if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { | 177 | if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { |
129 | kill_guest(lg, "Writing result for async hypercall"); | 178 | kill_guest(lg, "Writing result for async hypercall"); |
130 | break; | 179 | break; |
131 | } | 180 | } |
132 | 181 | ||
182 | /* Stop doing hypercalls if we've just done a DMA to the | ||
183 | * Launcher: it needs to service this first. */ | ||
133 | if (lg->dma_is_pending) | 184 | if (lg->dma_is_pending) |
134 | break; | 185 | break; |
135 | } | 186 | } |
136 | } | 187 | } |
137 | 188 | ||
189 | /* Last of all, we look at what happens first of all. The very first time the | ||
190 | * Guest makes a hypercall, we end up here to set things up: */ | ||
138 | static void initialize(struct lguest *lg) | 191 | static void initialize(struct lguest *lg) |
139 | { | 192 | { |
140 | u32 tsc_speed; | 193 | u32 tsc_speed; |
141 | 194 | ||
195 | /* You can't do anything until you're initialized. The Guest knows the | ||
196 | * rules, so we're unforgiving here. */ | ||
142 | if (lg->regs->eax != LHCALL_LGUEST_INIT) { | 197 | if (lg->regs->eax != LHCALL_LGUEST_INIT) { |
143 | kill_guest(lg, "hypercall %li before LGUEST_INIT", | 198 | kill_guest(lg, "hypercall %li before LGUEST_INIT", |
144 | lg->regs->eax); | 199 | lg->regs->eax); |
145 | return; | 200 | return; |
146 | } | 201 | } |
147 | 202 | ||
148 | /* We only tell the guest to use the TSC if it's reliable. */ | 203 | /* We insist that the Time Stamp Counter exist and doesn't change with |
204 | * cpu frequency. Some devious chip manufacturers decided that TSC | ||
205 | * changes could be handled in software. I decided that time going | ||
206 | * backwards might be good for benchmarks, but it's bad for users. | ||
207 | * | ||
208 | * We also insist that the TSC be stable: the kernel detects unreliable | ||
209 | * TSCs for its own purposes, and we use that here. */ | ||
149 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) | 210 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) |
150 | tsc_speed = tsc_khz; | 211 | tsc_speed = tsc_khz; |
151 | else | 212 | else |
152 | tsc_speed = 0; | 213 | tsc_speed = 0; |
153 | 214 | ||
215 | /* The pointer to the Guest's "struct lguest_data" is the only | ||
216 | * argument. */ | ||
154 | lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; | 217 | lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; |
155 | /* We check here so we can simply copy_to_user/from_user */ | 218 | /* If we check the address they gave is OK now, we can simply |
219 | * copy_to_user/from_user from now on rather than using lgread/lgwrite. | ||
220 | * I put this in to show that I'm not immune to writing stupid | ||
221 | * optimizations. */ | ||
156 | if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { | 222 | if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { |
157 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 223 | kill_guest(lg, "bad guest page %p", lg->lguest_data); |
158 | return; | 224 | return; |
159 | } | 225 | } |
226 | /* The Guest tells us where we're not to deliver interrupts by putting | ||
227 | * the range of addresses into "struct lguest_data". */ | ||
160 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) | 228 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) |
161 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) | 229 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) |
162 | /* We reserve the top pgd entry. */ | 230 | /* We tell the Guest that it can't use the top 4MB of virtual |
231 | * addresses used by the Switcher. */ | ||
163 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | 232 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) |
164 | || put_user(tsc_speed, &lg->lguest_data->tsc_khz) | 233 | || put_user(tsc_speed, &lg->lguest_data->tsc_khz) |
234 | /* We also give the Guest a unique id, as used in lguest_net.c. */ | ||
165 | || put_user(lg->guestid, &lg->lguest_data->guestid)) | 235 | || put_user(lg->guestid, &lg->lguest_data->guestid)) |
166 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 236 | kill_guest(lg, "bad guest page %p", lg->lguest_data); |
167 | 237 | ||
168 | /* This is the one case where the above accesses might have | 238 | /* This is the one case where the above accesses might have been the |
169 | * been the first write to a Guest page. This may have caused | 239 | * first write to a Guest page. This may have caused a copy-on-write |
170 | * a copy-on-write fault, but the Guest might be referring to | 240 | * fault, but the Guest might be referring to the old (read-only) |
171 | * the old (read-only) page. */ | 241 | * page. */ |
172 | guest_pagetable_clear_all(lg); | 242 | guest_pagetable_clear_all(lg); |
173 | } | 243 | } |
244 | /* Now we've examined the hypercall code; our Guest can make requests. There | ||
245 | * is one other way we can do things for the Guest, as we see in | ||
246 | * emulate_insn(). */ | ||
174 | 247 | ||
175 | /* Even if we go out to userspace and come back, we don't want to do | 248 | /*H:110 Tricky point: we mark the hypercall as "done" once we've done it. |
176 | * the hypercall again. */ | 249 | * Normally we don't need to do this: the Guest will run again and update the |
250 | * trap number before we come back around the run_guest() loop to | ||
251 | * do_hypercalls(). | ||
252 | * | ||
253 | * However, if we are signalled or the Guest sends DMA to the Launcher, that | ||
254 | * loop will exit without running the Guest. When it comes back it would try | ||
255 | * to re-run the hypercall. */ | ||
177 | static void clear_hcall(struct lguest *lg) | 256 | static void clear_hcall(struct lguest *lg) |
178 | { | 257 | { |
179 | lg->regs->trapnum = 255; | 258 | lg->regs->trapnum = 255; |
180 | } | 259 | } |
181 | 260 | ||
261 | /*H:100 | ||
262 | * Hypercalls | ||
263 | * | ||
264 | * Remember from the Guest, hypercalls come in two flavors: normal and | ||
265 | * asynchronous. This file handles both types. | ||
266 | */ | ||
182 | void do_hypercalls(struct lguest *lg) | 267 | void do_hypercalls(struct lguest *lg) |
183 | { | 268 | { |
269 | /* Not initialized yet? */ | ||
184 | if (unlikely(!lg->lguest_data)) { | 270 | if (unlikely(!lg->lguest_data)) { |
271 | /* Did the Guest make a hypercall? We might have come back for | ||
272 | * some other reason (an interrupt, a different trap). */ | ||
185 | if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | 273 | if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { |
274 | /* Set up the "struct lguest_data" */ | ||
186 | initialize(lg); | 275 | initialize(lg); |
276 | /* The hypercall is done. */ | ||
187 | clear_hcall(lg); | 277 | clear_hcall(lg); |
188 | } | 278 | } |
189 | return; | 279 | return; |
190 | } | 280 | } |
191 | 281 | ||
282 | /* The Guest has initialized. | ||
283 | * | ||
284 | * Look in the hypercall ring for the async hypercalls: */ | ||
192 | do_async_hcalls(lg); | 285 | do_async_hcalls(lg); |
286 | |||
287 | /* If we stopped reading the hypercall ring because the Guest did a | ||
288 | * SEND_DMA to the Launcher, we want to return now. Otherwise if the | ||
289 | * Guest asked us to do a hypercall, we do it. */ | ||
193 | if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | 290 | if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { |
194 | do_hcall(lg, lg->regs); | 291 | do_hcall(lg, lg->regs); |
292 | /* The hypercall is done. */ | ||
195 | clear_hcall(lg); | 293 | clear_hcall(lg); |
196 | } | 294 | } |
197 | } | 295 | } |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index b2647974e1a7..3d9830322646 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -14,100 +14,147 @@ | |||
14 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
15 | #include "lg.h" | 15 | #include "lg.h" |
16 | 16 | ||
17 | /* The address of the interrupt handler is split into two parts: */ | ||
17 | static unsigned long idt_address(u32 lo, u32 hi) | 18 | static unsigned long idt_address(u32 lo, u32 hi) |
18 | { | 19 | { |
19 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); | 20 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); |
20 | } | 21 | } |
21 | 22 | ||
23 | /* The "type" of the interrupt handler is a 4 bit field: we only support a | ||
24 | * couple of types. */ | ||
22 | static int idt_type(u32 lo, u32 hi) | 25 | static int idt_type(u32 lo, u32 hi) |
23 | { | 26 | { |
24 | return (hi >> 8) & 0xF; | 27 | return (hi >> 8) & 0xF; |
25 | } | 28 | } |
26 | 29 | ||
30 | /* An IDT entry can't be used unless the "present" bit is set. */ | ||
27 | static int idt_present(u32 lo, u32 hi) | 31 | static int idt_present(u32 lo, u32 hi) |
28 | { | 32 | { |
29 | return (hi & 0x8000); | 33 | return (hi & 0x8000); |
30 | } | 34 | } |
31 | 35 | ||
36 | /* We need a helper to "push" a value onto the Guest's stack, since that's a | ||
37 | * big part of what delivering an interrupt does. */ | ||
32 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | 38 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) |
33 | { | 39 | { |
40 | /* The stack grows downwards (towards lower addresses): move the stack pointer, then write the value. */ | ||
34 | *gstack -= 4; | 41 | *gstack -= 4; |
35 | lgwrite_u32(lg, *gstack, val); | 42 | lgwrite_u32(lg, *gstack, val); |
36 | } | 43 | } |
37 | 44 | ||
45 | /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or | ||
46 | * trap. The mechanics of delivering traps and interrupts to the Guest are the | ||
47 | * same, except some traps have an "error code" which gets pushed onto the | ||
48 | * stack as well: the caller tells us if this is one. | ||
49 | * | ||
50 | * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this | ||
51 | * interrupt or trap. It's split into two parts for traditional reasons: gcc | ||
52 | * on i386 used to be frightened by 64 bit numbers. | ||
53 | * | ||
54 | * We set up the stack just like the CPU does for a real interrupt, so it's | ||
55 | * identical for the Guest (and the standard "iret" instruction will undo | ||
56 | * it). */ | ||
38 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | 57 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) |
39 | { | 58 | { |
40 | unsigned long gstack; | 59 | unsigned long gstack; |
41 | u32 eflags, ss, irq_enable; | 60 | u32 eflags, ss, irq_enable; |
42 | 61 | ||
43 | /* If they want a ring change, we use new stack and push old ss/esp */ | 62 | /* There are two cases for interrupts: one where the Guest is already |
63 | * in the kernel, and a more complex one where the Guest is in | ||
64 | * userspace. We check the privilege level to find out. */ | ||
44 | if ((lg->regs->ss&0x3) != GUEST_PL) { | 65 | if ((lg->regs->ss&0x3) != GUEST_PL) { |
66 | /* The Guest told us their kernel stack with the SET_STACK | ||
67 | * hypercall: both the virtual address and the segment */ | ||
45 | gstack = guest_pa(lg, lg->esp1); | 68 | gstack = guest_pa(lg, lg->esp1); |
46 | ss = lg->ss1; | 69 | ss = lg->ss1; |
70 | /* We push the old stack segment and pointer onto the new | ||
71 | * stack: when the Guest does an "iret" back from the interrupt | ||
72 | * handler the CPU will notice they're dropping privilege | ||
73 | * levels and expect these here. */ | ||
47 | push_guest_stack(lg, &gstack, lg->regs->ss); | 74 | push_guest_stack(lg, &gstack, lg->regs->ss); |
48 | push_guest_stack(lg, &gstack, lg->regs->esp); | 75 | push_guest_stack(lg, &gstack, lg->regs->esp); |
49 | } else { | 76 | } else { |
77 | /* We're staying on the same Guest (kernel) stack. */ | ||
50 | gstack = guest_pa(lg, lg->regs->esp); | 78 | gstack = guest_pa(lg, lg->regs->esp); |
51 | ss = lg->regs->ss; | 79 | ss = lg->regs->ss; |
52 | } | 80 | } |
53 | 81 | ||
54 | /* We use IF bit in eflags to indicate whether irqs were enabled | 82 | /* Remember that we never let the Guest actually disable interrupts, so |
55 | (it's always 1, since irqs are enabled when guest is running). */ | 83 | * the "Interrupt Flag" bit is always set. We copy that bit from the |
84 | * Guest's "irq_enabled" field into the eflags word: the Guest copies | ||
85 | * it back in "lguest_iret". */ | ||
56 | eflags = lg->regs->eflags; | 86 | eflags = lg->regs->eflags; |
57 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 | 87 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 |
58 | && !(irq_enable & X86_EFLAGS_IF)) | 88 | && !(irq_enable & X86_EFLAGS_IF)) |
59 | eflags &= ~X86_EFLAGS_IF; | 89 | eflags &= ~X86_EFLAGS_IF; |
60 | 90 | ||
91 | /* An interrupt is expected to push three things on the stack: the old | ||
92 | * "eflags" word, the old code segment, and the old instruction | ||
93 | * pointer. */ | ||
61 | push_guest_stack(lg, &gstack, eflags); | 94 | push_guest_stack(lg, &gstack, eflags); |
62 | push_guest_stack(lg, &gstack, lg->regs->cs); | 95 | push_guest_stack(lg, &gstack, lg->regs->cs); |
63 | push_guest_stack(lg, &gstack, lg->regs->eip); | 96 | push_guest_stack(lg, &gstack, lg->regs->eip); |
64 | 97 | ||
98 | /* For the seven traps which supply an error code, we push that, too. */ | ||
65 | if (has_err) | 99 | if (has_err) |
66 | push_guest_stack(lg, &gstack, lg->regs->errcode); | 100 | push_guest_stack(lg, &gstack, lg->regs->errcode); |
67 | 101 | ||
68 | /* Change the real stack so switcher returns to trap handler */ | 102 | /* Now we've pushed all the old state, we change the stack, the code |
103 | * segment and the address to execute. */ | ||
69 | lg->regs->ss = ss; | 104 | lg->regs->ss = ss; |
70 | lg->regs->esp = gstack + lg->page_offset; | 105 | lg->regs->esp = gstack + lg->page_offset; |
71 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); | 106 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); |
72 | lg->regs->eip = idt_address(lo, hi); | 107 | lg->regs->eip = idt_address(lo, hi); |
73 | 108 | ||
74 | /* Disable interrupts for an interrupt gate. */ | 109 | /* There are two kinds of interrupt handlers: 0xE is an "interrupt |
110 | * gate" which expects interrupts to be disabled on entry. */ | ||
75 | if (idt_type(lo, hi) == 0xE) | 111 | if (idt_type(lo, hi) == 0xE) |
76 | if (put_user(0, &lg->lguest_data->irq_enabled)) | 112 | if (put_user(0, &lg->lguest_data->irq_enabled)) |
77 | kill_guest(lg, "Disabling interrupts"); | 113 | kill_guest(lg, "Disabling interrupts"); |
78 | } | 114 | } |
79 | 115 | ||
116 | /*H:200 | ||
117 | * Virtual Interrupts. | ||
118 | * | ||
119 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if | ||
120 | * we should divert the Guest to running an interrupt handler. */ | ||
80 | void maybe_do_interrupt(struct lguest *lg) | 121 | void maybe_do_interrupt(struct lguest *lg) |
81 | { | 122 | { |
82 | unsigned int irq; | 123 | unsigned int irq; |
83 | DECLARE_BITMAP(blk, LGUEST_IRQS); | 124 | DECLARE_BITMAP(blk, LGUEST_IRQS); |
84 | struct desc_struct *idt; | 125 | struct desc_struct *idt; |
85 | 126 | ||
127 | /* If the Guest hasn't even initialized yet, we can do nothing. */ | ||
86 | if (!lg->lguest_data) | 128 | if (!lg->lguest_data) |
87 | return; | 129 | return; |
88 | 130 | ||
89 | /* Mask out any interrupts they have blocked. */ | 131 | /* Take our "irqs_pending" array and remove any interrupts the Guest |
132 | * wants blocked: the result ends up in "blk". */ | ||
90 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, | 133 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, |
91 | sizeof(blk))) | 134 | sizeof(blk))) |
92 | return; | 135 | return; |
93 | 136 | ||
94 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); | 137 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); |
95 | 138 | ||
139 | /* Find the first interrupt. */ | ||
96 | irq = find_first_bit(blk, LGUEST_IRQS); | 140 | irq = find_first_bit(blk, LGUEST_IRQS); |
141 | /* None? Nothing to do */ | ||
97 | if (irq >= LGUEST_IRQS) | 142 | if (irq >= LGUEST_IRQS) |
98 | return; | 143 | return; |
99 | 144 | ||
145 | /* They may be in the middle of an iret, where they asked us never to | ||
146 | * deliver interrupts. */ | ||
100 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) | 147 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) |
101 | return; | 148 | return; |
102 | 149 | ||
103 | /* If they're halted, we re-enable interrupts. */ | 150 | /* If they're halted, interrupts restart them. */ |
104 | if (lg->halted) { | 151 | if (lg->halted) { |
105 | /* Re-enable interrupts. */ | 152 | /* Re-enable interrupts. */ |
106 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) | 153 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) |
107 | kill_guest(lg, "Re-enabling interrupts"); | 154 | kill_guest(lg, "Re-enabling interrupts"); |
108 | lg->halted = 0; | 155 | lg->halted = 0; |
109 | } else { | 156 | } else { |
110 | /* Maybe they have interrupts disabled? */ | 157 | /* Otherwise we check if they have interrupts disabled. */ |
111 | u32 irq_enabled; | 158 | u32 irq_enabled; |
112 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) | 159 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) |
113 | irq_enabled = 0; | 160 | irq_enabled = 0; |
@@ -115,112 +162,197 @@ void maybe_do_interrupt(struct lguest *lg) | |||
115 | return; | 162 | return; |
116 | } | 163 | } |
117 | 164 | ||
165 | /* Look at the IDT entry the Guest gave us for this interrupt. The | ||
166 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip | ||
167 | * over them. */ | ||
118 | idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; | 168 | idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; |
169 | /* If they don't have a handler (yet?), we just ignore it */ | ||
119 | if (idt_present(idt->a, idt->b)) { | 170 | if (idt_present(idt->a, idt->b)) { |
171 | /* OK, mark it no longer pending and deliver it. */ | ||
120 | clear_bit(irq, lg->irqs_pending); | 172 | clear_bit(irq, lg->irqs_pending); |
173 | /* set_guest_interrupt() takes the interrupt descriptor and a | ||
174 | * flag to say whether this interrupt pushes an error code onto | ||
175 | * the stack as well: virtual interrupts never do. */ | ||
121 | set_guest_interrupt(lg, idt->a, idt->b, 0); | 176 | set_guest_interrupt(lg, idt->a, idt->b, 0); |
122 | } | 177 | } |
123 | } | 178 | } |
124 | 179 | ||
180 | /*H:220 Now we've got the routines to deliver interrupts, delivering traps | ||
181 | * like page fault is easy. The only trick is that Intel decided that some | ||
182 | * traps should have error codes: */ | ||
125 | static int has_err(unsigned int trap) | 183 | static int has_err(unsigned int trap) |
126 | { | 184 | { |
127 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); | 185 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); |
128 | } | 186 | } |
129 | 187 | ||
188 | /* deliver_trap() returns true if it could deliver the trap. */ | ||
130 | int deliver_trap(struct lguest *lg, unsigned int num) | 189 | int deliver_trap(struct lguest *lg, unsigned int num) |
131 | { | 190 | { |
132 | u32 lo = lg->idt[num].a, hi = lg->idt[num].b; | 191 | u32 lo = lg->idt[num].a, hi = lg->idt[num].b; |
133 | 192 | ||
193 | /* Early on the Guest hasn't set the IDT entries (or maybe it put a | ||
194 | * bogus one in): if we fail here, the Guest will be killed. */ | ||
134 | if (!idt_present(lo, hi)) | 195 | if (!idt_present(lo, hi)) |
135 | return 0; | 196 | return 0; |
136 | set_guest_interrupt(lg, lo, hi, has_err(num)); | 197 | set_guest_interrupt(lg, lo, hi, has_err(num)); |
137 | return 1; | 198 | return 1; |
138 | } | 199 | } |
139 | 200 | ||
201 | /*H:250 Here's the hard part: returning to the Host every time a trap happens | ||
202 | * and then calling deliver_trap() and re-entering the Guest is slow. | ||
203 | * Particularly because Guest userspace system calls are traps (trap 128). | ||
204 | * | ||
205 | * So we'd like to set up the IDT to tell the CPU to deliver traps directly | ||
206 | * into the Guest. This is possible, but the complexities cause the size of | ||
207 | * this file to double! However, 150 lines of code is worth writing for taking | ||
208 | * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all | ||
209 | * the other hypervisors would tease it. | ||
210 | * | ||
211 | * This routine determines if a trap can be delivered directly. */ | ||
140 | static int direct_trap(const struct lguest *lg, | 212 | static int direct_trap(const struct lguest *lg, |
141 | const struct desc_struct *trap, | 213 | const struct desc_struct *trap, |
142 | unsigned int num) | 214 | unsigned int num) |
143 | { | 215 | { |
144 | /* Hardware interrupts don't go to guest (except syscall). */ | 216 | /* Hardware interrupts don't go to the Guest at all (except system |
217 | * call). */ | ||
145 | if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) | 218 | if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) |
146 | return 0; | 219 | return 0; |
147 | 220 | ||
148 | /* We intercept page fault (demand shadow paging & cr2 saving) | 221 | /* The Host needs to see page faults (for shadow paging and to save the |
149 | protection fault (in/out emulation) and device not | 222 | * fault address), general protection faults (in/out emulation) and |
150 | available (TS handling), and hypercall */ | 223 | * device not available (TS handling), and of course, the hypercall |
224 | * trap. */ | ||
151 | if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) | 225 | if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) |
152 | return 0; | 226 | return 0; |
153 | 227 | ||
154 | /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ | 228 | /* Only trap gates (type 15) can go direct to the Guest. Interrupt |
229 | * gates (type 14) disable interrupts as they are entered, which we | ||
230 | * never let the Guest do. Not present entries (type 0x0) also can't | ||
231 | * go direct, of course 8) */ | ||
155 | return idt_type(trap->a, trap->b) == 0xF; | 232 | return idt_type(trap->a, trap->b) == 0xF; |
156 | } | 233 | } |
157 | 234 | ||
235 | /*H:260 When we make traps go directly into the Guest, we need to make sure | ||
236 | * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the | ||
237 | * CPU trying to deliver the trap will fault while trying to push the interrupt | ||
238 | * words on the stack: this is called a double fault, and it forces us to kill | ||
239 | * the Guest. | ||
240 | * | ||
241 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */ | ||
158 | void pin_stack_pages(struct lguest *lg) | 242 | void pin_stack_pages(struct lguest *lg) |
159 | { | 243 | { |
160 | unsigned int i; | 244 | unsigned int i; |
161 | 245 | ||
246 | /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or | ||
247 | * two pages of stack space. */ | ||
162 | for (i = 0; i < lg->stack_pages; i++) | 248 | for (i = 0; i < lg->stack_pages; i++) |
249 | /* The stack grows downwards (towards lower addresses), hence the subtraction */ | ||
163 | pin_page(lg, lg->esp1 - i * PAGE_SIZE); | 250 | pin_page(lg, lg->esp1 - i * PAGE_SIZE); |
164 | } | 251 | } |
165 | 252 | ||
253 | /* Direct traps also mean that we need to know whenever the Guest wants to use | ||
254 | * a different kernel stack, so we can change the IDT entries to use that | ||
255 | * stack. The IDT entries expect a virtual address, so unlike most addresses | ||
256 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not | ||
257 | * physical. | ||
258 | * | ||
259 | * In Linux each process has its own kernel stack, so this happens a lot: we | ||
260 | * change stacks on each context switch. */ | ||
166 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) | 261 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) |
167 | { | 262 | { |
168 | /* You cannot have a stack segment with priv level 0. */ | 263 | /* You are not allowed to have a stack segment with privilege level 0: bad |
264 | * Guest! */ | ||
169 | if ((seg & 0x3) != GUEST_PL) | 265 | if ((seg & 0x3) != GUEST_PL) |
170 | kill_guest(lg, "bad stack segment %i", seg); | 266 | kill_guest(lg, "bad stack segment %i", seg); |
267 | /* We only expect one or two stack pages. */ | ||
171 | if (pages > 2) | 268 | if (pages > 2) |
172 | kill_guest(lg, "bad stack pages %u", pages); | 269 | kill_guest(lg, "bad stack pages %u", pages); |
270 | /* Save where the stack is, and how many pages */ | ||
173 | lg->ss1 = seg; | 271 | lg->ss1 = seg; |
174 | lg->esp1 = esp; | 272 | lg->esp1 = esp; |
175 | lg->stack_pages = pages; | 273 | lg->stack_pages = pages; |
274 | /* Make sure the new stack pages are mapped */ | ||
176 | pin_stack_pages(lg); | 275 | pin_stack_pages(lg); |
177 | } | 276 | } |
178 | 277 | ||
179 | /* Set up trap in IDT. */ | 278 | /* All this reference to mapping stacks leads us neatly into the other complex |
279 | * part of the Host: page table handling. */ | ||
280 | |||
281 | /*H:235 This is the routine which actually checks the Guest's IDT entry and | ||
282 | * transfers it into our entry in "struct lguest": */ | ||
180 | static void set_trap(struct lguest *lg, struct desc_struct *trap, | 283 | static void set_trap(struct lguest *lg, struct desc_struct *trap, |
181 | unsigned int num, u32 lo, u32 hi) | 284 | unsigned int num, u32 lo, u32 hi) |
182 | { | 285 | { |
183 | u8 type = idt_type(lo, hi); | 286 | u8 type = idt_type(lo, hi); |
184 | 287 | ||
288 | /* We zero-out a not-present entry */ | ||
185 | if (!idt_present(lo, hi)) { | 289 | if (!idt_present(lo, hi)) { |
186 | trap->a = trap->b = 0; | 290 | trap->a = trap->b = 0; |
187 | return; | 291 | return; |
188 | } | 292 | } |
189 | 293 | ||
294 | /* We only support interrupt and trap gates. */ | ||
190 | if (type != 0xE && type != 0xF) | 295 | if (type != 0xE && type != 0xF) |
191 | kill_guest(lg, "bad IDT type %i", type); | 296 | kill_guest(lg, "bad IDT type %i", type); |
192 | 297 | ||
298 | /* We only copy the handler address, present bit, privilege level and | ||
299 | * type. The privilege level controls where the trap can be triggered | ||
300 | * manually with an "int" instruction. This is usually GUEST_PL, | ||
301 | * except for system calls which userspace can use. */ | ||
193 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); | 302 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); |
194 | trap->b = (hi&0xFFFFEF00); | 303 | trap->b = (hi&0xFFFFEF00); |
195 | } | 304 | } |
196 | 305 | ||
306 | /*H:230 While we're here, dealing with delivering traps and interrupts to the | ||
307 | * Guest, we might as well complete the picture: how the Guest tells us where | ||
308 | * it wants them to go. This would be simple, except making traps fast | ||
309 | * requires some tricks. | ||
310 | * | ||
311 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the | ||
312 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ | ||
197 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) | 313 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) |
198 | { | 314 | { |
199 | /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */ | 315 | /* Guest never handles: NMI, doublefault, spurious interrupt or |
316 | * hypercall. We ignore when it tries to set them. */ | ||
200 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) | 317 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) |
201 | return; | 318 | return; |
202 | 319 | ||
320 | /* Mark the IDT as changed: next time the Guest runs we'll know we have | ||
321 | * to copy this again. */ | ||
203 | lg->changed |= CHANGED_IDT; | 322 | lg->changed |= CHANGED_IDT; |
323 | |||
324 | /* The IDT which we keep in "struct lguest" only contains 32 entries | ||
325 | * for the traps and LGUEST_IRQS (32) entries for interrupts. We | ||
326 | * ignore attempts to set handlers for higher interrupt numbers, except | ||
327 | * for the system call "interrupt" at 128: we have a special IDT entry | ||
328 | * for that. */ | ||
204 | if (num < ARRAY_SIZE(lg->idt)) | 329 | if (num < ARRAY_SIZE(lg->idt)) |
205 | set_trap(lg, &lg->idt[num], num, lo, hi); | 330 | set_trap(lg, &lg->idt[num], num, lo, hi); |
206 | else if (num == SYSCALL_VECTOR) | 331 | else if (num == SYSCALL_VECTOR) |
207 | set_trap(lg, &lg->syscall_idt, num, lo, hi); | 332 | set_trap(lg, &lg->syscall_idt, num, lo, hi); |
208 | } | 333 | } |
209 | 334 | ||
335 | /* The default entry for each interrupt points into the Switcher routines which | ||
336 | * simply return to the Host. The run_guest() loop will then call | ||
337 | * deliver_trap() to bounce it back into the Guest. */ | ||
210 | static void default_idt_entry(struct desc_struct *idt, | 338 | static void default_idt_entry(struct desc_struct *idt, |
211 | int trap, | 339 | int trap, |
212 | const unsigned long handler) | 340 | const unsigned long handler) |
213 | { | 341 | { |
342 | /* A present interrupt gate. */ | ||
214 | u32 flags = 0x8e00; | 343 | u32 flags = 0x8e00; |
215 | 344 | ||
216 | /* They can't "int" into any of them except hypercall. */ | 345 | /* Set the privilege level on the entry for the hypercall: this allows |
346 | * the Guest to use the "int" instruction to trigger it. */ | ||
217 | if (trap == LGUEST_TRAP_ENTRY) | 347 | if (trap == LGUEST_TRAP_ENTRY) |
218 | flags |= (GUEST_PL << 13); | 348 | flags |= (GUEST_PL << 13); |
219 | 349 | ||
350 | /* Now pack it into the IDT entry in its weird format. */ | ||
220 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); | 351 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); |
221 | idt->b = (handler&0xFFFF0000) | flags; | 352 | idt->b = (handler&0xFFFF0000) | flags; |
222 | } | 353 | } |
223 | 354 | ||
355 | /* When the Guest first starts, we put default entries into the IDT. */ | ||
224 | void setup_default_idt_entries(struct lguest_ro_state *state, | 356 | void setup_default_idt_entries(struct lguest_ro_state *state, |
225 | const unsigned long *def) | 357 | const unsigned long *def) |
226 | { | 358 | { |
@@ -230,19 +362,25 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
230 | default_idt_entry(&state->guest_idt[i], i, def[i]); | 362 | default_idt_entry(&state->guest_idt[i], i, def[i]); |
231 | } | 363 | } |
232 | 364 | ||
365 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead | ||
366 | * we copy them into the IDT which we've set up for Guests on this CPU, just | ||
367 | * before we run the Guest. This routine does that copy. */ | ||
233 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | 368 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, |
234 | const unsigned long *def) | 369 | const unsigned long *def) |
235 | { | 370 | { |
236 | unsigned int i; | 371 | unsigned int i; |
237 | 372 | ||
238 | /* All hardware interrupts are same whatever the guest: only the | 373 | /* We can simply copy the direct traps, otherwise we use the default |
239 | * traps might be different. */ | 374 | * ones in the Switcher: they will return to the Host. */ |
240 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { | 375 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { |
241 | if (direct_trap(lg, &lg->idt[i], i)) | 376 | if (direct_trap(lg, &lg->idt[i], i)) |
242 | idt[i] = lg->idt[i]; | 377 | idt[i] = lg->idt[i]; |
243 | else | 378 | else |
244 | default_idt_entry(&idt[i], i, def[i]); | 379 | default_idt_entry(&idt[i], i, def[i]); |
245 | } | 380 | } |
381 | |||
382 | /* Don't forget the system call trap! The IDT entries for other | ||
383 | * interrupts never change, so no need to copy them. */ | ||
246 | i = SYSCALL_VECTOR; | 384 | i = SYSCALL_VECTOR; |
247 | if (direct_trap(lg, &lg->syscall_idt, i)) | 385 | if (direct_trap(lg, &lg->syscall_idt, i)) |
248 | idt[i] = lg->syscall_idt; | 386 | idt[i] = lg->syscall_idt; |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 3b9dc123a7df..269116eee85f 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -58,9 +58,18 @@ struct lguest_dma_info | |||
58 | u8 interrupt; /* 0 when not registered */ | 58 | u8 interrupt; /* 0 when not registered */ |
59 | }; | 59 | }; |
60 | 60 | ||
61 | /* We have separate types for the guest's ptes & pgds and the shadow ptes & | 61 | /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He |
62 | * pgds. Since this host might use three-level pagetables and the guest and | 62 | * reviewed the original code which used "u32" for all page table entries, and |
63 | * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */ | 63 | * insisted that it would be far clearer with explicit typing. I thought it |
64 | * was overkill, but he was right: it is much clearer than it was before. | ||
65 | * | ||
66 | * We have separate types for the Guest's ptes & pgds and the shadow ptes & | ||
67 | * pgds. There's already a Linux type for these (pte_t and pgd_t) but they | ||
68 | * change depending on kernel config options (PAE). */ | ||
69 | |||
70 | /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the | ||
71 | * "page frame number" (0 == first physical page, etc). They are different | ||
72 | * types so the compiler will warn us if we mix them improperly. */ | ||
64 | typedef union { | 73 | typedef union { |
65 | struct { unsigned flags:12, pfn:20; }; | 74 | struct { unsigned flags:12, pfn:20; }; |
66 | struct { unsigned long val; } raw; | 75 | struct { unsigned long val; } raw; |
@@ -77,8 +86,12 @@ typedef union { | |||
77 | struct { unsigned flags:12, pfn:20; }; | 86 | struct { unsigned flags:12, pfn:20; }; |
78 | struct { unsigned long val; } raw; | 87 | struct { unsigned long val; } raw; |
79 | } gpte_t; | 88 | } gpte_t; |
89 | |||
90 | /* We have two convenient macros to convert a "raw" value as handed to us by | ||
91 | * the Guest into the correct Guest PGD or PTE type. */ | ||
80 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) | 92 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) |
81 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) | 93 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) |
94 | /*:*/ | ||
82 | 95 | ||
83 | struct pgdir | 96 | struct pgdir |
84 | { | 97 | { |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index f9ca50d80466..cd047e81cd63 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -15,38 +15,91 @@ | |||
15 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
16 | #include "lg.h" | 16 | #include "lg.h" |
17 | 17 | ||
18 | /*H:300 | ||
19 | * The Page Table Code | ||
20 | * | ||
21 | * We use two-level page tables for the Guest. If you're not entirely | ||
22 | * comfortable with virtual addresses, physical addresses and page tables then | ||
23 | * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). | ||
24 | * | ||
25 | * The Guest keeps page tables, but we maintain the actual ones here: these are | ||
26 | * called "shadow" page tables. Which is a very Guest-centric name: these are | ||
27 | * the real page tables the CPU uses, although we keep them up to date to | ||
28 | * reflect the Guest's. (See what I mean about weird naming? Since when do | ||
29 | * shadows reflect anything?) | ||
30 | * | ||
31 | * Anyway, this is the most complicated part of the Host code. There are seven | ||
32 | * parts to this: | ||
33 | * (i) Setting up a page table entry for the Guest when it faults, | ||
34 | * (ii) Setting up the page table entry for the Guest stack, | ||
35 | * (iii) Setting up a page table entry when the Guest tells us it has changed, | ||
36 | * (iv) Switching page tables, | ||
37 | * (v) Flushing (throwing away) page tables, | ||
38 | * (vi) Mapping the Switcher when the Guest is about to run, | ||
39 | * (vii) Setting up the page tables initially. | ||
40 | :*/ | ||
41 | |||
42 | /* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024 | ||
43 | * (or 2^10) entries per page. */ | ||
18 | #define PTES_PER_PAGE_SHIFT 10 | 44 | #define PTES_PER_PAGE_SHIFT 10 |
19 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) | 45 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) |
46 | |||
47 | /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | ||
48 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | ||
49 | * page. */ | ||
20 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) | 50 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) |
21 | 51 | ||
52 | /* We actually need a separate PTE page for each CPU. Remember that after the | ||
53 | * Switcher code itself comes two pages for each CPU, and we don't want this | ||
54 | * CPU's guest to see the pages of any other CPU. */ | ||
22 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); | 55 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); |
23 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | 56 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
24 | 57 | ||
58 | /*H:320 With our shadow and Guest types established, we need to deal with | ||
59 | * them: the page table code is curly enough to need helper functions to keep | ||
60 | * it clear and clean. | ||
61 | * | ||
62 | * The first helper takes a virtual address, and says which entry in the top | ||
63 | * level page table deals with that address. Since each top level entry deals | ||
64 | * with 4M, this effectively divides by 4M. */ | ||
25 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) | 65 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) |
26 | { | 66 | { |
27 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 67 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); |
28 | } | 68 | } |
29 | 69 | ||
30 | /* These access the shadow versions (ie. the ones used by the CPU). */ | 70 | /* There are two functions which return pointers to the shadow (aka "real") |
71 | * page tables. | ||
72 | * | ||
73 | * spgd_addr() takes the virtual address and returns a pointer to the top-level | ||
74 | * page directory entry for that address. Since we keep track of several page | ||
75 | * tables, the "i" argument tells us which one we're interested in (it's | ||
76 | * usually the current one). */ | ||
31 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 77 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) |
32 | { | 78 | { |
33 | unsigned int index = vaddr_to_pgd_index(vaddr); | 79 | unsigned int index = vaddr_to_pgd_index(vaddr); |
34 | 80 | ||
81 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
35 | if (index >= SWITCHER_PGD_INDEX) { | 82 | if (index >= SWITCHER_PGD_INDEX) { |
36 | kill_guest(lg, "attempt to access switcher pages"); | 83 | kill_guest(lg, "attempt to access switcher pages"); |
37 | index = 0; | 84 | index = 0; |
38 | } | 85 | } |
86 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ | ||
39 | return &lg->pgdirs[i].pgdir[index]; | 87 | return &lg->pgdirs[i].pgdir[index]; |
40 | } | 88 | } |
41 | 89 | ||
90 | /* This routine then takes the PGD entry given above, which contains the | ||
91 | * address of the PTE page. It then returns a pointer to the PTE entry for the | ||
92 | * given address. */ | ||
42 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) | 93 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) |
43 | { | 94 | { |
44 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); | 95 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); |
96 | /* You should never call this if the PGD entry wasn't valid */ | ||
45 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); | 97 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); |
46 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; | 98 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; |
47 | } | 99 | } |
48 | 100 | ||
49 | /* These access the guest versions. */ | 101 | /* These two functions are just like the above two, except they access the Guest |
102 | * page tables. Hence they return a Guest address. */ | ||
50 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | 103 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) |
51 | { | 104 | { |
52 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 105 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); |
@@ -61,12 +114,24 @@ static unsigned long gpte_addr(struct lguest *lg, | |||
61 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); | 114 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); |
62 | } | 115 | } |
63 | 116 | ||
64 | /* Do a virtual -> physical mapping on a user page. */ | 117 | /*H:350 This routine takes a page number given by the Guest and converts it to |
118 | * an actual, physical page number. It can fail for several reasons: the | ||
119 | * virtual address might not be mapped by the Launcher, the write flag is set | ||
120 | * and the page is read-only, or the write flag was set and the page was | ||
121 | * shared so had to be copied, but we ran out of memory. | ||
122 | * | ||
123 | * This holds a reference to the page, so release_pte() is careful to | ||
124 | * put that back. */ | ||
65 | static unsigned long get_pfn(unsigned long virtpfn, int write) | 125 | static unsigned long get_pfn(unsigned long virtpfn, int write) |
66 | { | 126 | { |
67 | struct page *page; | 127 | struct page *page; |
128 | /* This value indicates failure. */ | ||
68 | unsigned long ret = -1UL; | 129 | unsigned long ret = -1UL; |
69 | 130 | ||
131 | /* get_user_pages() is a complex interface: it gets the "struct | ||
132 | * vm_area_struct" and "struct page" associated with a range of pages. | ||
133 | * It also needs the task's mmap_sem held, and is not very quick. | ||
134 | * It returns the number of pages it got. */ | ||
70 | down_read(&current->mm->mmap_sem); | 135 | down_read(&current->mm->mmap_sem); |
71 | if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, | 136 | if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, |
72 | 1, write, 1, &page, NULL) == 1) | 137 | 1, write, 1, &page, NULL) == 1) |
@@ -75,28 +140,47 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) | |||
75 | return ret; | 140 | return ret; |
76 | } | 141 | } |
77 | 142 | ||
143 | /*H:340 Converting a Guest page table entry to a shadow (ie. real) page table | ||
144 | * entry can be a little tricky. The flags are (almost) the same, but the | ||
145 | * Guest PTE contains a virtual page number: the CPU needs the real page | ||
146 | * number. */ | ||
78 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | 147 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) |
79 | { | 148 | { |
80 | spte_t spte; | 149 | spte_t spte; |
81 | unsigned long pfn; | 150 | unsigned long pfn; |
82 | 151 | ||
83 | /* We ignore the global flag. */ | 152 | /* The Guest sets the global flag, because it thinks that it is using |
153 | * PGE. We only told it to use PGE so it would tell us whether it was | ||
154 | * flushing a kernel mapping or a userspace mapping. We don't actually | ||
155 | * use the global bit, so throw it away. */ | ||
84 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); | 156 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); |
157 | |||
158 | /* We need a temporary "unsigned long" variable to hold the answer from | ||
159 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't | ||
160 | * fit in spte.pfn. get_pfn() finds the real physical number of the | ||
161 | * page, given the virtual number. */ | ||
85 | pfn = get_pfn(gpte.pfn, write); | 162 | pfn = get_pfn(gpte.pfn, write); |
86 | if (pfn == -1UL) { | 163 | if (pfn == -1UL) { |
87 | kill_guest(lg, "failed to get page %u", gpte.pfn); | 164 | kill_guest(lg, "failed to get page %u", gpte.pfn); |
88 | /* Must not put_page() bogus page on cleanup. */ | 165 | /* When we destroy the Guest, we'll go through the shadow page |
166 | * tables and release_pte() them. Make sure we don't think | ||
167 | * this one is valid! */ | ||
89 | spte.flags = 0; | 168 | spte.flags = 0; |
90 | } | 169 | } |
170 | /* Now we assign the page number, and our shadow PTE is complete. */ | ||
91 | spte.pfn = pfn; | 171 | spte.pfn = pfn; |
92 | return spte; | 172 | return spte; |
93 | } | 173 | } |
94 | 174 | ||
175 | /*H:460 And to complete the chain, release_pte() looks like this: */ | ||
95 | static void release_pte(spte_t pte) | 176 | static void release_pte(spte_t pte) |
96 | { | 177 | { |
178 | /* Remember that get_user_pages() took a reference to the page, in | ||
179 | * get_pfn()? We have to put it back now. */ | ||
97 | if (pte.flags & _PAGE_PRESENT) | 180 | if (pte.flags & _PAGE_PRESENT) |
98 | put_page(pfn_to_page(pte.pfn)); | 181 | put_page(pfn_to_page(pte.pfn)); |
99 | } | 182 | } |
183 | /*:*/ | ||
100 | 184 | ||
101 | static void check_gpte(struct lguest *lg, gpte_t gpte) | 185 | static void check_gpte(struct lguest *lg, gpte_t gpte) |
102 | { | 186 | { |
@@ -110,11 +194,16 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | |||
110 | kill_guest(lg, "bad page directory entry"); | 194 | kill_guest(lg, "bad page directory entry"); |
111 | } | 195 | } |
112 | 196 | ||
113 | /* FIXME: We hold reference to pages, which prevents them from being | 197 | /*H:330 |
114 | swapped. It'd be nice to have a callback when Linux wants to swap out. */ | 198 | * (i) Setting up a page table entry for the Guest when it faults |
115 | 199 | * | |
116 | /* We fault pages in, which allows us to update accessed/dirty bits. | 200 | * We saw this call in run_guest(): when we see a page fault in the Guest, we |
117 | * Return true if we got page. */ | 201 | * come here. That's because we only set up the shadow page tables lazily as |
202 | * they're needed, so we get page faults all the time and quietly fix them up | ||
203 | * and return to the Guest without it knowing. | ||
204 | * | ||
205 | * If we fixed up the fault (ie. we mapped the address), this routine returns | ||
206 | * true. */ | ||
118 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 207 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) |
119 | { | 208 | { |
120 | gpgd_t gpgd; | 209 | gpgd_t gpgd; |
@@ -123,106 +212,161 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
123 | gpte_t gpte; | 212 | gpte_t gpte; |
124 | spte_t *spte; | 213 | spte_t *spte; |
125 | 214 | ||
215 | /* First step: get the top-level Guest page table entry. */ | ||
126 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | 216 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); |
217 | /* Toplevel not present? We can't map it in. */ | ||
127 | if (!(gpgd.flags & _PAGE_PRESENT)) | 218 | if (!(gpgd.flags & _PAGE_PRESENT)) |
128 | return 0; | 219 | return 0; |
129 | 220 | ||
221 | /* Now look at the matching shadow entry. */ | ||
130 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 222 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
131 | if (!(spgd->flags & _PAGE_PRESENT)) { | 223 | if (!(spgd->flags & _PAGE_PRESENT)) { |
132 | /* Get a page of PTEs for them. */ | 224 | /* No shadow entry: allocate a new shadow PTE page. */ |
133 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 225 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); |
134 | /* FIXME: Steal from self in this case? */ | 226 | /* This is not really the Guest's fault, but killing it is |
227 | * simple for this corner case. */ | ||
135 | if (!ptepage) { | 228 | if (!ptepage) { |
136 | kill_guest(lg, "out of memory allocating pte page"); | 229 | kill_guest(lg, "out of memory allocating pte page"); |
137 | return 0; | 230 | return 0; |
138 | } | 231 | } |
232 | /* We check that the Guest pgd is OK. */ | ||
139 | check_gpgd(lg, gpgd); | 233 | check_gpgd(lg, gpgd); |
234 | /* And we copy the flags to the shadow PGD entry. The page | ||
235 | * number in the shadow PGD is the page we just allocated. */ | ||
140 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); | 236 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); |
141 | } | 237 | } |
142 | 238 | ||
239 | /* OK, now we look at the lower level in the Guest page table: keep its | ||
240 | * address, because we might update it later. */ | ||
143 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | 241 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); |
144 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); | 242 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); |
145 | 243 | ||
146 | /* No page? */ | 244 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
147 | if (!(gpte.flags & _PAGE_PRESENT)) | 245 | if (!(gpte.flags & _PAGE_PRESENT)) |
148 | return 0; | 246 | return 0; |
149 | 247 | ||
150 | /* Write to read-only page? */ | 248 | /* Check they're not trying to write to a page the Guest wants |
249 | * read-only (bit 2 of errcode == write). */ | ||
151 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) | 250 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) |
152 | return 0; | 251 | return 0; |
153 | 252 | ||
154 | /* User access to a non-user page? */ | 253 | /* User access to a kernel page? (bit 3 == user access) */ |
155 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) | 254 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) |
156 | return 0; | 255 | return 0; |
157 | 256 | ||
257 | /* Check that the Guest PTE flags are OK, and the page number is below | ||
258 | * the pfn_limit (ie. not mapping the Launcher binary). */ | ||
158 | check_gpte(lg, gpte); | 259 | check_gpte(lg, gpte); |
260 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | ||
159 | gpte.flags |= _PAGE_ACCESSED; | 261 | gpte.flags |= _PAGE_ACCESSED; |
160 | if (errcode & 2) | 262 | if (errcode & 2) |
161 | gpte.flags |= _PAGE_DIRTY; | 263 | gpte.flags |= _PAGE_DIRTY; |
162 | 264 | ||
163 | /* We're done with the old pte. */ | 265 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
164 | spte = spte_addr(lg, *spgd, vaddr); | 266 | spte = spte_addr(lg, *spgd, vaddr); |
267 | /* If there was a valid shadow PTE entry here before, we release it. | ||
268 | * This can happen with a write to a previously read-only entry. */ | ||
165 | release_pte(*spte); | 269 | release_pte(*spte); |
166 | 270 | ||
167 | /* We don't make it writable if this isn't a write: later | 271 | /* If this is a write, we insist that the Guest page is writable (the |
168 | * write will fault so we can set dirty bit in guest. */ | 272 | * final arg to gpte_to_spte()). */ |
169 | if (gpte.flags & _PAGE_DIRTY) | 273 | if (gpte.flags & _PAGE_DIRTY) |
170 | *spte = gpte_to_spte(lg, gpte, 1); | 274 | *spte = gpte_to_spte(lg, gpte, 1); |
171 | else { | 275 | else { |
276 | /* If this is a read, don't set the "writable" bit in the page | ||
277 | * table entry, even if the Guest says it's writable. That way | ||
278 | * we come back here when a write does actually occur, so we can | ||
279 | * update the Guest's _PAGE_DIRTY flag. */ | ||
172 | gpte_t ro_gpte = gpte; | 280 | gpte_t ro_gpte = gpte; |
173 | ro_gpte.flags &= ~_PAGE_RW; | 281 | ro_gpte.flags &= ~_PAGE_RW; |
174 | *spte = gpte_to_spte(lg, ro_gpte, 0); | 282 | *spte = gpte_to_spte(lg, ro_gpte, 0); |
175 | } | 283 | } |
176 | 284 | ||
177 | /* Now we update dirty/accessed on guest. */ | 285 | /* Finally, we write the Guest PTE entry back: we've set the |
286 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | ||
178 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); | 287 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); |
288 | |||
289 | /* We succeeded in mapping the page! */ | ||
179 | return 1; | 290 | return 1; |
180 | } | 291 | } |
181 | 292 | ||
182 | /* This is much faster than the full demand_page logic. */ | 293 | /*H:360 (ii) Setting up the page table entry for the Guest stack. |
294 | * | ||
295 | * Remember pin_stack_pages() which makes sure the stack is mapped? It could | ||
296 | * simply call demand_page(), but as we've seen that logic is quite long, and | ||
297 | * usually the stack pages are already mapped anyway, so it's not required. | ||
298 | * | ||
299 | * This is a quick version which answers the question: is this virtual address | ||
300 | * mapped by the shadow page tables, and is it writable? */ | ||
183 | static int page_writable(struct lguest *lg, unsigned long vaddr) | 301 | static int page_writable(struct lguest *lg, unsigned long vaddr) |
184 | { | 302 | { |
185 | spgd_t *spgd; | 303 | spgd_t *spgd; |
186 | unsigned long flags; | 304 | unsigned long flags; |
187 | 305 | ||
306 | /* Look at the top level entry: is it present? */ | ||
188 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 307 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
189 | if (!(spgd->flags & _PAGE_PRESENT)) | 308 | if (!(spgd->flags & _PAGE_PRESENT)) |
190 | return 0; | 309 | return 0; |
191 | 310 | ||
311 | /* Check the flags on the pte entry itself: it must be present and | ||
312 | * writable. */ | ||
192 | flags = spte_addr(lg, *spgd, vaddr)->flags; | 313 | flags = spte_addr(lg, *spgd, vaddr)->flags; |
193 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 314 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
194 | } | 315 | } |
195 | 316 | ||
317 | /* So, when pin_stack_pages() asks us to pin a page, we check if it's already | ||
318 | * in the page tables, and if not, we call demand_page() with error code 2 | ||
319 | * (meaning "write"). */ | ||
196 | void pin_page(struct lguest *lg, unsigned long vaddr) | 320 | void pin_page(struct lguest *lg, unsigned long vaddr) |
197 | { | 321 | { |
198 | if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) | 322 | if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) |
199 | kill_guest(lg, "bad stack page %#lx", vaddr); | 323 | kill_guest(lg, "bad stack page %#lx", vaddr); |
200 | } | 324 | } |
201 | 325 | ||
326 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | ||
202 | static void release_pgd(struct lguest *lg, spgd_t *spgd) | 327 | static void release_pgd(struct lguest *lg, spgd_t *spgd) |
203 | { | 328 | { |
329 | /* If the entry's not present, there's nothing to release. */ | ||
204 | if (spgd->flags & _PAGE_PRESENT) { | 330 | if (spgd->flags & _PAGE_PRESENT) { |
205 | unsigned int i; | 331 | unsigned int i; |
332 | /* Converting the pfn to find the actual PTE page is easy: turn | ||
333 | * the page number into a physical address, then convert to a | ||
334 | * virtual address (easy for kernel pages like this one). */ | ||
206 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); | 335 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); |
336 | /* For each entry in the page, we might need to release it. */ | ||
207 | for (i = 0; i < PTES_PER_PAGE; i++) | 337 | for (i = 0; i < PTES_PER_PAGE; i++) |
208 | release_pte(ptepage[i]); | 338 | release_pte(ptepage[i]); |
339 | /* Now we can free the page of PTEs */ | ||
209 | free_page((long)ptepage); | 340 | free_page((long)ptepage); |
341 | /* And zero out the PGD entry so we never release it twice. */ | ||
210 | spgd->raw.val = 0; | 342 | spgd->raw.val = 0; |
211 | } | 343 | } |
212 | } | 344 | } |
213 | 345 | ||
346 | /*H:440 (v) Flushing (throwing away) page tables, | ||
347 | * | ||
348 | * We saw flush_user_mappings() called when we re-used a top-level pgdir page. | ||
349 | * It simply releases every PTE page from 0 up to the kernel address. */ | ||
214 | static void flush_user_mappings(struct lguest *lg, int idx) | 350 | static void flush_user_mappings(struct lguest *lg, int idx) |
215 | { | 351 | { |
216 | unsigned int i; | 352 | unsigned int i; |
353 | /* Release every pgd entry up to the kernel's address. */ | ||
217 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) | 354 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) |
218 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 355 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
219 | } | 356 | } |
220 | 357 | ||
358 | /* The Guest also has a hypercall to do this manually: it's used when a large | ||
359 | * number of mappings have been changed. */ | ||
221 | void guest_pagetable_flush_user(struct lguest *lg) | 360 | void guest_pagetable_flush_user(struct lguest *lg) |
222 | { | 361 | { |
362 | /* Drop the userspace part of the current page table. */ | ||
223 | flush_user_mappings(lg, lg->pgdidx); | 363 | flush_user_mappings(lg, lg->pgdidx); |
224 | } | 364 | } |
365 | /*:*/ | ||
225 | 366 | ||
367 | /* We keep several page tables. This is a simple routine to find the page | ||
368 | * table (if any) corresponding to this top-level address the Guest has given | ||
369 | * us. */ | ||
226 | static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | 370 | static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) |
227 | { | 371 | { |
228 | unsigned int i; | 372 | unsigned int i; |
@@ -232,21 +376,30 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | |||
232 | return i; | 376 | return i; |
233 | } | 377 | } |
234 | 378 | ||
379 | /*H:435 And this is us, creating the new page directory. If we really do | ||
380 | * allocate a new one (and so the kernel parts are not there), we set | ||
381 | * blank_pgdir. */ | ||
235 | static unsigned int new_pgdir(struct lguest *lg, | 382 | static unsigned int new_pgdir(struct lguest *lg, |
236 | unsigned long cr3, | 383 | unsigned long cr3, |
237 | int *blank_pgdir) | 384 | int *blank_pgdir) |
238 | { | 385 | { |
239 | unsigned int next; | 386 | unsigned int next; |
240 | 387 | ||
388 | /* We pick one entry at random to throw out. Choosing the Least | ||
389 | * Recently Used might be better, but this is easy. */ | ||
241 | next = random32() % ARRAY_SIZE(lg->pgdirs); | 390 | next = random32() % ARRAY_SIZE(lg->pgdirs); |
391 | /* If it's never been allocated at all before, try now. */ | ||
242 | if (!lg->pgdirs[next].pgdir) { | 392 | if (!lg->pgdirs[next].pgdir) { |
243 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); | 393 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); |
394 | /* If the allocation fails, just keep using the one we have */ | ||
244 | if (!lg->pgdirs[next].pgdir) | 395 | if (!lg->pgdirs[next].pgdir) |
245 | next = lg->pgdidx; | 396 | next = lg->pgdidx; |
246 | else | 397 | else |
247 | /* There are no mappings: you'll need to re-pin */ | 398 | /* This is a blank page, so there are no kernel |
399 | * mappings: caller must map the stack! */ | ||
248 | *blank_pgdir = 1; | 400 | *blank_pgdir = 1; |
249 | } | 401 | } |
402 | /* Record which Guest toplevel this shadows. */ | ||
250 | lg->pgdirs[next].cr3 = cr3; | 403 | lg->pgdirs[next].cr3 = cr3; |
251 | /* Release all the non-kernel mappings. */ | 404 | /* Release all the non-kernel mappings. */ |
252 | flush_user_mappings(lg, next); | 405 | flush_user_mappings(lg, next); |
@@ -254,82 +407,161 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
254 | return next; | 407 | return next; |
255 | } | 408 | } |
256 | 409 | ||
410 | /*H:430 (iv) Switching page tables | ||
411 | * | ||
412 | * This is what happens when the Guest changes page tables (ie. changes the | ||
413 | * top-level pgdir). This happens on almost every context switch. */ | ||
257 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | 414 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) |
258 | { | 415 | { |
259 | int newpgdir, repin = 0; | 416 | int newpgdir, repin = 0; |
260 | 417 | ||
418 | /* Look to see if we have this one already. */ | ||
261 | newpgdir = find_pgdir(lg, pgtable); | 419 | newpgdir = find_pgdir(lg, pgtable); |
420 | /* If not, we allocate or mug an existing one: if it's a fresh one, | ||
421 | * repin gets set to 1. */ | ||
262 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) | 422 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) |
263 | newpgdir = new_pgdir(lg, pgtable, &repin); | 423 | newpgdir = new_pgdir(lg, pgtable, &repin); |
424 | /* Change the current pgd index to the new one. */ | ||
264 | lg->pgdidx = newpgdir; | 425 | lg->pgdidx = newpgdir; |
426 | /* If it was completely blank, we map in the Guest kernel stack */ | ||
265 | if (repin) | 427 | if (repin) |
266 | pin_stack_pages(lg); | 428 | pin_stack_pages(lg); |
267 | } | 429 | } |
268 | 430 | ||
431 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all | ||
432 | * the shadow page tables. This is used when we destroy the Guest. */ | ||
269 | static void release_all_pagetables(struct lguest *lg) | 433 | static void release_all_pagetables(struct lguest *lg) |
270 | { | 434 | { |
271 | unsigned int i, j; | 435 | unsigned int i, j; |
272 | 436 | ||
437 | /* Every shadow pagetable this Guest has */ | ||
273 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 438 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
274 | if (lg->pgdirs[i].pgdir) | 439 | if (lg->pgdirs[i].pgdir) |
440 | /* Every PGD entry except the Switcher at the top */ | ||
275 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 441 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
276 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | 442 | release_pgd(lg, lg->pgdirs[i].pgdir + j); |
277 | } | 443 | } |
278 | 444 | ||
445 | /* We also throw away everything when a Guest tells us it's changed a kernel | ||
446 | * mapping. Since kernel mappings are in every page table, it's easiest to | ||
447 | * throw them all away. This is amazingly slow, but thankfully rare. */ | ||
279 | void guest_pagetable_clear_all(struct lguest *lg) | 448 | void guest_pagetable_clear_all(struct lguest *lg) |
280 | { | 449 | { |
281 | release_all_pagetables(lg); | 450 | release_all_pagetables(lg); |
451 | /* We need the Guest kernel stack mapped again. */ | ||
282 | pin_stack_pages(lg); | 452 | pin_stack_pages(lg); |
283 | } | 453 | } |
284 | 454 | ||
455 | /*H:420 This is the routine which actually sets the page table entry for the | ||
456 | * "idx"'th shadow page table. | ||
457 | * | ||
458 | * Normally, we can just throw out the old entry and replace it with 0: if they | ||
459 | * use it demand_page() will put the new entry in. We need to do this anyway: | ||
460 | * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page | ||
461 | * is read from, and _PAGE_DIRTY when it's written to. | ||
462 | * | ||
463 | * But Avi Kivity pointed out that most Operating Systems (Linux included) set | ||
464 | * these bits on PTEs immediately anyway. This is done to save the CPU from | ||
465 | * having to update them, but it helps us the same way: if they set | ||
466 | * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if | ||
467 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. | ||
468 | */ | ||
285 | static void do_set_pte(struct lguest *lg, int idx, | 469 | static void do_set_pte(struct lguest *lg, int idx, |
286 | unsigned long vaddr, gpte_t gpte) | 470 | unsigned long vaddr, gpte_t gpte) |
287 | { | 471 | { |
472 | /* Look up the matching shadow page directory entry. */ | ||
288 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); | 473 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); |
474 | |||
475 | /* If the top level isn't present, there's no entry to update. */ | ||
289 | if (spgd->flags & _PAGE_PRESENT) { | 476 | if (spgd->flags & _PAGE_PRESENT) { |
477 | /* Otherwise, we start by releasing the existing entry. */ | ||
290 | spte_t *spte = spte_addr(lg, *spgd, vaddr); | 478 | spte_t *spte = spte_addr(lg, *spgd, vaddr); |
291 | release_pte(*spte); | 479 | release_pte(*spte); |
480 | |||
481 | /* If they're setting this entry as dirty or accessed, we might | ||
482 | * as well put that entry they've given us in now. This shaves | ||
483 | * 10% off a copy-on-write micro-benchmark. */ | ||
292 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 484 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
293 | check_gpte(lg, gpte); | 485 | check_gpte(lg, gpte); |
294 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); | 486 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); |
295 | } else | 487 | } else |
488 | /* Otherwise we can demand_page() it in later. */ | ||
296 | spte->raw.val = 0; | 489 | spte->raw.val = 0; |
297 | } | 490 | } |
298 | } | 491 | } |
299 | 492 | ||
493 | /*H:410 Updating a PTE entry is a little trickier. | ||
494 | * | ||
495 | * We keep track of several different page tables (the Guest uses one for each | ||
496 | * process, so it makes sense to cache at least a few). Each of these have | ||
497 | * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for | ||
498 | * all processes. So when the page table above that address changes, we update | ||
499 | * all the page tables, not just the current one. This is rare. | ||
500 | * | ||
501 | * The benefit is that when we have to track a new page table, we can keep | ||
502 | * all the kernel mappings. This speeds up context switch immensely. */ | ||
300 | void guest_set_pte(struct lguest *lg, | 503 | void guest_set_pte(struct lguest *lg, |
301 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) | 504 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) |
302 | { | 505 | { |
303 | /* Kernel mappings must be changed on all top levels. */ | 506 | /* Kernel mappings must be changed on all top levels. Slow, but |
507 | * doesn't happen often. */ | ||
304 | if (vaddr >= lg->page_offset) { | 508 | if (vaddr >= lg->page_offset) { |
305 | unsigned int i; | 509 | unsigned int i; |
306 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 510 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
307 | if (lg->pgdirs[i].pgdir) | 511 | if (lg->pgdirs[i].pgdir) |
308 | do_set_pte(lg, i, vaddr, gpte); | 512 | do_set_pte(lg, i, vaddr, gpte); |
309 | } else { | 513 | } else { |
514 | /* Is this page table one we have a shadow for? */ | ||
310 | int pgdir = find_pgdir(lg, cr3); | 515 | int pgdir = find_pgdir(lg, cr3); |
311 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) | 516 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) |
517 | /* If so, do the update. */ | ||
312 | do_set_pte(lg, pgdir, vaddr, gpte); | 518 | do_set_pte(lg, pgdir, vaddr, gpte); |
313 | } | 519 | } |
314 | } | 520 | } |
315 | 521 | ||
522 | /*H:400 | ||
523 | * (iii) Setting up a page table entry when the Guest tells us it has changed. | ||
524 | * | ||
525 | * Just like we did in interrupts_and_traps.c, it makes sense for us to deal | ||
526 | * with the other side of page tables while we're here: what happens when the | ||
527 | * Guest asks for a page table to be updated? | ||
528 | * | ||
529 | * We already saw that demand_page() will fill in the shadow page tables when | ||
530 | * needed, so we can simply remove shadow page table entries whenever the Guest | ||
531 | * tells us they've changed. When the Guest tries to use the new entry it will | ||
532 | * fault and demand_page() will fix it up. | ||
533 | * | ||
534 | * So with that in mind here's our code to update a (top-level) PGD entry: | ||
535 | */ | ||
316 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | 536 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) |
317 | { | 537 | { |
318 | int pgdir; | 538 | int pgdir; |
319 | 539 | ||
540 | /* The kernel seems to try to initialize this early on: we ignore its | ||
541 | * attempts to map over the Switcher. */ | ||
320 | if (idx >= SWITCHER_PGD_INDEX) | 542 | if (idx >= SWITCHER_PGD_INDEX) |
321 | return; | 543 | return; |
322 | 544 | ||
545 | /* If they're talking about a page table we have a shadow for... */ | ||
323 | pgdir = find_pgdir(lg, cr3); | 546 | pgdir = find_pgdir(lg, cr3); |
324 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 547 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
548 | /* ... throw it away. */ | ||
325 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 549 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); |
326 | } | 550 | } |
327 | 551 | ||
552 | /*H:500 (vii) Setting up the page tables initially. | ||
553 | * | ||
554 | * When a Guest is first created, the Launcher tells us where the toplevel of | ||
555 | * its first page table is. We set some things up here: */ | ||
328 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | 556 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) |
329 | { | 557 | { |
330 | /* We assume this in flush_user_mappings, so check now */ | 558 | /* In flush_user_mappings() we loop from 0 to |
559 | * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit | ||
560 | * the Switcher mappings, so check that now. */ | ||
331 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | 561 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) |
332 | return -EINVAL; | 562 | return -EINVAL; |
563 | /* We start on the first shadow page table, and give it a blank PGD | ||
564 | * page. */ | ||
333 | lg->pgdidx = 0; | 565 | lg->pgdidx = 0; |
334 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; | 566 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; |
335 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); | 567 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); |
@@ -338,33 +570,48 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | |||
338 | return 0; | 570 | return 0; |
339 | } | 571 | } |
340 | 572 | ||
573 | /* When a Guest dies, our cleanup is fairly simple. */ | ||
341 | void free_guest_pagetable(struct lguest *lg) | 574 | void free_guest_pagetable(struct lguest *lg) |
342 | { | 575 | { |
343 | unsigned int i; | 576 | unsigned int i; |
344 | 577 | ||
578 | /* Throw away all page table pages. */ | ||
345 | release_all_pagetables(lg); | 579 | release_all_pagetables(lg); |
580 | /* Now free the top levels: free_page() can handle 0 just fine. */ | ||
346 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 581 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
347 | free_page((long)lg->pgdirs[i].pgdir); | 582 | free_page((long)lg->pgdirs[i].pgdir); |
348 | } | 583 | } |
349 | 584 | ||
350 | /* Caller must be preempt-safe */ | 585 | /*H:480 (vi) Mapping the Switcher when the Guest is about to run. |
586 | * | ||
587 | * The Switcher and the two pages for this CPU need to be available to the | ||
588 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | ||
589 | * for each CPU already set up, we just need to hook them in. */ | ||
351 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 590 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) |
352 | { | 591 | { |
353 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 592 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
354 | spgd_t switcher_pgd; | 593 | spgd_t switcher_pgd; |
355 | spte_t regs_pte; | 594 | spte_t regs_pte; |
356 | 595 | ||
357 | /* Since switcher less that 4MB, we simply mug top pte page. */ | 596 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
597 | * page for this CPU (with appropriate flags). */ | ||
358 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; | 598 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; |
359 | switcher_pgd.flags = _PAGE_KERNEL; | 599 | switcher_pgd.flags = _PAGE_KERNEL; |
360 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 600 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
361 | 601 | ||
362 | /* Map our regs page over stack page. */ | 602 | /* We also change the Switcher PTE page. When we're running the Guest, |
603 | * we want the Guest's "regs" page to appear where the first Switcher | ||
604 | * page for this CPU is. This is an optimization: when the Switcher | ||
605 | * saves the Guest registers, it saves them into the first page of this | ||
606 | * CPU's "struct lguest_pages": if we make sure the Guest's register | ||
607 | * page is already mapped there, we don't have to copy them out | ||
608 | * again. */ | ||
363 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; | 609 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; |
364 | regs_pte.flags = _PAGE_KERNEL; | 610 | regs_pte.flags = _PAGE_KERNEL; |
365 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] | 611 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] |
366 | = regs_pte; | 612 | = regs_pte; |
367 | } | 613 | } |
614 | /*:*/ | ||
368 | 615 | ||
369 | static void free_switcher_pte_pages(void) | 616 | static void free_switcher_pte_pages(void) |
370 | { | 617 | { |
@@ -374,6 +621,10 @@ static void free_switcher_pte_pages(void) | |||
374 | free_page((long)switcher_pte_page(i)); | 621 | free_page((long)switcher_pte_page(i)); |
375 | } | 622 | } |
376 | 623 | ||
624 | /*H:520 Setting up the Switcher PTE page for a given CPU is fairly easy, given | ||
625 | * the CPU number and the "struct page"s for the Switcher code itself. | ||
626 | * | ||
627 | * Currently the Switcher is less than a page long, so "pages" is always 1. */ | ||
377 | static __init void populate_switcher_pte_page(unsigned int cpu, | 628 | static __init void populate_switcher_pte_page(unsigned int cpu, |
378 | struct page *switcher_page[], | 629 | struct page *switcher_page[], |
379 | unsigned int pages) | 630 | unsigned int pages) |
@@ -381,21 +632,26 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
381 | unsigned int i; | 632 | unsigned int i; |
382 | spte_t *pte = switcher_pte_page(cpu); | 633 | spte_t *pte = switcher_pte_page(cpu); |
383 | 634 | ||
635 | /* The first entries are easy: they map the Switcher code. */ | ||
384 | for (i = 0; i < pages; i++) { | 636 | for (i = 0; i < pages; i++) { |
385 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 637 | pte[i].pfn = page_to_pfn(switcher_page[i]); |
386 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 638 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; |
387 | } | 639 | } |
388 | 640 | ||
389 | /* We only map this CPU's pages, so guest can't see others. */ | 641 | /* The only other thing we map is this CPU's pair of pages. */ |
390 | i = pages + cpu*2; | 642 | i = pages + cpu*2; |
391 | 643 | ||
392 | /* First page (regs) is rw, second (state) is ro. */ | 644 | /* First page (Guest registers) is writable from the Guest */ |
393 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 645 | pte[i].pfn = page_to_pfn(switcher_page[i]); |
394 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; | 646 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; |
647 | /* The second page contains the "struct lguest_ro_state", and is | ||
648 | * read-only. */ | ||
395 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); | 649 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); |
396 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 650 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; |
397 | } | 651 | } |
398 | 652 | ||
653 | /*H:510 At boot or module load time, init_pagetables() allocates and populates | ||
654 | * the Switcher PTE page for each CPU. */ | ||
399 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | 655 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) |
400 | { | 656 | { |
401 | unsigned int i; | 657 | unsigned int i; |
@@ -410,7 +666,9 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) | |||
410 | } | 666 | } |
411 | return 0; | 667 | return 0; |
412 | } | 668 | } |
669 | /*:*/ | ||
413 | 670 | ||
671 | /* Cleaning up simply involves freeing the PTE page for each CPU. */ | ||
414 | void free_pagetables(void) | 672 | void free_pagetables(void) |
415 | { | 673 | { |
416 | free_switcher_pte_pages(); | 674 | free_switcher_pte_pages(); |
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index c4fc7293b84b..4d4e5a4586f9 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
@@ -11,17 +11,58 @@ | |||
11 | * from frolicking through its parklike serenity. :*/ | 11 | * from frolicking through its parklike serenity. :*/ |
12 | #include "lg.h" | 12 | #include "lg.h" |
13 | 13 | ||
14 | /*H:600 | ||
15 | * We've almost completed the Host; there's just one file to go! | ||
16 | * | ||
17 | * Segments & The Global Descriptor Table | ||
18 | * | ||
19 | * (That title sounds like a bad Nerdcore group. Not to suggest that there are | ||
20 | * any good Nerdcore groups, but in high school a friend of mine had a band | ||
21 | * called Joe Fish and the Chips, so there are definitely worse band names). | ||
22 | * | ||
23 | * To refresh: the GDT is a table of 8-byte values describing segments. Once | ||
24 | * set up, these segments can be loaded into one of the 6 "segment registers". | ||
25 | * | ||
26 | * GDT entries are passed around as "struct desc_struct"s, which like IDT | ||
27 | * entries are split into two 32-bit members, "a" and "b". One day, someone | ||
28 | * will clean that up, and be declared a Hero. (No pressure, I'm just saying). | ||
29 | * | ||
30 | * Anyway, the GDT entry contains a base (the start address of the segment), a | ||
31 | * limit (the size of the segment - 1), and some flags. Sounds simple, and it | ||
32 | * would be, except those zany Intel engineers decided that it was too boring | ||
33 | * to put the base at one end, the limit at the other, and the flags in | ||
34 | * between. They decided to shotgun the bits at random throughout the 8 bytes, | ||
35 | * like so: | ||
36 | * | ||
37 | * 0 16 40 48 52 56 63 | ||
38 | * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] | ||
39 | * mit ags part 2 | ||
40 | * part 2 | ||
41 | * | ||
42 | * As a result, this file contains a certain amount of magic numeracy. Let's | ||
43 | * begin. | ||
44 | */ | ||
45 | |||
46 | /* Is the descriptor the Guest wants us to put in OK? | ||
47 | * | ||
48 | * The flag which Intel says must be zero: must be zero. The descriptor must | ||
49 | * be present, (this is actually checked earlier but is here for thoroughness), | ||
50 | * and the descriptor type must be 1 (a memory segment). */ | ||
14 | static int desc_ok(const struct desc_struct *gdt) | 51 | static int desc_ok(const struct desc_struct *gdt) |
15 | { | 52 | { |
16 | /* MBZ=0, P=1, DT=1 */ | ||
17 | return ((gdt->b & 0x00209000) == 0x00009000); | 53 | return ((gdt->b & 0x00209000) == 0x00009000); |
18 | } | 54 | } |
19 | 55 | ||
56 | /* Is the segment present? (Otherwise it can't be used by the Guest). */ | ||
20 | static int segment_present(const struct desc_struct *gdt) | 57 | static int segment_present(const struct desc_struct *gdt) |
21 | { | 58 | { |
22 | return gdt->b & 0x8000; | 59 | return gdt->b & 0x8000; |
23 | } | 60 | } |
24 | 61 | ||
62 | /* There are several entries we don't let the Guest set. The TSS entry is the | ||
63 | * "Task State Segment" which controls all kinds of delicate things. The | ||
64 | * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the | ||
65 | * Guest can't be trusted to deal with double faults. */ | ||
25 | static int ignored_gdt(unsigned int num) | 66 | static int ignored_gdt(unsigned int num) |
26 | { | 67 | { |
27 | return (num == GDT_ENTRY_TSS | 68 | return (num == GDT_ENTRY_TSS |
@@ -30,9 +71,18 @@ static int ignored_gdt(unsigned int num) | |||
30 | || num == GDT_ENTRY_DOUBLEFAULT_TSS); | 71 | || num == GDT_ENTRY_DOUBLEFAULT_TSS); |
31 | } | 72 | } |
32 | 73 | ||
33 | /* We don't allow removal of CS, DS or SS; it doesn't make sense. */ | 74 | /* If the Guest asks us to remove an entry from the GDT, we have to be careful. |
75 | * If one of the segment registers is pointing at that entry the Switcher will | ||
76 | * crash when it tries to reload the segment registers for the Guest. | ||
77 | * | ||
78 | * It doesn't make much sense for the Guest to try to remove its own code, data | ||
79 | * or stack segments while they're in use: assume that's a Guest bug. If it's | ||
80 | * one of the lesser segment registers using the removed entry, we simply set | ||
81 | * that register to 0 (unusable). */ | ||
34 | static void check_segment_use(struct lguest *lg, unsigned int desc) | 82 | static void check_segment_use(struct lguest *lg, unsigned int desc) |
35 | { | 83 | { |
84 | /* GDT entries are 8 bytes long, so we divide to get the index and | ||
85 | * ignore the bottom bits. */ | ||
36 | if (lg->regs->gs / 8 == desc) | 86 | if (lg->regs->gs / 8 == desc) |
37 | lg->regs->gs = 0; | 87 | lg->regs->gs = 0; |
38 | if (lg->regs->fs / 8 == desc) | 88 | if (lg->regs->fs / 8 == desc) |
@@ -45,12 +95,16 @@ static void check_segment_use(struct lguest *lg, unsigned int desc) | |||
45 | kill_guest(lg, "Removed live GDT entry %u", desc); | 95 | kill_guest(lg, "Removed live GDT entry %u", desc); |
46 | } | 96 | } |
47 | 97 | ||
98 | /*H:610 Once the GDT has been changed, we look through the changed entries and | ||
99 | * see if they're OK. If not, we'll call kill_guest() and the Guest will never | ||
100 | * get to use the invalid entries. */ | ||
48 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | 101 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) |
49 | { | 102 | { |
50 | unsigned int i; | 103 | unsigned int i; |
51 | 104 | ||
52 | for (i = start; i < end; i++) { | 105 | for (i = start; i < end; i++) { |
53 | /* We never copy these ones to real gdt */ | 106 | /* We never copy these ones to real GDT, so we don't care what |
107 | * they say */ | ||
54 | if (ignored_gdt(i)) | 108 | if (ignored_gdt(i)) |
55 | continue; | 109 | continue; |
56 | 110 | ||
@@ -64,41 +118,57 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | |||
64 | if (!desc_ok(&lg->gdt[i])) | 118 | if (!desc_ok(&lg->gdt[i])) |
65 | kill_guest(lg, "Bad GDT descriptor %i", i); | 119 | kill_guest(lg, "Bad GDT descriptor %i", i); |
66 | 120 | ||
67 | /* DPL 0 presumably means "for use by guest". */ | 121 | /* Segment descriptors contain a privilege level: the Guest is |
122 | * sometimes careless and leaves this as 0, even though it's | ||
123 | * running at privilege level 1. If so, we fix it here. */ | ||
68 | if ((lg->gdt[i].b & 0x00006000) == 0) | 124 | if ((lg->gdt[i].b & 0x00006000) == 0) |
69 | lg->gdt[i].b |= (GUEST_PL << 13); | 125 | lg->gdt[i].b |= (GUEST_PL << 13); |
70 | 126 | ||
71 | /* Set accessed bit, since gdt isn't writable. */ | 127 | /* Each descriptor has an "accessed" bit. If we don't set it |
128 | * now, the CPU will try to set it when the Guest first loads | ||
129 | * that entry into a segment register. But the GDT isn't | ||
130 | * writable by the Guest, so bad things can happen. */ | ||
72 | lg->gdt[i].b |= 0x00000100; | 131 | lg->gdt[i].b |= 0x00000100; |
73 | } | 132 | } |
74 | } | 133 | } |
75 | 134 | ||
135 | /* This routine is called at boot or modprobe time for each CPU to set up the | ||
136 | * "constant" GDT entries for Guests running on that CPU. */ | ||
76 | void setup_default_gdt_entries(struct lguest_ro_state *state) | 137 | void setup_default_gdt_entries(struct lguest_ro_state *state) |
77 | { | 138 | { |
78 | struct desc_struct *gdt = state->guest_gdt; | 139 | struct desc_struct *gdt = state->guest_gdt; |
79 | unsigned long tss = (unsigned long)&state->guest_tss; | 140 | unsigned long tss = (unsigned long)&state->guest_tss; |
80 | 141 | ||
81 | /* Hypervisor segments. */ | 142 | /* The hypervisor segments are full 0-4G segments, privilege level 0 */ |
82 | gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | 143 | gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; |
83 | gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | 144 | gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; |
84 | 145 | ||
85 | /* This is the one which we *cannot* copy from guest, since tss | 146 | /* The TSS segment refers to the TSS entry for this CPU, so we cannot |
86 | is depended on this lguest_ro_state, ie. this cpu. */ | 147 | * copy it from the Guest. Forgive the magic flags */ |
87 | gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); | 148 | gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); |
88 | gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | 149 | gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) |
89 | | ((tss >> 16) & 0x000000FF); | 150 | | ((tss >> 16) & 0x000000FF); |
90 | } | 151 | } |
91 | 152 | ||
153 | /* This routine is called before the Guest is run for the first time. */ | ||
92 | void setup_guest_gdt(struct lguest *lg) | 154 | void setup_guest_gdt(struct lguest *lg) |
93 | { | 155 | { |
156 | /* Start with full 0-4G segments... */ | ||
94 | lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; | 157 | lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; |
95 | lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; | 158 | lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; |
159 | /* ...except the Guest is allowed to use them, so set the privilege | ||
160 | * level appropriately in the flags. */ | ||
96 | lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); | 161 | lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); |
97 | lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); | 162 | lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); |
98 | } | 163 | } |
99 | 164 | ||
100 | /* This is a fast version for the common case where only the three TLS entries | 165 | /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the |
101 | * have changed. */ | 166 | * GDTs for each CPU, then we copy across the entries each time we want to run |
167 | * a different Guest on that CPU. */ | ||
168 | |||
169 | /* A partial GDT load, for the three "thread-local storage" entries. Otherwise | ||
170 | * it's just like load_guest_gdt(). So much, in fact, it would probably be | ||
171 | * neater to have a single hypercall to cover both. */ | ||
102 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | 172 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) |
103 | { | 173 | { |
104 | unsigned int i; | 174 | unsigned int i; |
@@ -107,22 +177,31 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | |||
107 | gdt[i] = lg->gdt[i]; | 177 | gdt[i] = lg->gdt[i]; |
108 | } | 178 | } |
109 | 179 | ||
180 | /* This is the full version */ | ||
110 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) | 181 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) |
111 | { | 182 | { |
112 | unsigned int i; | 183 | unsigned int i; |
113 | 184 | ||
185 | /* The default entries from setup_default_gdt_entries() are not | ||
186 | * replaced. See ignored_gdt() above. */ | ||
114 | for (i = 0; i < GDT_ENTRIES; i++) | 187 | for (i = 0; i < GDT_ENTRIES; i++) |
115 | if (!ignored_gdt(i)) | 188 | if (!ignored_gdt(i)) |
116 | gdt[i] = lg->gdt[i]; | 189 | gdt[i] = lg->gdt[i]; |
117 | } | 190 | } |
118 | 191 | ||
192 | /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ | ||
119 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) | 193 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) |
120 | { | 194 | { |
195 | /* We assume the Guest has the same number of GDT entries as the | ||
196 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | ||
121 | if (num > ARRAY_SIZE(lg->gdt)) | 197 | if (num > ARRAY_SIZE(lg->gdt)) |
122 | kill_guest(lg, "too many gdt entries %i", num); | 198 | kill_guest(lg, "too many gdt entries %i", num); |
123 | 199 | ||
200 | /* We read the whole thing in, then fix it up. */ | ||
124 | lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); | 201 | lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); |
125 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); | 202 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); |
203 | /* Mark that the GDT changed so the core knows it has to copy it again, | ||
204 | * even if the Guest is run on the same CPU. */ | ||
126 | lg->changed |= CHANGED_GDT; | 205 | lg->changed |= CHANGED_GDT; |
127 | } | 206 | } |
128 | 207 | ||
@@ -134,3 +213,13 @@ void guest_load_tls(struct lguest *lg, unsigned long gtls) | |||
134 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); | 213 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); |
135 | lg->changed |= CHANGED_GDT_TLS; | 214 | lg->changed |= CHANGED_GDT_TLS; |
136 | } | 215 | } |
216 | |||
217 | /* | ||
218 | * With this, we have finished the Host. | ||
219 | * | ||
220 | * Five of the seven parts of our task are complete. You have made it through | ||
221 | * the Bit of Despair (I think that's somewhere in the page table code, | ||
222 | * myself). | ||
223 | * | ||
224 | * Next, we examine "make Switcher". It's short, but intense. | ||
225 | */ | ||