diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-07-19 04:49:23 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-19 13:04:52 -0400 |
commit | d7e28ffe6c74416b54345d6004fd0964c115b12c (patch) | |
tree | 844beb4f400d5400098538e0c1e5f12d20a9504a /drivers/lguest/core.c | |
parent | 07ad157f6e5d228be78acd5cea0291e5d0360398 (diff) |
lguest: the host code
This is the code for the "lg.ko" module, which allows lguest guests to
be launched.
[akpm@linux-foundation.org: update for futex-new-private-futexes]
[akpm@linux-foundation.org: build fix]
[jmorris@namei.org: lguest: use hrtimers]
[akpm@linux-foundation.org: x86_64 build fix]
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/lguest/core.c')
-rw-r--r-- | drivers/lguest/core.c | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c new file mode 100644 index 000000000000..ce909ec57499 --- /dev/null +++ b/drivers/lguest/core.c | |||
@@ -0,0 +1,462 @@ | |||
1 | /* World's simplest hypervisor, to test paravirt_ops and show | ||
2 | * unbelievers that virtualization is the future. Plus, it's fun! */ | ||
3 | #include <linux/module.h> | ||
4 | #include <linux/stringify.h> | ||
5 | #include <linux/stddef.h> | ||
6 | #include <linux/io.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/vmalloc.h> | ||
9 | #include <linux/cpu.h> | ||
10 | #include <linux/freezer.h> | ||
11 | #include <asm/paravirt.h> | ||
12 | #include <asm/desc.h> | ||
13 | #include <asm/pgtable.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/poll.h> | ||
16 | #include <asm/highmem.h> | ||
17 | #include <asm/asm-offsets.h> | ||
18 | #include <asm/i387.h> | ||
19 | #include "lg.h" | ||
20 | |||
/* Symbols provided by switcher.S. */
extern char start_switcher_text[];
extern char end_switcher_text[];
extern char switch_to_guest[];
extern unsigned long default_idt_entries[];

/* Number of pages taken by the switcher text, which every guest maps. */
#define SHARED_SWITCHER_PAGES \
	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
/* The switcher text pages come first, followed by two pages for each cpu. */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)

/* Mapped at -4M so that a single PTE page covers it inside the guest. */
#define SWITCHER_ADDR 0xFFC00000
33 | |||
34 | static struct vm_struct *switcher_vma; | ||
35 | static struct page **switcher_page; | ||
36 | |||
37 | static int cpu_had_pge; | ||
38 | static struct { | ||
39 | unsigned long offset; | ||
40 | unsigned short segment; | ||
41 | } lguest_entry; | ||
42 | |||
43 | /* This One Big lock protects all inter-guest data structures. */ | ||
44 | DEFINE_MUTEX(lguest_lock); | ||
45 | static DEFINE_PER_CPU(struct lguest *, last_guest); | ||
46 | |||
47 | /* FIXME: Make dynamic. */ | ||
48 | #define MAX_LGUEST_GUESTS 16 | ||
49 | struct lguest lguests[MAX_LGUEST_GUESTS]; | ||
50 | |||
51 | /* Offset from where switcher.S was compiled to where we've copied it */ | ||
52 | static unsigned long switcher_offset(void) | ||
53 | { | ||
54 | return SWITCHER_ADDR - (unsigned long)start_switcher_text; | ||
55 | } | ||
56 | |||
57 | /* This cpu's struct lguest_pages. */ | ||
58 | static struct lguest_pages *lguest_pages(unsigned int cpu) | ||
59 | { | ||
60 | return &(((struct lguest_pages *) | ||
61 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | ||
62 | } | ||
63 | |||
64 | static __init int map_switcher(void) | ||
65 | { | ||
66 | int i, err; | ||
67 | struct page **pagep; | ||
68 | |||
69 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | ||
70 | GFP_KERNEL); | ||
71 | if (!switcher_page) { | ||
72 | err = -ENOMEM; | ||
73 | goto out; | ||
74 | } | ||
75 | |||
76 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | ||
77 | unsigned long addr = get_zeroed_page(GFP_KERNEL); | ||
78 | if (!addr) { | ||
79 | err = -ENOMEM; | ||
80 | goto free_some_pages; | ||
81 | } | ||
82 | switcher_page[i] = virt_to_page(addr); | ||
83 | } | ||
84 | |||
85 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | ||
86 | VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); | ||
87 | if (!switcher_vma) { | ||
88 | err = -ENOMEM; | ||
89 | printk("lguest: could not map switcher pages high\n"); | ||
90 | goto free_pages; | ||
91 | } | ||
92 | |||
93 | pagep = switcher_page; | ||
94 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); | ||
95 | if (err) { | ||
96 | printk("lguest: map_vm_area failed: %i\n", err); | ||
97 | goto free_vma; | ||
98 | } | ||
99 | memcpy(switcher_vma->addr, start_switcher_text, | ||
100 | end_switcher_text - start_switcher_text); | ||
101 | |||
102 | /* Fix up IDT entries to point into copied text. */ | ||
103 | for (i = 0; i < IDT_ENTRIES; i++) | ||
104 | default_idt_entries[i] += switcher_offset(); | ||
105 | |||
106 | for_each_possible_cpu(i) { | ||
107 | struct lguest_pages *pages = lguest_pages(i); | ||
108 | struct lguest_ro_state *state = &pages->state; | ||
109 | |||
110 | /* These fields are static: rest done in copy_in_guest_info */ | ||
111 | state->host_gdt_desc.size = GDT_SIZE-1; | ||
112 | state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); | ||
113 | store_idt(&state->host_idt_desc); | ||
114 | state->guest_idt_desc.size = sizeof(state->guest_idt)-1; | ||
115 | state->guest_idt_desc.address = (long)&state->guest_idt; | ||
116 | state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; | ||
117 | state->guest_gdt_desc.address = (long)&state->guest_gdt; | ||
118 | state->guest_tss.esp0 = (long)(&pages->regs + 1); | ||
119 | state->guest_tss.ss0 = LGUEST_DS; | ||
120 | /* No I/O for you! */ | ||
121 | state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); | ||
122 | setup_default_gdt_entries(state); | ||
123 | setup_default_idt_entries(state, default_idt_entries); | ||
124 | |||
125 | /* Setup LGUEST segments on all cpus */ | ||
126 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | ||
127 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | ||
128 | } | ||
129 | |||
130 | /* Initialize entry point into switcher. */ | ||
131 | lguest_entry.offset = (long)switch_to_guest + switcher_offset(); | ||
132 | lguest_entry.segment = LGUEST_CS; | ||
133 | |||
134 | printk(KERN_INFO "lguest: mapped switcher at %p\n", | ||
135 | switcher_vma->addr); | ||
136 | return 0; | ||
137 | |||
138 | free_vma: | ||
139 | vunmap(switcher_vma->addr); | ||
140 | free_pages: | ||
141 | i = TOTAL_SWITCHER_PAGES; | ||
142 | free_some_pages: | ||
143 | for (--i; i >= 0; i--) | ||
144 | __free_pages(switcher_page[i], 0); | ||
145 | kfree(switcher_page); | ||
146 | out: | ||
147 | return err; | ||
148 | } | ||
149 | |||
150 | static void unmap_switcher(void) | ||
151 | { | ||
152 | unsigned int i; | ||
153 | |||
154 | vunmap(switcher_vma->addr); | ||
155 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | ||
156 | __free_pages(switcher_page[i], 0); | ||
157 | } | ||
158 | |||
159 | /* IN/OUT insns: enough to get us past boot-time probing. */ | ||
160 | static int emulate_insn(struct lguest *lg) | ||
161 | { | ||
162 | u8 insn; | ||
163 | unsigned int insnlen = 0, in = 0, shift = 0; | ||
164 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); | ||
165 | |||
166 | /* This only works for addresses in linear mapping... */ | ||
167 | if (lg->regs->eip < lg->page_offset) | ||
168 | return 0; | ||
169 | lgread(lg, &insn, physaddr, 1); | ||
170 | |||
171 | /* Operand size prefix means it's actually for ax. */ | ||
172 | if (insn == 0x66) { | ||
173 | shift = 16; | ||
174 | insnlen = 1; | ||
175 | lgread(lg, &insn, physaddr + insnlen, 1); | ||
176 | } | ||
177 | |||
178 | switch (insn & 0xFE) { | ||
179 | case 0xE4: /* in <next byte>,%al */ | ||
180 | insnlen += 2; | ||
181 | in = 1; | ||
182 | break; | ||
183 | case 0xEC: /* in (%dx),%al */ | ||
184 | insnlen += 1; | ||
185 | in = 1; | ||
186 | break; | ||
187 | case 0xE6: /* out %al,<next byte> */ | ||
188 | insnlen += 2; | ||
189 | break; | ||
190 | case 0xEE: /* out %al,(%dx) */ | ||
191 | insnlen += 1; | ||
192 | break; | ||
193 | default: | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | if (in) { | ||
198 | /* Lower bit tells is whether it's a 16 or 32 bit access */ | ||
199 | if (insn & 0x1) | ||
200 | lg->regs->eax = 0xFFFFFFFF; | ||
201 | else | ||
202 | lg->regs->eax |= (0xFFFF << shift); | ||
203 | } | ||
204 | lg->regs->eip += insnlen; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | int lguest_address_ok(const struct lguest *lg, | ||
209 | unsigned long addr, unsigned long len) | ||
210 | { | ||
211 | return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); | ||
212 | } | ||
213 | |||
214 | /* Just like get_user, but don't let guest access lguest binary. */ | ||
215 | u32 lgread_u32(struct lguest *lg, unsigned long addr) | ||
216 | { | ||
217 | u32 val = 0; | ||
218 | |||
219 | /* Don't let them access lguest binary */ | ||
220 | if (!lguest_address_ok(lg, addr, sizeof(val)) | ||
221 | || get_user(val, (u32 __user *)addr) != 0) | ||
222 | kill_guest(lg, "bad read address %#lx", addr); | ||
223 | return val; | ||
224 | } | ||
225 | |||
226 | void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) | ||
227 | { | ||
228 | if (!lguest_address_ok(lg, addr, sizeof(val)) | ||
229 | || put_user(val, (u32 __user *)addr) != 0) | ||
230 | kill_guest(lg, "bad write address %#lx", addr); | ||
231 | } | ||
232 | |||
233 | void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) | ||
234 | { | ||
235 | if (!lguest_address_ok(lg, addr, bytes) | ||
236 | || copy_from_user(b, (void __user *)addr, bytes) != 0) { | ||
237 | /* copy_from_user should do this, but as we rely on it... */ | ||
238 | memset(b, 0, bytes); | ||
239 | kill_guest(lg, "bad read address %#lx len %u", addr, bytes); | ||
240 | } | ||
241 | } | ||
242 | |||
243 | void lgwrite(struct lguest *lg, unsigned long addr, const void *b, | ||
244 | unsigned bytes) | ||
245 | { | ||
246 | if (!lguest_address_ok(lg, addr, bytes) | ||
247 | || copy_to_user((void __user *)addr, b, bytes) != 0) | ||
248 | kill_guest(lg, "bad write address %#lx len %u", addr, bytes); | ||
249 | } | ||
250 | |||
251 | static void set_ts(void) | ||
252 | { | ||
253 | u32 cr0; | ||
254 | |||
255 | cr0 = read_cr0(); | ||
256 | if (!(cr0 & 8)) | ||
257 | write_cr0(cr0|8); | ||
258 | } | ||
259 | |||
260 | static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) | ||
261 | { | ||
262 | if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { | ||
263 | __get_cpu_var(last_guest) = lg; | ||
264 | lg->last_pages = pages; | ||
265 | lg->changed = CHANGED_ALL; | ||
266 | } | ||
267 | |||
268 | /* These are pretty cheap, so we do them unconditionally. */ | ||
269 | pages->state.host_cr3 = __pa(current->mm->pgd); | ||
270 | map_switcher_in_guest(lg, pages); | ||
271 | pages->state.guest_tss.esp1 = lg->esp1; | ||
272 | pages->state.guest_tss.ss1 = lg->ss1; | ||
273 | |||
274 | /* Copy direct trap entries. */ | ||
275 | if (lg->changed & CHANGED_IDT) | ||
276 | copy_traps(lg, pages->state.guest_idt, default_idt_entries); | ||
277 | |||
278 | /* Copy all GDT entries but the TSS. */ | ||
279 | if (lg->changed & CHANGED_GDT) | ||
280 | copy_gdt(lg, pages->state.guest_gdt); | ||
281 | /* If only the TLS entries have changed, copy them. */ | ||
282 | else if (lg->changed & CHANGED_GDT_TLS) | ||
283 | copy_gdt_tls(lg, pages->state.guest_gdt); | ||
284 | |||
285 | lg->changed = 0; | ||
286 | } | ||
287 | |||
288 | static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | ||
289 | { | ||
290 | unsigned int clobber; | ||
291 | |||
292 | copy_in_guest_info(lg, pages); | ||
293 | |||
294 | /* Put eflags on stack, lcall does rest: suitable for iret return. */ | ||
295 | asm volatile("pushf; lcall *lguest_entry" | ||
296 | : "=a"(clobber), "=b"(clobber) | ||
297 | : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) | ||
298 | : "memory", "%edx", "%ecx", "%edi", "%esi"); | ||
299 | } | ||
300 | |||
301 | int run_guest(struct lguest *lg, unsigned long __user *user) | ||
302 | { | ||
303 | while (!lg->dead) { | ||
304 | unsigned int cr2 = 0; /* Damn gcc */ | ||
305 | |||
306 | /* Hypercalls first: we might have been out to userspace */ | ||
307 | do_hypercalls(lg); | ||
308 | if (lg->dma_is_pending) { | ||
309 | if (put_user(lg->pending_dma, user) || | ||
310 | put_user(lg->pending_key, user+1)) | ||
311 | return -EFAULT; | ||
312 | return sizeof(unsigned long)*2; | ||
313 | } | ||
314 | |||
315 | if (signal_pending(current)) | ||
316 | return -ERESTARTSYS; | ||
317 | |||
318 | /* If Waker set break_out, return to Launcher. */ | ||
319 | if (lg->break_out) | ||
320 | return -EAGAIN; | ||
321 | |||
322 | maybe_do_interrupt(lg); | ||
323 | |||
324 | try_to_freeze(); | ||
325 | |||
326 | if (lg->dead) | ||
327 | break; | ||
328 | |||
329 | if (lg->halted) { | ||
330 | set_current_state(TASK_INTERRUPTIBLE); | ||
331 | schedule(); | ||
332 | continue; | ||
333 | } | ||
334 | |||
335 | local_irq_disable(); | ||
336 | |||
337 | /* Even if *we* don't want FPU trap, guest might... */ | ||
338 | if (lg->ts) | ||
339 | set_ts(); | ||
340 | |||
341 | /* Don't let Guest do SYSENTER: we can't handle it. */ | ||
342 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
343 | wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); | ||
344 | |||
345 | run_guest_once(lg, lguest_pages(raw_smp_processor_id())); | ||
346 | |||
347 | /* Save cr2 now if we page-faulted. */ | ||
348 | if (lg->regs->trapnum == 14) | ||
349 | cr2 = read_cr2(); | ||
350 | else if (lg->regs->trapnum == 7) | ||
351 | math_state_restore(); | ||
352 | |||
353 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
354 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | ||
355 | local_irq_enable(); | ||
356 | |||
357 | switch (lg->regs->trapnum) { | ||
358 | case 13: /* We've intercepted a GPF. */ | ||
359 | if (lg->regs->errcode == 0) { | ||
360 | if (emulate_insn(lg)) | ||
361 | continue; | ||
362 | } | ||
363 | break; | ||
364 | case 14: /* We've intercepted a page fault. */ | ||
365 | if (demand_page(lg, cr2, lg->regs->errcode)) | ||
366 | continue; | ||
367 | |||
368 | /* If lguest_data is NULL, this won't hurt. */ | ||
369 | if (put_user(cr2, &lg->lguest_data->cr2)) | ||
370 | kill_guest(lg, "Writing cr2"); | ||
371 | break; | ||
372 | case 7: /* We've intercepted a Device Not Available fault. */ | ||
373 | /* If they don't want to know, just absorb it. */ | ||
374 | if (!lg->ts) | ||
375 | continue; | ||
376 | break; | ||
377 | case 32 ... 255: /* Real interrupt, fall thru */ | ||
378 | cond_resched(); | ||
379 | case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ | ||
380 | continue; | ||
381 | } | ||
382 | |||
383 | if (deliver_trap(lg, lg->regs->trapnum)) | ||
384 | continue; | ||
385 | |||
386 | kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", | ||
387 | lg->regs->trapnum, lg->regs->eip, | ||
388 | lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); | ||
389 | } | ||
390 | return -ENOENT; | ||
391 | } | ||
392 | |||
393 | int find_free_guest(void) | ||
394 | { | ||
395 | unsigned int i; | ||
396 | for (i = 0; i < MAX_LGUEST_GUESTS; i++) | ||
397 | if (!lguests[i].tsk) | ||
398 | return i; | ||
399 | return -1; | ||
400 | } | ||
401 | |||
402 | static void adjust_pge(void *on) | ||
403 | { | ||
404 | if (on) | ||
405 | write_cr4(read_cr4() | X86_CR4_PGE); | ||
406 | else | ||
407 | write_cr4(read_cr4() & ~X86_CR4_PGE); | ||
408 | } | ||
409 | |||
410 | static int __init init(void) | ||
411 | { | ||
412 | int err; | ||
413 | |||
414 | if (paravirt_enabled()) { | ||
415 | printk("lguest is afraid of %s\n", paravirt_ops.name); | ||
416 | return -EPERM; | ||
417 | } | ||
418 | |||
419 | err = map_switcher(); | ||
420 | if (err) | ||
421 | return err; | ||
422 | |||
423 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | ||
424 | if (err) { | ||
425 | unmap_switcher(); | ||
426 | return err; | ||
427 | } | ||
428 | lguest_io_init(); | ||
429 | |||
430 | err = lguest_device_init(); | ||
431 | if (err) { | ||
432 | free_pagetables(); | ||
433 | unmap_switcher(); | ||
434 | return err; | ||
435 | } | ||
436 | lock_cpu_hotplug(); | ||
437 | if (cpu_has_pge) { /* We have a broader idea of "global". */ | ||
438 | cpu_had_pge = 1; | ||
439 | on_each_cpu(adjust_pge, (void *)0, 0, 1); | ||
440 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | ||
441 | } | ||
442 | unlock_cpu_hotplug(); | ||
443 | return 0; | ||
444 | } | ||
445 | |||
446 | static void __exit fini(void) | ||
447 | { | ||
448 | lguest_device_remove(); | ||
449 | free_pagetables(); | ||
450 | unmap_switcher(); | ||
451 | lock_cpu_hotplug(); | ||
452 | if (cpu_had_pge) { | ||
453 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | ||
454 | on_each_cpu(adjust_pge, (void *)1, 0, 1); | ||
455 | } | ||
456 | unlock_cpu_hotplug(); | ||
457 | } | ||
458 | |||
459 | module_init(init); | ||
460 | module_exit(fini); | ||
461 | MODULE_LICENSE("GPL"); | ||
462 | MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | ||