diff options
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- | drivers/lguest/Kconfig | 20 | ||||
-rw-r--r-- | drivers/lguest/Makefile | 7 | ||||
-rw-r--r-- | drivers/lguest/core.c | 462 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 192 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 268 | ||||
-rw-r--r-- | drivers/lguest/io.c | 399 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 261 | ||||
-rw-r--r-- | drivers/lguest/lguest.c | 621 | ||||
-rw-r--r-- | drivers/lguest/lguest_asm.S | 56 | ||||
-rw-r--r-- | drivers/lguest/lguest_bus.c | 148 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 236 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 411 | ||||
-rw-r--r-- | drivers/lguest/segments.c | 125 | ||||
-rw-r--r-- | drivers/lguest/switcher.S | 159 |
14 files changed, 3365 insertions, 0 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig new file mode 100644 index 000000000000..43d901fdc77f --- /dev/null +++ b/drivers/lguest/Kconfig | |||
@@ -0,0 +1,20 @@ | |||
1 | config LGUEST | ||
2 | tristate "Linux hypervisor example code" | ||
3 | depends on X86 && PARAVIRT && NET && EXPERIMENTAL && !X86_PAE | ||
4 | select LGUEST_GUEST | ||
5 | select HVC_DRIVER | ||
6 | ---help--- | ||
7 | This is a very simple module which allows you to run | ||
8 | multiple instances of the same Linux kernel, using the | ||
9 | "lguest" command found in the Documentation/lguest directory. | ||
10 | Note that "lguest" is pronounced to rhyme with "fell quest", | ||
11 | not "rustyvisor". See Documentation/lguest/lguest.txt. | ||
12 | |||
13 | If unsure, say N. If curious, say M. If masochistic, say Y. | ||
14 | |||
15 | config LGUEST_GUEST | ||
16 | bool | ||
17 | help | ||
18 | The guest needs code built-in, even if the host has lguest | ||
19 | support as a module. The drivers are tiny, so we build them | ||
20 | in too. | ||
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile new file mode 100644 index 000000000000..55382c7d799c --- /dev/null +++ b/drivers/lguest/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | # Guest requires the paravirt_ops replacement and the bus driver. | ||
2 | obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o | ||
3 | |||
4 | # Host requires the other files, which can be a module. | ||
5 | obj-$(CONFIG_LGUEST) += lg.o | ||
6 | lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ | ||
7 | segments.o io.o lguest_user.o switcher.o | ||
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c new file mode 100644 index 000000000000..ce909ec57499 --- /dev/null +++ b/drivers/lguest/core.c | |||
@@ -0,0 +1,462 @@ | |||
1 | /* World's simplest hypervisor, to test paravirt_ops and show | ||
2 | * unbelievers that virtualization is the future. Plus, it's fun! */ | ||
3 | #include <linux/module.h> | ||
4 | #include <linux/stringify.h> | ||
5 | #include <linux/stddef.h> | ||
6 | #include <linux/io.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/vmalloc.h> | ||
9 | #include <linux/cpu.h> | ||
10 | #include <linux/freezer.h> | ||
11 | #include <asm/paravirt.h> | ||
12 | #include <asm/desc.h> | ||
13 | #include <asm/pgtable.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/poll.h> | ||
16 | #include <asm/highmem.h> | ||
17 | #include <asm/asm-offsets.h> | ||
18 | #include <asm/i387.h> | ||
19 | #include "lg.h" | ||
20 | |||
21 | /* Found in switcher.S */ | ||
22 | extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; | ||
23 | extern unsigned long default_idt_entries[]; | ||
24 | |||
25 | /* Every guest maps the core switcher code. */ | ||
26 | #define SHARED_SWITCHER_PAGES \ | ||
27 | DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) | ||
28 | /* Pages for switcher itself, then two pages per cpu */ | ||
29 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) | ||
30 | |||
31 | /* We map at -4M for ease of mapping into the guest (one PTE page). */ | ||
32 | #define SWITCHER_ADDR 0xFFC00000 | ||
33 | |||
34 | static struct vm_struct *switcher_vma; | ||
35 | static struct page **switcher_page; | ||
36 | |||
37 | static int cpu_had_pge; | ||
38 | static struct { | ||
39 | unsigned long offset; | ||
40 | unsigned short segment; | ||
41 | } lguest_entry; | ||
42 | |||
43 | /* This One Big lock protects all inter-guest data structures. */ | ||
44 | DEFINE_MUTEX(lguest_lock); | ||
45 | static DEFINE_PER_CPU(struct lguest *, last_guest); | ||
46 | |||
47 | /* FIXME: Make dynamic. */ | ||
48 | #define MAX_LGUEST_GUESTS 16 | ||
49 | struct lguest lguests[MAX_LGUEST_GUESTS]; | ||
50 | |||
51 | /* Offset from where switcher.S was compiled to where we've copied it */ | ||
52 | static unsigned long switcher_offset(void) | ||
53 | { | ||
54 | return SWITCHER_ADDR - (unsigned long)start_switcher_text; | ||
55 | } | ||
56 | |||
57 | /* This cpu's struct lguest_pages. */ | ||
58 | static struct lguest_pages *lguest_pages(unsigned int cpu) | ||
59 | { | ||
60 | return &(((struct lguest_pages *) | ||
61 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | ||
62 | } | ||
63 | |||
64 | static __init int map_switcher(void) | ||
65 | { | ||
66 | int i, err; | ||
67 | struct page **pagep; | ||
68 | |||
69 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | ||
70 | GFP_KERNEL); | ||
71 | if (!switcher_page) { | ||
72 | err = -ENOMEM; | ||
73 | goto out; | ||
74 | } | ||
75 | |||
76 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | ||
77 | unsigned long addr = get_zeroed_page(GFP_KERNEL); | ||
78 | if (!addr) { | ||
79 | err = -ENOMEM; | ||
80 | goto free_some_pages; | ||
81 | } | ||
82 | switcher_page[i] = virt_to_page(addr); | ||
83 | } | ||
84 | |||
85 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | ||
86 | VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); | ||
87 | if (!switcher_vma) { | ||
88 | err = -ENOMEM; | ||
89 | printk("lguest: could not map switcher pages high\n"); | ||
90 | goto free_pages; | ||
91 | } | ||
92 | |||
93 | pagep = switcher_page; | ||
94 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); | ||
95 | if (err) { | ||
96 | printk("lguest: map_vm_area failed: %i\n", err); | ||
97 | goto free_vma; | ||
98 | } | ||
99 | memcpy(switcher_vma->addr, start_switcher_text, | ||
100 | end_switcher_text - start_switcher_text); | ||
101 | |||
102 | /* Fix up IDT entries to point into copied text. */ | ||
103 | for (i = 0; i < IDT_ENTRIES; i++) | ||
104 | default_idt_entries[i] += switcher_offset(); | ||
105 | |||
106 | for_each_possible_cpu(i) { | ||
107 | struct lguest_pages *pages = lguest_pages(i); | ||
108 | struct lguest_ro_state *state = &pages->state; | ||
109 | |||
110 | /* These fields are static: rest done in copy_in_guest_info */ | ||
111 | state->host_gdt_desc.size = GDT_SIZE-1; | ||
112 | state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); | ||
113 | store_idt(&state->host_idt_desc); | ||
114 | state->guest_idt_desc.size = sizeof(state->guest_idt)-1; | ||
115 | state->guest_idt_desc.address = (long)&state->guest_idt; | ||
116 | state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; | ||
117 | state->guest_gdt_desc.address = (long)&state->guest_gdt; | ||
118 | state->guest_tss.esp0 = (long)(&pages->regs + 1); | ||
119 | state->guest_tss.ss0 = LGUEST_DS; | ||
120 | /* No I/O for you! */ | ||
121 | state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); | ||
122 | setup_default_gdt_entries(state); | ||
123 | setup_default_idt_entries(state, default_idt_entries); | ||
124 | |||
125 | /* Setup LGUEST segments on all cpus */ | ||
126 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | ||
127 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | ||
128 | } | ||
129 | |||
130 | /* Initialize entry point into switcher. */ | ||
131 | lguest_entry.offset = (long)switch_to_guest + switcher_offset(); | ||
132 | lguest_entry.segment = LGUEST_CS; | ||
133 | |||
134 | printk(KERN_INFO "lguest: mapped switcher at %p\n", | ||
135 | switcher_vma->addr); | ||
136 | return 0; | ||
137 | |||
138 | free_vma: | ||
139 | vunmap(switcher_vma->addr); | ||
140 | free_pages: | ||
141 | i = TOTAL_SWITCHER_PAGES; | ||
142 | free_some_pages: | ||
143 | for (--i; i >= 0; i--) | ||
144 | __free_pages(switcher_page[i], 0); | ||
145 | kfree(switcher_page); | ||
146 | out: | ||
147 | return err; | ||
148 | } | ||
149 | |||
150 | static void unmap_switcher(void) | ||
151 | { | ||
152 | unsigned int i; | ||
153 | |||
154 | vunmap(switcher_vma->addr); | ||
155 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | ||
156 | __free_pages(switcher_page[i], 0); | ||
157 | } | ||
158 | |||
159 | /* IN/OUT insns: enough to get us past boot-time probing. */ | ||
160 | static int emulate_insn(struct lguest *lg) | ||
161 | { | ||
162 | u8 insn; | ||
163 | unsigned int insnlen = 0, in = 0, shift = 0; | ||
164 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); | ||
165 | |||
166 | /* This only works for addresses in linear mapping... */ | ||
167 | if (lg->regs->eip < lg->page_offset) | ||
168 | return 0; | ||
169 | lgread(lg, &insn, physaddr, 1); | ||
170 | |||
171 | /* Operand size prefix means it's actually for ax. */ | ||
172 | if (insn == 0x66) { | ||
173 | shift = 16; | ||
174 | insnlen = 1; | ||
175 | lgread(lg, &insn, physaddr + insnlen, 1); | ||
176 | } | ||
177 | |||
178 | switch (insn & 0xFE) { | ||
179 | case 0xE4: /* in <next byte>,%al */ | ||
180 | insnlen += 2; | ||
181 | in = 1; | ||
182 | break; | ||
183 | case 0xEC: /* in (%dx),%al */ | ||
184 | insnlen += 1; | ||
185 | in = 1; | ||
186 | break; | ||
187 | case 0xE6: /* out %al,<next byte> */ | ||
188 | insnlen += 2; | ||
189 | break; | ||
190 | case 0xEE: /* out %al,(%dx) */ | ||
191 | insnlen += 1; | ||
192 | break; | ||
193 | default: | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | if (in) { | ||
198 | /* Lower bit tells is whether it's a 16 or 32 bit access */ | ||
199 | if (insn & 0x1) | ||
200 | lg->regs->eax = 0xFFFFFFFF; | ||
201 | else | ||
202 | lg->regs->eax |= (0xFFFF << shift); | ||
203 | } | ||
204 | lg->regs->eip += insnlen; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | int lguest_address_ok(const struct lguest *lg, | ||
209 | unsigned long addr, unsigned long len) | ||
210 | { | ||
211 | return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); | ||
212 | } | ||
213 | |||
214 | /* Just like get_user, but don't let guest access lguest binary. */ | ||
215 | u32 lgread_u32(struct lguest *lg, unsigned long addr) | ||
216 | { | ||
217 | u32 val = 0; | ||
218 | |||
219 | /* Don't let them access lguest binary */ | ||
220 | if (!lguest_address_ok(lg, addr, sizeof(val)) | ||
221 | || get_user(val, (u32 __user *)addr) != 0) | ||
222 | kill_guest(lg, "bad read address %#lx", addr); | ||
223 | return val; | ||
224 | } | ||
225 | |||
226 | void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) | ||
227 | { | ||
228 | if (!lguest_address_ok(lg, addr, sizeof(val)) | ||
229 | || put_user(val, (u32 __user *)addr) != 0) | ||
230 | kill_guest(lg, "bad write address %#lx", addr); | ||
231 | } | ||
232 | |||
233 | void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) | ||
234 | { | ||
235 | if (!lguest_address_ok(lg, addr, bytes) | ||
236 | || copy_from_user(b, (void __user *)addr, bytes) != 0) { | ||
237 | /* copy_from_user should do this, but as we rely on it... */ | ||
238 | memset(b, 0, bytes); | ||
239 | kill_guest(lg, "bad read address %#lx len %u", addr, bytes); | ||
240 | } | ||
241 | } | ||
242 | |||
243 | void lgwrite(struct lguest *lg, unsigned long addr, const void *b, | ||
244 | unsigned bytes) | ||
245 | { | ||
246 | if (!lguest_address_ok(lg, addr, bytes) | ||
247 | || copy_to_user((void __user *)addr, b, bytes) != 0) | ||
248 | kill_guest(lg, "bad write address %#lx len %u", addr, bytes); | ||
249 | } | ||
250 | |||
251 | static void set_ts(void) | ||
252 | { | ||
253 | u32 cr0; | ||
254 | |||
255 | cr0 = read_cr0(); | ||
256 | if (!(cr0 & 8)) | ||
257 | write_cr0(cr0|8); | ||
258 | } | ||
259 | |||
260 | static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) | ||
261 | { | ||
262 | if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { | ||
263 | __get_cpu_var(last_guest) = lg; | ||
264 | lg->last_pages = pages; | ||
265 | lg->changed = CHANGED_ALL; | ||
266 | } | ||
267 | |||
268 | /* These are pretty cheap, so we do them unconditionally. */ | ||
269 | pages->state.host_cr3 = __pa(current->mm->pgd); | ||
270 | map_switcher_in_guest(lg, pages); | ||
271 | pages->state.guest_tss.esp1 = lg->esp1; | ||
272 | pages->state.guest_tss.ss1 = lg->ss1; | ||
273 | |||
274 | /* Copy direct trap entries. */ | ||
275 | if (lg->changed & CHANGED_IDT) | ||
276 | copy_traps(lg, pages->state.guest_idt, default_idt_entries); | ||
277 | |||
278 | /* Copy all GDT entries but the TSS. */ | ||
279 | if (lg->changed & CHANGED_GDT) | ||
280 | copy_gdt(lg, pages->state.guest_gdt); | ||
281 | /* If only the TLS entries have changed, copy them. */ | ||
282 | else if (lg->changed & CHANGED_GDT_TLS) | ||
283 | copy_gdt_tls(lg, pages->state.guest_gdt); | ||
284 | |||
285 | lg->changed = 0; | ||
286 | } | ||
287 | |||
288 | static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | ||
289 | { | ||
290 | unsigned int clobber; | ||
291 | |||
292 | copy_in_guest_info(lg, pages); | ||
293 | |||
294 | /* Put eflags on stack, lcall does rest: suitable for iret return. */ | ||
295 | asm volatile("pushf; lcall *lguest_entry" | ||
296 | : "=a"(clobber), "=b"(clobber) | ||
297 | : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) | ||
298 | : "memory", "%edx", "%ecx", "%edi", "%esi"); | ||
299 | } | ||
300 | |||
301 | int run_guest(struct lguest *lg, unsigned long __user *user) | ||
302 | { | ||
303 | while (!lg->dead) { | ||
304 | unsigned int cr2 = 0; /* Damn gcc */ | ||
305 | |||
306 | /* Hypercalls first: we might have been out to userspace */ | ||
307 | do_hypercalls(lg); | ||
308 | if (lg->dma_is_pending) { | ||
309 | if (put_user(lg->pending_dma, user) || | ||
310 | put_user(lg->pending_key, user+1)) | ||
311 | return -EFAULT; | ||
312 | return sizeof(unsigned long)*2; | ||
313 | } | ||
314 | |||
315 | if (signal_pending(current)) | ||
316 | return -ERESTARTSYS; | ||
317 | |||
318 | /* If Waker set break_out, return to Launcher. */ | ||
319 | if (lg->break_out) | ||
320 | return -EAGAIN; | ||
321 | |||
322 | maybe_do_interrupt(lg); | ||
323 | |||
324 | try_to_freeze(); | ||
325 | |||
326 | if (lg->dead) | ||
327 | break; | ||
328 | |||
329 | if (lg->halted) { | ||
330 | set_current_state(TASK_INTERRUPTIBLE); | ||
331 | schedule(); | ||
332 | continue; | ||
333 | } | ||
334 | |||
335 | local_irq_disable(); | ||
336 | |||
337 | /* Even if *we* don't want FPU trap, guest might... */ | ||
338 | if (lg->ts) | ||
339 | set_ts(); | ||
340 | |||
341 | /* Don't let Guest do SYSENTER: we can't handle it. */ | ||
342 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
343 | wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); | ||
344 | |||
345 | run_guest_once(lg, lguest_pages(raw_smp_processor_id())); | ||
346 | |||
347 | /* Save cr2 now if we page-faulted. */ | ||
348 | if (lg->regs->trapnum == 14) | ||
349 | cr2 = read_cr2(); | ||
350 | else if (lg->regs->trapnum == 7) | ||
351 | math_state_restore(); | ||
352 | |||
353 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
354 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | ||
355 | local_irq_enable(); | ||
356 | |||
357 | switch (lg->regs->trapnum) { | ||
358 | case 13: /* We've intercepted a GPF. */ | ||
359 | if (lg->regs->errcode == 0) { | ||
360 | if (emulate_insn(lg)) | ||
361 | continue; | ||
362 | } | ||
363 | break; | ||
364 | case 14: /* We've intercepted a page fault. */ | ||
365 | if (demand_page(lg, cr2, lg->regs->errcode)) | ||
366 | continue; | ||
367 | |||
368 | /* If lguest_data is NULL, this won't hurt. */ | ||
369 | if (put_user(cr2, &lg->lguest_data->cr2)) | ||
370 | kill_guest(lg, "Writing cr2"); | ||
371 | break; | ||
372 | case 7: /* We've intercepted a Device Not Available fault. */ | ||
373 | /* If they don't want to know, just absorb it. */ | ||
374 | if (!lg->ts) | ||
375 | continue; | ||
376 | break; | ||
377 | case 32 ... 255: /* Real interrupt, fall thru */ | ||
378 | cond_resched(); | ||
379 | case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ | ||
380 | continue; | ||
381 | } | ||
382 | |||
383 | if (deliver_trap(lg, lg->regs->trapnum)) | ||
384 | continue; | ||
385 | |||
386 | kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", | ||
387 | lg->regs->trapnum, lg->regs->eip, | ||
388 | lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); | ||
389 | } | ||
390 | return -ENOENT; | ||
391 | } | ||
392 | |||
393 | int find_free_guest(void) | ||
394 | { | ||
395 | unsigned int i; | ||
396 | for (i = 0; i < MAX_LGUEST_GUESTS; i++) | ||
397 | if (!lguests[i].tsk) | ||
398 | return i; | ||
399 | return -1; | ||
400 | } | ||
401 | |||
402 | static void adjust_pge(void *on) | ||
403 | { | ||
404 | if (on) | ||
405 | write_cr4(read_cr4() | X86_CR4_PGE); | ||
406 | else | ||
407 | write_cr4(read_cr4() & ~X86_CR4_PGE); | ||
408 | } | ||
409 | |||
410 | static int __init init(void) | ||
411 | { | ||
412 | int err; | ||
413 | |||
414 | if (paravirt_enabled()) { | ||
415 | printk("lguest is afraid of %s\n", paravirt_ops.name); | ||
416 | return -EPERM; | ||
417 | } | ||
418 | |||
419 | err = map_switcher(); | ||
420 | if (err) | ||
421 | return err; | ||
422 | |||
423 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | ||
424 | if (err) { | ||
425 | unmap_switcher(); | ||
426 | return err; | ||
427 | } | ||
428 | lguest_io_init(); | ||
429 | |||
430 | err = lguest_device_init(); | ||
431 | if (err) { | ||
432 | free_pagetables(); | ||
433 | unmap_switcher(); | ||
434 | return err; | ||
435 | } | ||
436 | lock_cpu_hotplug(); | ||
437 | if (cpu_has_pge) { /* We have a broader idea of "global". */ | ||
438 | cpu_had_pge = 1; | ||
439 | on_each_cpu(adjust_pge, (void *)0, 0, 1); | ||
440 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | ||
441 | } | ||
442 | unlock_cpu_hotplug(); | ||
443 | return 0; | ||
444 | } | ||
445 | |||
446 | static void __exit fini(void) | ||
447 | { | ||
448 | lguest_device_remove(); | ||
449 | free_pagetables(); | ||
450 | unmap_switcher(); | ||
451 | lock_cpu_hotplug(); | ||
452 | if (cpu_had_pge) { | ||
453 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | ||
454 | on_each_cpu(adjust_pge, (void *)1, 0, 1); | ||
455 | } | ||
456 | unlock_cpu_hotplug(); | ||
457 | } | ||
458 | |||
459 | module_init(init); | ||
460 | module_exit(fini); | ||
461 | MODULE_LICENSE("GPL"); | ||
462 | MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | ||
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c new file mode 100644 index 000000000000..ea52ca451f74 --- /dev/null +++ b/drivers/lguest/hypercalls.c | |||
@@ -0,0 +1,192 @@ | |||
1 | /* Actual hypercalls, which allow guests to actually do something. | ||
2 | Copyright (C) 2006 Rusty Russell IBM Corporation | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/syscalls.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <asm/page.h> | ||
22 | #include <asm/pgtable.h> | ||
23 | #include <irq_vectors.h> | ||
24 | #include "lg.h" | ||
25 | |||
26 | static void do_hcall(struct lguest *lg, struct lguest_regs *regs) | ||
27 | { | ||
28 | switch (regs->eax) { | ||
29 | case LHCALL_FLUSH_ASYNC: | ||
30 | break; | ||
31 | case LHCALL_LGUEST_INIT: | ||
32 | kill_guest(lg, "already have lguest_data"); | ||
33 | break; | ||
34 | case LHCALL_CRASH: { | ||
35 | char msg[128]; | ||
36 | lgread(lg, msg, regs->edx, sizeof(msg)); | ||
37 | msg[sizeof(msg)-1] = '\0'; | ||
38 | kill_guest(lg, "CRASH: %s", msg); | ||
39 | break; | ||
40 | } | ||
41 | case LHCALL_FLUSH_TLB: | ||
42 | if (regs->edx) | ||
43 | guest_pagetable_clear_all(lg); | ||
44 | else | ||
45 | guest_pagetable_flush_user(lg); | ||
46 | break; | ||
47 | case LHCALL_GET_WALLCLOCK: { | ||
48 | struct timespec ts; | ||
49 | ktime_get_real_ts(&ts); | ||
50 | regs->eax = ts.tv_sec; | ||
51 | break; | ||
52 | } | ||
53 | case LHCALL_BIND_DMA: | ||
54 | regs->eax = bind_dma(lg, regs->edx, regs->ebx, | ||
55 | regs->ecx >> 8, regs->ecx & 0xFF); | ||
56 | break; | ||
57 | case LHCALL_SEND_DMA: | ||
58 | send_dma(lg, regs->edx, regs->ebx); | ||
59 | break; | ||
60 | case LHCALL_LOAD_GDT: | ||
61 | load_guest_gdt(lg, regs->edx, regs->ebx); | ||
62 | break; | ||
63 | case LHCALL_LOAD_IDT_ENTRY: | ||
64 | load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); | ||
65 | break; | ||
66 | case LHCALL_NEW_PGTABLE: | ||
67 | guest_new_pagetable(lg, regs->edx); | ||
68 | break; | ||
69 | case LHCALL_SET_STACK: | ||
70 | guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); | ||
71 | break; | ||
72 | case LHCALL_SET_PTE: | ||
73 | guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); | ||
74 | break; | ||
75 | case LHCALL_SET_PMD: | ||
76 | guest_set_pmd(lg, regs->edx, regs->ebx); | ||
77 | break; | ||
78 | case LHCALL_LOAD_TLS: | ||
79 | guest_load_tls(lg, regs->edx); | ||
80 | break; | ||
81 | case LHCALL_SET_CLOCKEVENT: | ||
82 | guest_set_clockevent(lg, regs->edx); | ||
83 | break; | ||
84 | case LHCALL_TS: | ||
85 | lg->ts = regs->edx; | ||
86 | break; | ||
87 | case LHCALL_HALT: | ||
88 | lg->halted = 1; | ||
89 | break; | ||
90 | default: | ||
91 | kill_guest(lg, "Bad hypercall %li\n", regs->eax); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | /* We always do queued calls before actual hypercall. */ | ||
96 | static void do_async_hcalls(struct lguest *lg) | ||
97 | { | ||
98 | unsigned int i; | ||
99 | u8 st[LHCALL_RING_SIZE]; | ||
100 | |||
101 | if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) | ||
102 | return; | ||
103 | |||
104 | for (i = 0; i < ARRAY_SIZE(st); i++) { | ||
105 | struct lguest_regs regs; | ||
106 | unsigned int n = lg->next_hcall; | ||
107 | |||
108 | if (st[n] == 0xFF) | ||
109 | break; | ||
110 | |||
111 | if (++lg->next_hcall == LHCALL_RING_SIZE) | ||
112 | lg->next_hcall = 0; | ||
113 | |||
114 | if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) | ||
115 | || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) | ||
116 | || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) | ||
117 | || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) { | ||
118 | kill_guest(lg, "Fetching async hypercalls"); | ||
119 | break; | ||
120 | } | ||
121 | |||
122 | do_hcall(lg, ®s); | ||
123 | if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { | ||
124 | kill_guest(lg, "Writing result for async hypercall"); | ||
125 | break; | ||
126 | } | ||
127 | |||
128 | if (lg->dma_is_pending) | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void initialize(struct lguest *lg) | ||
134 | { | ||
135 | u32 tsc_speed; | ||
136 | |||
137 | if (lg->regs->eax != LHCALL_LGUEST_INIT) { | ||
138 | kill_guest(lg, "hypercall %li before LGUEST_INIT", | ||
139 | lg->regs->eax); | ||
140 | return; | ||
141 | } | ||
142 | |||
143 | /* We only tell the guest to use the TSC if it's reliable. */ | ||
144 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) | ||
145 | tsc_speed = tsc_khz; | ||
146 | else | ||
147 | tsc_speed = 0; | ||
148 | |||
149 | lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; | ||
150 | /* We check here so we can simply copy_to_user/from_user */ | ||
151 | if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { | ||
152 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | ||
153 | return; | ||
154 | } | ||
155 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) | ||
156 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) | ||
157 | /* We reserve the top pgd entry. */ | ||
158 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | ||
159 | || put_user(tsc_speed, &lg->lguest_data->tsc_khz) | ||
160 | || put_user(lg->guestid, &lg->lguest_data->guestid)) | ||
161 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | ||
162 | |||
163 | /* This is the one case where the above accesses might have | ||
164 | * been the first write to a Guest page. This may have caused | ||
165 | * a copy-on-write fault, but the Guest might be referring to | ||
166 | * the old (read-only) page. */ | ||
167 | guest_pagetable_clear_all(lg); | ||
168 | } | ||
169 | |||
170 | /* Even if we go out to userspace and come back, we don't want to do | ||
171 | * the hypercall again. */ | ||
172 | static void clear_hcall(struct lguest *lg) | ||
173 | { | ||
174 | lg->regs->trapnum = 255; | ||
175 | } | ||
176 | |||
177 | void do_hypercalls(struct lguest *lg) | ||
178 | { | ||
179 | if (unlikely(!lg->lguest_data)) { | ||
180 | if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | ||
181 | initialize(lg); | ||
182 | clear_hcall(lg); | ||
183 | } | ||
184 | return; | ||
185 | } | ||
186 | |||
187 | do_async_hcalls(lg); | ||
188 | if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | ||
189 | do_hcall(lg, lg->regs); | ||
190 | clear_hcall(lg); | ||
191 | } | ||
192 | } | ||
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c new file mode 100644 index 000000000000..d9de5bbc613f --- /dev/null +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -0,0 +1,268 @@ | |||
1 | #include <linux/uaccess.h> | ||
2 | #include "lg.h" | ||
3 | |||
/* Handler address stored in an IDT entry: its low 16 bits are the low half
 * of @lo, its high 16 bits are the high half of @hi. */
static unsigned long idt_address(u32 lo, u32 hi)
{
	u32 low_half = lo & 0x0000FFFF;
	u32 high_half = hi & 0xFFFF0000;

	return low_half | high_half;
}
8 | |||
/* Gate type of an IDT entry: bits 8-11 of @hi.  @lo is not used. */
static int idt_type(u32 lo, u32 hi)
{
	u32 type_bits = hi >> 8;

	return type_bits & 0xF;
}
13 | |||
/* Non-zero (in fact 0x8000) if bit 15 of @hi is set.  @lo is not used. */
static int idt_present(u32 lo, u32 hi)
{
	const u32 present_bit = 0x8000;

	return hi & present_bit;
}
18 | |||
19 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | ||
20 | { | ||
21 | *gstack -= 4; | ||
22 | lgwrite_u32(lg, *gstack, val); | ||
23 | } | ||
24 | |||
25 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | ||
26 | { | ||
27 | unsigned long gstack; | ||
28 | u32 eflags, ss, irq_enable; | ||
29 | |||
30 | /* If they want a ring change, we use new stack and push old ss/esp */ | ||
31 | if ((lg->regs->ss&0x3) != GUEST_PL) { | ||
32 | gstack = guest_pa(lg, lg->esp1); | ||
33 | ss = lg->ss1; | ||
34 | push_guest_stack(lg, &gstack, lg->regs->ss); | ||
35 | push_guest_stack(lg, &gstack, lg->regs->esp); | ||
36 | } else { | ||
37 | gstack = guest_pa(lg, lg->regs->esp); | ||
38 | ss = lg->regs->ss; | ||
39 | } | ||
40 | |||
41 | /* We use IF bit in eflags to indicate whether irqs were disabled | ||
42 | (it's always 0, since irqs are enabled when guest is running). */ | ||
43 | eflags = lg->regs->eflags; | ||
44 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled)) | ||
45 | irq_enable = 0; | ||
46 | eflags |= (irq_enable & X86_EFLAGS_IF); | ||
47 | |||
48 | push_guest_stack(lg, &gstack, eflags); | ||
49 | push_guest_stack(lg, &gstack, lg->regs->cs); | ||
50 | push_guest_stack(lg, &gstack, lg->regs->eip); | ||
51 | |||
52 | if (has_err) | ||
53 | push_guest_stack(lg, &gstack, lg->regs->errcode); | ||
54 | |||
55 | /* Change the real stack so switcher returns to trap handler */ | ||
56 | lg->regs->ss = ss; | ||
57 | lg->regs->esp = gstack + lg->page_offset; | ||
58 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); | ||
59 | lg->regs->eip = idt_address(lo, hi); | ||
60 | |||
61 | /* Disable interrupts for an interrupt gate. */ | ||
62 | if (idt_type(lo, hi) == 0xE) | ||
63 | if (put_user(0, &lg->lguest_data->irq_enabled)) | ||
64 | kill_guest(lg, "Disabling interrupts"); | ||
65 | } | ||
66 | |||
67 | void maybe_do_interrupt(struct lguest *lg) | ||
68 | { | ||
69 | unsigned int irq; | ||
70 | DECLARE_BITMAP(blk, LGUEST_IRQS); | ||
71 | struct desc_struct *idt; | ||
72 | |||
73 | if (!lg->lguest_data) | ||
74 | return; | ||
75 | |||
76 | /* Mask out any interrupts they have blocked. */ | ||
77 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, | ||
78 | sizeof(blk))) | ||
79 | return; | ||
80 | |||
81 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); | ||
82 | |||
83 | irq = find_first_bit(blk, LGUEST_IRQS); | ||
84 | if (irq >= LGUEST_IRQS) | ||
85 | return; | ||
86 | |||
87 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) | ||
88 | return; | ||
89 | |||
90 | /* If they're halted, we re-enable interrupts. */ | ||
91 | if (lg->halted) { | ||
92 | /* Re-enable interrupts. */ | ||
93 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) | ||
94 | kill_guest(lg, "Re-enabling interrupts"); | ||
95 | lg->halted = 0; | ||
96 | } else { | ||
97 | /* Maybe they have interrupts disabled? */ | ||
98 | u32 irq_enabled; | ||
99 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) | ||
100 | irq_enabled = 0; | ||
101 | if (!irq_enabled) | ||
102 | return; | ||
103 | } | ||
104 | |||
105 | idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; | ||
106 | if (idt_present(idt->a, idt->b)) { | ||
107 | clear_bit(irq, lg->irqs_pending); | ||
108 | set_guest_interrupt(lg, idt->a, idt->b, 0); | ||
109 | } | ||
110 | } | ||
111 | |||
/* Returns 1 for the trap numbers which get an error code pushed (8, 10-14
 * and 17), 0 for all others. */
static int has_err(unsigned int trap)
{
	switch (trap) {
	case 8:
	case 10:
	case 11:
	case 12:
	case 13:
	case 14:
	case 17:
		return 1;
	default:
		return 0;
	}
}
116 | |||
117 | int deliver_trap(struct lguest *lg, unsigned int num) | ||
118 | { | ||
119 | u32 lo = lg->idt[num].a, hi = lg->idt[num].b; | ||
120 | |||
121 | if (!idt_present(lo, hi)) | ||
122 | return 0; | ||
123 | set_guest_interrupt(lg, lo, hi, has_err(num)); | ||
124 | return 1; | ||
125 | } | ||
126 | |||
127 | static int direct_trap(const struct lguest *lg, | ||
128 | const struct desc_struct *trap, | ||
129 | unsigned int num) | ||
130 | { | ||
131 | /* Hardware interrupts don't go to guest (except syscall). */ | ||
132 | if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) | ||
133 | return 0; | ||
134 | |||
135 | /* We intercept page fault (demand shadow paging & cr2 saving) | ||
136 | protection fault (in/out emulation) and device not | ||
137 | available (TS handling), and hypercall */ | ||
138 | if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) | ||
139 | return 0; | ||
140 | |||
141 | /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ | ||
142 | return idt_type(trap->a, trap->b) == 0xF; | ||
143 | } | ||
144 | |||
145 | void pin_stack_pages(struct lguest *lg) | ||
146 | { | ||
147 | unsigned int i; | ||
148 | |||
149 | for (i = 0; i < lg->stack_pages; i++) | ||
150 | pin_page(lg, lg->esp1 - i * PAGE_SIZE); | ||
151 | } | ||
152 | |||
153 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) | ||
154 | { | ||
155 | /* You cannot have a stack segment with priv level 0. */ | ||
156 | if ((seg & 0x3) != GUEST_PL) | ||
157 | kill_guest(lg, "bad stack segment %i", seg); | ||
158 | if (pages > 2) | ||
159 | kill_guest(lg, "bad stack pages %u", pages); | ||
160 | lg->ss1 = seg; | ||
161 | lg->esp1 = esp; | ||
162 | lg->stack_pages = pages; | ||
163 | pin_stack_pages(lg); | ||
164 | } | ||
165 | |||
166 | /* Set up trap in IDT. */ | ||
167 | static void set_trap(struct lguest *lg, struct desc_struct *trap, | ||
168 | unsigned int num, u32 lo, u32 hi) | ||
169 | { | ||
170 | u8 type = idt_type(lo, hi); | ||
171 | |||
172 | if (!idt_present(lo, hi)) { | ||
173 | trap->a = trap->b = 0; | ||
174 | return; | ||
175 | } | ||
176 | |||
177 | if (type != 0xE && type != 0xF) | ||
178 | kill_guest(lg, "bad IDT type %i", type); | ||
179 | |||
180 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); | ||
181 | trap->b = (hi&0xFFFFEF00); | ||
182 | } | ||
183 | |||
184 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) | ||
185 | { | ||
186 | /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */ | ||
187 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) | ||
188 | return; | ||
189 | |||
190 | lg->changed |= CHANGED_IDT; | ||
191 | if (num < ARRAY_SIZE(lg->idt)) | ||
192 | set_trap(lg, &lg->idt[num], num, lo, hi); | ||
193 | else if (num == SYSCALL_VECTOR) | ||
194 | set_trap(lg, &lg->syscall_idt, num, lo, hi); | ||
195 | } | ||
196 | |||
197 | static void default_idt_entry(struct desc_struct *idt, | ||
198 | int trap, | ||
199 | const unsigned long handler) | ||
200 | { | ||
201 | u32 flags = 0x8e00; | ||
202 | |||
203 | /* They can't "int" into any of them except hypercall. */ | ||
204 | if (trap == LGUEST_TRAP_ENTRY) | ||
205 | flags |= (GUEST_PL << 13); | ||
206 | |||
207 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); | ||
208 | idt->b = (handler&0xFFFF0000) | flags; | ||
209 | } | ||
210 | |||
211 | void setup_default_idt_entries(struct lguest_ro_state *state, | ||
212 | const unsigned long *def) | ||
213 | { | ||
214 | unsigned int i; | ||
215 | |||
216 | for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) | ||
217 | default_idt_entry(&state->guest_idt[i], i, def[i]); | ||
218 | } | ||
219 | |||
220 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | ||
221 | const unsigned long *def) | ||
222 | { | ||
223 | unsigned int i; | ||
224 | |||
225 | /* All hardware interrupts are same whatever the guest: only the | ||
226 | * traps might be different. */ | ||
227 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { | ||
228 | if (direct_trap(lg, &lg->idt[i], i)) | ||
229 | idt[i] = lg->idt[i]; | ||
230 | else | ||
231 | default_idt_entry(&idt[i], i, def[i]); | ||
232 | } | ||
233 | i = SYSCALL_VECTOR; | ||
234 | if (direct_trap(lg, &lg->syscall_idt, i)) | ||
235 | idt[i] = lg->syscall_idt; | ||
236 | else | ||
237 | default_idt_entry(&idt[i], i, def[i]); | ||
238 | } | ||
239 | |||
240 | void guest_set_clockevent(struct lguest *lg, unsigned long delta) | ||
241 | { | ||
242 | ktime_t expires; | ||
243 | |||
244 | if (unlikely(delta == 0)) { | ||
245 | /* Clock event device is shutting down. */ | ||
246 | hrtimer_cancel(&lg->hrt); | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | expires = ktime_add_ns(ktime_get_real(), delta); | ||
251 | hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); | ||
252 | } | ||
253 | |||
254 | static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) | ||
255 | { | ||
256 | struct lguest *lg = container_of(timer, struct lguest, hrt); | ||
257 | |||
258 | set_bit(0, lg->irqs_pending); | ||
259 | if (lg->halted) | ||
260 | wake_up_process(lg->tsk); | ||
261 | return HRTIMER_NORESTART; | ||
262 | } | ||
263 | |||
264 | void init_clockdev(struct lguest *lg) | ||
265 | { | ||
266 | hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); | ||
267 | lg->hrt.function = clockdev_fn; | ||
268 | } | ||
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c new file mode 100644 index 000000000000..06bdba2337ef --- /dev/null +++ b/drivers/lguest/io.c | |||
@@ -0,0 +1,399 @@ | |||
1 | /* Simple I/O model for guests, based on shared memory. | ||
2 | * Copyright (C) 2006 Rusty Russell IBM Corporation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/futex.h> | ||
20 | #include <linux/jhash.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/highmem.h> | ||
23 | #include <linux/uaccess.h> | ||
24 | #include "lg.h" | ||
25 | |||
26 | static struct list_head dma_hash[61]; | ||
27 | |||
28 | void lguest_io_init(void) | ||
29 | { | ||
30 | unsigned int i; | ||
31 | |||
32 | for (i = 0; i < ARRAY_SIZE(dma_hash); i++) | ||
33 | INIT_LIST_HEAD(&dma_hash[i]); | ||
34 | } | ||
35 | |||
36 | /* FIXME: allow multi-page lengths. */ | ||
37 | static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) | ||
38 | { | ||
39 | unsigned int i; | ||
40 | |||
41 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | ||
42 | if (!dma->len[i]) | ||
43 | return 1; | ||
44 | if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) | ||
45 | goto kill; | ||
46 | if (dma->len[i] > PAGE_SIZE) | ||
47 | goto kill; | ||
48 | /* We could do over a page, but is it worth it? */ | ||
49 | if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) | ||
50 | goto kill; | ||
51 | } | ||
52 | return 1; | ||
53 | |||
54 | kill: | ||
55 | kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]); | ||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | static unsigned int hash(const union futex_key *key) | ||
60 | { | ||
61 | return jhash2((u32*)&key->both.word, | ||
62 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, | ||
63 | key->both.offset) | ||
64 | % ARRAY_SIZE(dma_hash); | ||
65 | } | ||
66 | |||
67 | static inline int key_eq(const union futex_key *a, const union futex_key *b) | ||
68 | { | ||
69 | return (a->both.word == b->both.word | ||
70 | && a->both.ptr == b->both.ptr | ||
71 | && a->both.offset == b->both.offset); | ||
72 | } | ||
73 | |||
74 | /* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ | ||
75 | static void unlink_dma(struct lguest_dma_info *dmainfo) | ||
76 | { | ||
77 | BUG_ON(!mutex_is_locked(&lguest_lock)); | ||
78 | dmainfo->interrupt = 0; | ||
79 | list_del(&dmainfo->list); | ||
80 | drop_futex_key_refs(&dmainfo->key); | ||
81 | } | ||
82 | |||
83 | static int unbind_dma(struct lguest *lg, | ||
84 | const union futex_key *key, | ||
85 | unsigned long dmas) | ||
86 | { | ||
87 | int i, ret = 0; | ||
88 | |||
89 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | ||
90 | if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { | ||
91 | unlink_dma(&lg->dma[i]); | ||
92 | ret = 1; | ||
93 | break; | ||
94 | } | ||
95 | } | ||
96 | return ret; | ||
97 | } | ||
98 | |||
99 | int bind_dma(struct lguest *lg, | ||
100 | unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) | ||
101 | { | ||
102 | unsigned int i; | ||
103 | int ret = 0; | ||
104 | union futex_key key; | ||
105 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | ||
106 | |||
107 | if (interrupt >= LGUEST_IRQS) | ||
108 | return 0; | ||
109 | |||
110 | mutex_lock(&lguest_lock); | ||
111 | down_read(fshared); | ||
112 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | ||
113 | kill_guest(lg, "bad dma key %#lx", ukey); | ||
114 | goto unlock; | ||
115 | } | ||
116 | get_futex_key_refs(&key); | ||
117 | |||
118 | if (interrupt == 0) | ||
119 | ret = unbind_dma(lg, &key, dmas); | ||
120 | else { | ||
121 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | ||
122 | if (lg->dma[i].interrupt) | ||
123 | continue; | ||
124 | |||
125 | lg->dma[i].dmas = dmas; | ||
126 | lg->dma[i].num_dmas = numdmas; | ||
127 | lg->dma[i].next_dma = 0; | ||
128 | lg->dma[i].key = key; | ||
129 | lg->dma[i].guestid = lg->guestid; | ||
130 | lg->dma[i].interrupt = interrupt; | ||
131 | list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); | ||
132 | ret = 1; | ||
133 | goto unlock; | ||
134 | } | ||
135 | } | ||
136 | drop_futex_key_refs(&key); | ||
137 | unlock: | ||
138 | up_read(fshared); | ||
139 | mutex_unlock(&lguest_lock); | ||
140 | return ret; | ||
141 | } | ||
142 | |||
143 | /* lgread from another guest */ | ||
144 | static int lgread_other(struct lguest *lg, | ||
145 | void *buf, u32 addr, unsigned bytes) | ||
146 | { | ||
147 | if (!lguest_address_ok(lg, addr, bytes) | ||
148 | || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { | ||
149 | memset(buf, 0, bytes); | ||
150 | kill_guest(lg, "bad address in registered DMA struct"); | ||
151 | return 0; | ||
152 | } | ||
153 | return 1; | ||
154 | } | ||
155 | |||
156 | /* lgwrite to another guest */ | ||
157 | static int lgwrite_other(struct lguest *lg, u32 addr, | ||
158 | const void *buf, unsigned bytes) | ||
159 | { | ||
160 | if (!lguest_address_ok(lg, addr, bytes) | ||
161 | || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) | ||
162 | != bytes)) { | ||
163 | kill_guest(lg, "bad address writing to registered DMA"); | ||
164 | return 0; | ||
165 | } | ||
166 | return 1; | ||
167 | } | ||
168 | |||
169 | static u32 copy_data(struct lguest *srclg, | ||
170 | const struct lguest_dma *src, | ||
171 | const struct lguest_dma *dst, | ||
172 | struct page *pages[]) | ||
173 | { | ||
174 | unsigned int totlen, si, di, srcoff, dstoff; | ||
175 | void *maddr = NULL; | ||
176 | |||
177 | totlen = 0; | ||
178 | si = di = 0; | ||
179 | srcoff = dstoff = 0; | ||
180 | while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] | ||
181 | && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { | ||
182 | u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); | ||
183 | |||
184 | if (!maddr) | ||
185 | maddr = kmap(pages[di]); | ||
186 | |||
187 | /* FIXME: This is not completely portable, since | ||
188 | archs do different things for copy_to_user_page. */ | ||
189 | if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, | ||
190 | (void *__user)src->addr[si], len) != 0) { | ||
191 | kill_guest(srclg, "bad address in sending DMA"); | ||
192 | totlen = 0; | ||
193 | break; | ||
194 | } | ||
195 | |||
196 | totlen += len; | ||
197 | srcoff += len; | ||
198 | dstoff += len; | ||
199 | if (srcoff == src->len[si]) { | ||
200 | si++; | ||
201 | srcoff = 0; | ||
202 | } | ||
203 | if (dstoff == dst->len[di]) { | ||
204 | kunmap(pages[di]); | ||
205 | maddr = NULL; | ||
206 | di++; | ||
207 | dstoff = 0; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | if (maddr) | ||
212 | kunmap(pages[di]); | ||
213 | |||
214 | return totlen; | ||
215 | } | ||
216 | |||
217 | /* Src is us, ie. current. */ | ||
218 | static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, | ||
219 | struct lguest *dstlg, const struct lguest_dma *dst) | ||
220 | { | ||
221 | int i; | ||
222 | u32 ret; | ||
223 | struct page *pages[LGUEST_MAX_DMA_SECTIONS]; | ||
224 | |||
225 | if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) | ||
226 | return 0; | ||
227 | |||
228 | /* First get the destination pages */ | ||
229 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | ||
230 | if (dst->len[i] == 0) | ||
231 | break; | ||
232 | if (get_user_pages(dstlg->tsk, dstlg->mm, | ||
233 | dst->addr[i], 1, 1, 1, pages+i, NULL) | ||
234 | != 1) { | ||
235 | kill_guest(dstlg, "Error mapping DMA pages"); | ||
236 | ret = 0; | ||
237 | goto drop_pages; | ||
238 | } | ||
239 | } | ||
240 | |||
241 | /* Now copy until we run out of src or dst. */ | ||
242 | ret = copy_data(srclg, src, dst, pages); | ||
243 | |||
244 | drop_pages: | ||
245 | while (--i >= 0) | ||
246 | put_page(pages[i]); | ||
247 | return ret; | ||
248 | } | ||
249 | |||
250 | static int dma_transfer(struct lguest *srclg, | ||
251 | unsigned long udma, | ||
252 | struct lguest_dma_info *dst) | ||
253 | { | ||
254 | struct lguest_dma dst_dma, src_dma; | ||
255 | struct lguest *dstlg; | ||
256 | u32 i, dma = 0; | ||
257 | |||
258 | dstlg = &lguests[dst->guestid]; | ||
259 | /* Get our dma list. */ | ||
260 | lgread(srclg, &src_dma, udma, sizeof(src_dma)); | ||
261 | |||
262 | /* We can't deadlock against them dmaing to us, because this | ||
263 | * is all under the lguest_lock. */ | ||
264 | down_read(&dstlg->mm->mmap_sem); | ||
265 | |||
266 | for (i = 0; i < dst->num_dmas; i++) { | ||
267 | dma = (dst->next_dma + i) % dst->num_dmas; | ||
268 | if (!lgread_other(dstlg, &dst_dma, | ||
269 | dst->dmas + dma * sizeof(struct lguest_dma), | ||
270 | sizeof(dst_dma))) { | ||
271 | goto fail; | ||
272 | } | ||
273 | if (!dst_dma.used_len) | ||
274 | break; | ||
275 | } | ||
276 | if (i != dst->num_dmas) { | ||
277 | unsigned long used_lenp; | ||
278 | unsigned int ret; | ||
279 | |||
280 | ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); | ||
281 | /* Put used length in src. */ | ||
282 | lgwrite_u32(srclg, | ||
283 | udma+offsetof(struct lguest_dma, used_len), ret); | ||
284 | if (ret == 0 && src_dma.len[0] != 0) | ||
285 | goto fail; | ||
286 | |||
287 | /* Make sure destination sees contents before length. */ | ||
288 | wmb(); | ||
289 | used_lenp = dst->dmas | ||
290 | + dma * sizeof(struct lguest_dma) | ||
291 | + offsetof(struct lguest_dma, used_len); | ||
292 | lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); | ||
293 | dst->next_dma++; | ||
294 | } | ||
295 | up_read(&dstlg->mm->mmap_sem); | ||
296 | |||
297 | /* Do this last so dst doesn't simply sleep on lock. */ | ||
298 | set_bit(dst->interrupt, dstlg->irqs_pending); | ||
299 | wake_up_process(dstlg->tsk); | ||
300 | return i == dst->num_dmas; | ||
301 | |||
302 | fail: | ||
303 | up_read(&dstlg->mm->mmap_sem); | ||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) | ||
308 | { | ||
309 | union futex_key key; | ||
310 | int empty = 0; | ||
311 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | ||
312 | |||
313 | again: | ||
314 | mutex_lock(&lguest_lock); | ||
315 | down_read(fshared); | ||
316 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | ||
317 | kill_guest(lg, "bad sending DMA key"); | ||
318 | goto unlock; | ||
319 | } | ||
320 | /* Shared mapping? Look for other guests... */ | ||
321 | if (key.shared.offset & 1) { | ||
322 | struct lguest_dma_info *i; | ||
323 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | ||
324 | if (i->guestid == lg->guestid) | ||
325 | continue; | ||
326 | if (!key_eq(&key, &i->key)) | ||
327 | continue; | ||
328 | |||
329 | empty += dma_transfer(lg, udma, i); | ||
330 | break; | ||
331 | } | ||
332 | if (empty == 1) { | ||
333 | /* Give any recipients one chance to restock. */ | ||
334 | up_read(¤t->mm->mmap_sem); | ||
335 | mutex_unlock(&lguest_lock); | ||
336 | empty++; | ||
337 | goto again; | ||
338 | } | ||
339 | } else { | ||
340 | /* Private mapping: tell our userspace. */ | ||
341 | lg->dma_is_pending = 1; | ||
342 | lg->pending_dma = udma; | ||
343 | lg->pending_key = ukey; | ||
344 | } | ||
345 | unlock: | ||
346 | up_read(fshared); | ||
347 | mutex_unlock(&lguest_lock); | ||
348 | } | ||
349 | |||
350 | void release_all_dma(struct lguest *lg) | ||
351 | { | ||
352 | unsigned int i; | ||
353 | |||
354 | BUG_ON(!mutex_is_locked(&lguest_lock)); | ||
355 | |||
356 | down_read(&lg->mm->mmap_sem); | ||
357 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | ||
358 | if (lg->dma[i].interrupt) | ||
359 | unlink_dma(&lg->dma[i]); | ||
360 | } | ||
361 | up_read(&lg->mm->mmap_sem); | ||
362 | } | ||
363 | |||
364 | /* Userspace wants a dma buffer from this guest. */ | ||
365 | unsigned long get_dma_buffer(struct lguest *lg, | ||
366 | unsigned long ukey, unsigned long *interrupt) | ||
367 | { | ||
368 | unsigned long ret = 0; | ||
369 | union futex_key key; | ||
370 | struct lguest_dma_info *i; | ||
371 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | ||
372 | |||
373 | mutex_lock(&lguest_lock); | ||
374 | down_read(fshared); | ||
375 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | ||
376 | kill_guest(lg, "bad registered DMA buffer"); | ||
377 | goto unlock; | ||
378 | } | ||
379 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | ||
380 | if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { | ||
381 | unsigned int j; | ||
382 | for (j = 0; j < i->num_dmas; j++) { | ||
383 | struct lguest_dma dma; | ||
384 | |||
385 | ret = i->dmas + j * sizeof(struct lguest_dma); | ||
386 | lgread(lg, &dma, ret, sizeof(dma)); | ||
387 | if (dma.used_len == 0) | ||
388 | break; | ||
389 | } | ||
390 | *interrupt = i->interrupt; | ||
391 | break; | ||
392 | } | ||
393 | } | ||
394 | unlock: | ||
395 | up_read(fshared); | ||
396 | mutex_unlock(&lguest_lock); | ||
397 | return ret; | ||
398 | } | ||
399 | |||
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h new file mode 100644 index 000000000000..3e2ddfbc816e --- /dev/null +++ b/drivers/lguest/lg.h | |||
@@ -0,0 +1,261 @@ | |||
1 | #ifndef _LGUEST_H | ||
2 | #define _LGUEST_H | ||
3 | |||
4 | #include <asm/desc.h> | ||
5 | |||
6 | #define GDT_ENTRY_LGUEST_CS 10 | ||
7 | #define GDT_ENTRY_LGUEST_DS 11 | ||
8 | #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) | ||
9 | #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) | ||
10 | |||
11 | #ifndef __ASSEMBLY__ | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/stringify.h> | ||
15 | #include <linux/binfmts.h> | ||
16 | #include <linux/futex.h> | ||
17 | #include <linux/lguest.h> | ||
18 | #include <linux/lguest_launcher.h> | ||
19 | #include <linux/wait.h> | ||
20 | #include <linux/err.h> | ||
21 | #include <asm/semaphore.h> | ||
22 | #include "irq_vectors.h" | ||
23 | |||
24 | #define GUEST_PL 1 | ||
25 | |||
/* Saved guest register frame.
 * NOTE(review): field order presumably has to match the save/restore
 * sequence in switcher.S -- do not reorder without checking. */
struct lguest_regs
{
	/* Registers saved by hand. */
	unsigned long ebx, ecx, edx;
	unsigned long esi, edi, ebp;
	unsigned long gs;
	unsigned long eax;
	unsigned long fs, ds, es;
	unsigned long trapnum, errcode;
	/* Frame pushed by the trap itself. */
	unsigned long eip;
	unsigned long cs;
	unsigned long eflags;
	unsigned long esp;
	unsigned long ss;
};
42 | |||
43 | void free_pagetables(void); | ||
44 | int init_pagetables(struct page **switcher_page, unsigned int pages); | ||
45 | |||
46 | /* Full 4G segment descriptors, suitable for CS and DS. */ | ||
47 | #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) | ||
48 | #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) | ||
49 | |||
50 | struct lguest_dma_info | ||
51 | { | ||
52 | struct list_head list; | ||
53 | union futex_key key; | ||
54 | unsigned long dmas; | ||
55 | u16 next_dma; | ||
56 | u16 num_dmas; | ||
57 | u16 guestid; | ||
58 | u8 interrupt; /* 0 when not registered */ | ||
59 | }; | ||
60 | |||
/* Distinct types for the guest's own page table entries (gpte_t, gpgd_t) and
 * for the shadow entries (spte_t, spgd_t).  The host might be using
 * three-level pagetables while guest and shadow tables do not, so the normal
 * pte_t/pgd_t cannot be used.  Each is a 12-bit flags field plus a 20-bit
 * page frame number, also accessible whole as raw.val. */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spgd_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spte_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpgd_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpte_t;
80 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) | ||
81 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) | ||
82 | |||
83 | struct pgdir | ||
84 | { | ||
85 | unsigned long cr3; | ||
86 | spgd_t *pgdir; | ||
87 | }; | ||
88 | |||
89 | /* This is a guest-specific page (mapped ro) into the guest. */ | ||
90 | struct lguest_ro_state | ||
91 | { | ||
92 | /* Host information we need to restore when we switch back. */ | ||
93 | u32 host_cr3; | ||
94 | struct Xgt_desc_struct host_idt_desc; | ||
95 | struct Xgt_desc_struct host_gdt_desc; | ||
96 | u32 host_sp; | ||
97 | |||
98 | /* Fields which are used when guest is running. */ | ||
99 | struct Xgt_desc_struct guest_idt_desc; | ||
100 | struct Xgt_desc_struct guest_gdt_desc; | ||
101 | struct i386_hw_tss guest_tss; | ||
102 | struct desc_struct guest_idt[IDT_ENTRIES]; | ||
103 | struct desc_struct guest_gdt[GDT_ENTRIES]; | ||
104 | }; | ||
105 | |||
106 | /* We have two pages shared with guests, per cpu. */ | ||
107 | struct lguest_pages | ||
108 | { | ||
109 | /* This is the stack page mapped rw in guest */ | ||
110 | char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; | ||
111 | struct lguest_regs regs; | ||
112 | |||
113 | /* This is the host state & guest descriptor page, ro in guest */ | ||
114 | struct lguest_ro_state state; | ||
115 | } __attribute__((aligned(PAGE_SIZE))); | ||
116 | |||
117 | #define CHANGED_IDT 1 | ||
118 | #define CHANGED_GDT 2 | ||
119 | #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ | ||
120 | #define CHANGED_ALL 3 | ||
121 | |||
122 | /* The private info the thread maintains about the guest. */ | ||
123 | struct lguest | ||
124 | { | ||
125 | /* At end of a page shared mapped over lguest_pages in guest. */ | ||
126 | unsigned long regs_page; | ||
127 | struct lguest_regs *regs; | ||
128 | struct lguest_data __user *lguest_data; | ||
129 | struct task_struct *tsk; | ||
130 | struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ | ||
131 | u16 guestid; | ||
132 | u32 pfn_limit; | ||
133 | u32 page_offset; | ||
134 | u32 cr2; | ||
135 | int halted; | ||
136 | int ts; | ||
137 | u32 next_hcall; | ||
138 | u32 esp1; | ||
139 | u8 ss1; | ||
140 | |||
141 | /* Do we need to stop what we're doing and return to userspace? */ | ||
142 | int break_out; | ||
143 | wait_queue_head_t break_wq; | ||
144 | |||
145 | /* Bitmap of what has changed: see CHANGED_* above. */ | ||
146 | int changed; | ||
147 | struct lguest_pages *last_pages; | ||
148 | |||
149 | /* We keep a small number of these. */ | ||
150 | u32 pgdidx; | ||
151 | struct pgdir pgdirs[4]; | ||
152 | |||
153 | /* Cached wakeup: we hold a reference to this task. */ | ||
154 | struct task_struct *wake; | ||
155 | |||
156 | unsigned long noirq_start, noirq_end; | ||
157 | int dma_is_pending; | ||
158 | unsigned long pending_dma; /* struct lguest_dma */ | ||
159 | unsigned long pending_key; /* address they're sending to */ | ||
160 | |||
161 | unsigned int stack_pages; | ||
162 | u32 tsc_khz; | ||
163 | |||
164 | struct lguest_dma_info dma[LGUEST_MAX_DMA]; | ||
165 | |||
166 | /* Dead? */ | ||
167 | const char *dead; | ||
168 | |||
169 | /* The GDT entries copied into lguest_ro_state when running. */ | ||
170 | struct desc_struct gdt[GDT_ENTRIES]; | ||
171 | |||
172 | /* The IDT entries: some copied into lguest_ro_state when running. */ | ||
173 | struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; | ||
174 | struct desc_struct syscall_idt; | ||
175 | |||
176 | /* Virtual clock device */ | ||
177 | struct hrtimer hrt; | ||
178 | |||
179 | /* Pending virtual interrupts */ | ||
180 | DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); | ||
181 | }; | ||
182 | |||
183 | extern struct lguest lguests[]; | ||
184 | extern struct mutex lguest_lock; | ||
185 | |||
186 | /* core.c: */ | ||
187 | u32 lgread_u32(struct lguest *lg, unsigned long addr); | ||
188 | void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); | ||
189 | void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); | ||
190 | void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); | ||
191 | int find_free_guest(void); | ||
192 | int lguest_address_ok(const struct lguest *lg, | ||
193 | unsigned long addr, unsigned long len); | ||
194 | int run_guest(struct lguest *lg, unsigned long __user *user); | ||
195 | |||
196 | |||
197 | /* interrupts_and_traps.c: */ | ||
198 | void maybe_do_interrupt(struct lguest *lg); | ||
199 | int deliver_trap(struct lguest *lg, unsigned int num); | ||
200 | void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); | ||
201 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); | ||
202 | void pin_stack_pages(struct lguest *lg); | ||
203 | void setup_default_idt_entries(struct lguest_ro_state *state, | ||
204 | const unsigned long *def); | ||
205 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | ||
206 | const unsigned long *def); | ||
207 | void guest_set_clockevent(struct lguest *lg, unsigned long delta); | ||
208 | void init_clockdev(struct lguest *lg); | ||
209 | |||
210 | /* segments.c: */ | ||
211 | void setup_default_gdt_entries(struct lguest_ro_state *state); | ||
212 | void setup_guest_gdt(struct lguest *lg); | ||
213 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); | ||
214 | void guest_load_tls(struct lguest *lg, unsigned long tls_array); | ||
215 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); | ||
216 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); | ||
217 | |||
218 | /* page_tables.c: */ | ||
219 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); | ||
220 | void free_guest_pagetable(struct lguest *lg); | ||
221 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); | ||
222 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); | ||
223 | void guest_pagetable_clear_all(struct lguest *lg); | ||
224 | void guest_pagetable_flush_user(struct lguest *lg); | ||
225 | void guest_set_pte(struct lguest *lg, unsigned long cr3, | ||
226 | unsigned long vaddr, gpte_t val); | ||
227 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); | ||
228 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); | ||
229 | void pin_page(struct lguest *lg, unsigned long vaddr); | ||
230 | |||
231 | /* lguest_user.c: */ | ||
232 | int lguest_device_init(void); | ||
233 | void lguest_device_remove(void); | ||
234 | |||
235 | /* io.c: */ | ||
236 | void lguest_io_init(void); | ||
237 | int bind_dma(struct lguest *lg, | ||
238 | unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); | ||
239 | void send_dma(struct lguest *info, unsigned long key, unsigned long udma); | ||
240 | void release_all_dma(struct lguest *lg); | ||
241 | unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, | ||
242 | unsigned long *interrupt); | ||
243 | |||
244 | /* hypercalls.c: */ | ||
245 | void do_hypercalls(struct lguest *lg); | ||
246 | |||
/* Mark a guest dead with a printf-style reason.  Only the first reason is
 * kept; if the message cannot be allocated, dead becomes ERR_PTR(-ENOMEM)
 * so that it is still non-NULL. */
#define kill_guest(lg, fmt...)						\
do {									\
	if (!(lg)->dead) {						\
		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);		\
		if (!(lg)->dead)					\
			(lg)->dead = ERR_PTR(-ENOMEM);			\
	}								\
} while (0)
255 | |||
256 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | ||
257 | { | ||
258 | return vaddr - lg->page_offset; | ||
259 | } | ||
260 | #endif /* __ASSEMBLY__ */ | ||
261 | #endif /* _LGUEST_H */ | ||
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c new file mode 100644 index 000000000000..b9a58b78c990 --- /dev/null +++ b/drivers/lguest/lguest.c | |||
@@ -0,0 +1,621 @@ | |||
1 | /* | ||
2 | * Lguest specific paravirt-ops implementation | ||
3 | * | ||
4 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/start_kernel.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/console.h> | ||
25 | #include <linux/screen_info.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/clocksource.h> | ||
29 | #include <linux/clockchips.h> | ||
30 | #include <linux/lguest.h> | ||
31 | #include <linux/lguest_launcher.h> | ||
32 | #include <linux/lguest_bus.h> | ||
33 | #include <asm/paravirt.h> | ||
34 | #include <asm/param.h> | ||
35 | #include <asm/page.h> | ||
36 | #include <asm/pgtable.h> | ||
37 | #include <asm/desc.h> | ||
38 | #include <asm/setup.h> | ||
39 | #include <asm/e820.h> | ||
40 | #include <asm/mce.h> | ||
41 | #include <asm/io.h> | ||
42 | //#include <asm/sched-clock.h> | ||
43 | |||
44 | /* Declarations for definitions in lguest_guest.S */ | ||
45 | extern char lguest_noirq_start[], lguest_noirq_end[]; | ||
46 | extern const char lgstart_cli[], lgend_cli[]; | ||
47 | extern const char lgstart_sti[], lgend_sti[]; | ||
48 | extern const char lgstart_popf[], lgend_popf[]; | ||
49 | extern const char lgstart_pushf[], lgend_pushf[]; | ||
50 | extern const char lgstart_iret[], lgend_iret[]; | ||
51 | extern void lguest_iret(void); | ||
52 | |||
53 | struct lguest_data lguest_data = { | ||
54 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | ||
55 | .noirq_start = (u32)lguest_noirq_start, | ||
56 | .noirq_end = (u32)lguest_noirq_end, | ||
57 | .blocked_interrupts = { 1 }, /* Block timer interrupts */ | ||
58 | }; | ||
59 | struct lguest_device_desc *lguest_devices; | ||
60 | |||
61 | static enum paravirt_lazy_mode lazy_mode; | ||
62 | static void lguest_lazy_mode(enum paravirt_lazy_mode mode) | ||
63 | { | ||
64 | if (mode == PARAVIRT_LAZY_FLUSH) { | ||
65 | if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE)) | ||
66 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); | ||
67 | } else { | ||
68 | lazy_mode = mode; | ||
69 | if (mode == PARAVIRT_LAZY_NONE) | ||
70 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | static void lazy_hcall(unsigned long call, | ||
75 | unsigned long arg1, | ||
76 | unsigned long arg2, | ||
77 | unsigned long arg3) | ||
78 | { | ||
79 | if (lazy_mode == PARAVIRT_LAZY_NONE) | ||
80 | hcall(call, arg1, arg2, arg3); | ||
81 | else | ||
82 | async_hcall(call, arg1, arg2, arg3); | ||
83 | } | ||
84 | |||
85 | void async_hcall(unsigned long call, | ||
86 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
87 | { | ||
88 | /* Note: This code assumes we're uniprocessor. */ | ||
89 | static unsigned int next_call; | ||
90 | unsigned long flags; | ||
91 | |||
92 | local_irq_save(flags); | ||
93 | if (lguest_data.hcall_status[next_call] != 0xFF) { | ||
94 | /* Table full, so do normal hcall which will flush table. */ | ||
95 | hcall(call, arg1, arg2, arg3); | ||
96 | } else { | ||
97 | lguest_data.hcalls[next_call].eax = call; | ||
98 | lguest_data.hcalls[next_call].edx = arg1; | ||
99 | lguest_data.hcalls[next_call].ebx = arg2; | ||
100 | lguest_data.hcalls[next_call].ecx = arg3; | ||
101 | /* Make sure host sees arguments before "valid" flag. */ | ||
102 | wmb(); | ||
103 | lguest_data.hcall_status[next_call] = 0; | ||
104 | if (++next_call == LHCALL_RING_SIZE) | ||
105 | next_call = 0; | ||
106 | } | ||
107 | local_irq_restore(flags); | ||
108 | } | ||
109 | |||
110 | void lguest_send_dma(unsigned long key, struct lguest_dma *dma) | ||
111 | { | ||
112 | dma->used_len = 0; | ||
113 | hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); | ||
114 | } | ||
115 | |||
116 | int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, | ||
117 | unsigned int num, u8 irq) | ||
118 | { | ||
119 | if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) | ||
120 | return -ENOMEM; | ||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) | ||
125 | { | ||
126 | hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); | ||
127 | } | ||
128 | |||
129 | /* For guests, device memory can be used as normal memory, so we cast away the | ||
130 | * __iomem to quieten sparse. */ | ||
131 | void *lguest_map(unsigned long phys_addr, unsigned long pages) | ||
132 | { | ||
133 | return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); | ||
134 | } | ||
135 | |||
136 | void lguest_unmap(void *addr) | ||
137 | { | ||
138 | iounmap((__force void __iomem *)addr); | ||
139 | } | ||
140 | |||
141 | static unsigned long save_fl(void) | ||
142 | { | ||
143 | return lguest_data.irq_enabled; | ||
144 | } | ||
145 | |||
146 | static void restore_fl(unsigned long flags) | ||
147 | { | ||
148 | /* FIXME: Check if interrupt pending... */ | ||
149 | lguest_data.irq_enabled = flags; | ||
150 | } | ||
151 | |||
152 | static void irq_disable(void) | ||
153 | { | ||
154 | lguest_data.irq_enabled = 0; | ||
155 | } | ||
156 | |||
157 | static void irq_enable(void) | ||
158 | { | ||
159 | /* FIXME: Check if interrupt pending... */ | ||
160 | lguest_data.irq_enabled = X86_EFLAGS_IF; | ||
161 | } | ||
162 | |||
163 | static void lguest_write_idt_entry(struct desc_struct *dt, | ||
164 | int entrynum, u32 low, u32 high) | ||
165 | { | ||
166 | write_dt_entry(dt, entrynum, low, high); | ||
167 | hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); | ||
168 | } | ||
169 | |||
170 | static void lguest_load_idt(const struct Xgt_desc_struct *desc) | ||
171 | { | ||
172 | unsigned int i; | ||
173 | struct desc_struct *idt = (void *)desc->address; | ||
174 | |||
175 | for (i = 0; i < (desc->size+1)/8; i++) | ||
176 | hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); | ||
177 | } | ||
178 | |||
179 | static void lguest_load_gdt(const struct Xgt_desc_struct *desc) | ||
180 | { | ||
181 | BUG_ON((desc->size+1)/8 != GDT_ENTRIES); | ||
182 | hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); | ||
183 | } | ||
184 | |||
185 | static void lguest_write_gdt_entry(struct desc_struct *dt, | ||
186 | int entrynum, u32 low, u32 high) | ||
187 | { | ||
188 | write_dt_entry(dt, entrynum, low, high); | ||
189 | hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); | ||
190 | } | ||
191 | |||
192 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | ||
193 | { | ||
194 | lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); | ||
195 | } | ||
196 | |||
/* LDT loads are ignored: this hook is an intentional no-op. */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
	/* Nothing to do. */
}
200 | |||
/* Task register loads are ignored: this hook is an intentional no-op. */
static void lguest_load_tr_desc(void)
{
	/* Nothing to do. */
}
204 | |||
/* cpuid replacement: run the real instruction, then filter what the guest
 * kernel is allowed to see.
 *
 * Leaf 1: ecx and edx are masked down to a fixed feature set, and PGE is
 * forced on.  Leaf 0x80000000: the reported maximum extended leaf is capped
 * at 0x80000008.  All other leaves pass through unchanged.
 */
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/* Remember the requested leaf: native_cpuid() overwrites *eax. */
	const unsigned int leaf = *eax;

	native_cpuid(eax, ebx, ecx, edx);

	if (leaf == 1) {
		/* Basic feature request. */
		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
		*ecx &= 0x00002201;
		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
		*edx &= 0x07808101;
		/* Host wants to know when we flush kernel pages: set PGE. */
		*edx |= 0x00002000;
	} else if (leaf == 0x80000000) {
		/* Futureproof this a little: if they ask how much extended
		 * processor information, limit it to known fields. */
		if (*eax > 0x80000008)
			*eax = 0x80000008;
	}
}
228 | |||
229 | static unsigned long current_cr0, current_cr3; | ||
230 | static void lguest_write_cr0(unsigned long val) | ||
231 | { | ||
232 | lazy_hcall(LHCALL_TS, val & 8, 0, 0); | ||
233 | current_cr0 = val; | ||
234 | } | ||
235 | |||
236 | static unsigned long lguest_read_cr0(void) | ||
237 | { | ||
238 | return current_cr0; | ||
239 | } | ||
240 | |||
241 | static void lguest_clts(void) | ||
242 | { | ||
243 | lazy_hcall(LHCALL_TS, 0, 0, 0); | ||
244 | current_cr0 &= ~8U; | ||
245 | } | ||
246 | |||
247 | static unsigned long lguest_read_cr2(void) | ||
248 | { | ||
249 | return lguest_data.cr2; | ||
250 | } | ||
251 | |||
252 | static void lguest_write_cr3(unsigned long cr3) | ||
253 | { | ||
254 | lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); | ||
255 | current_cr3 = cr3; | ||
256 | } | ||
257 | |||
258 | static unsigned long lguest_read_cr3(void) | ||
259 | { | ||
260 | return current_cr3; | ||
261 | } | ||
262 | |||
/* %cr4 is used to enable/disable PGE, but we don't care: reads always
 * report 0 and writes are silently dropped. */
static unsigned long lguest_read_cr4(void)
{
	return 0;
}

static void lguest_write_cr4(unsigned long val)
{
}
272 | |||
273 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
274 | pte_t *ptep, pte_t pteval) | ||
275 | { | ||
276 | *ptep = pteval; | ||
277 | lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); | ||
278 | } | ||
279 | |||
280 | /* We only support two-level pagetables at the moment. */ | ||
281 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
282 | { | ||
283 | *pmdp = pmdval; | ||
284 | lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, | ||
285 | (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); | ||
286 | } | ||
287 | |||
288 | /* FIXME: Eliminate all callers of this. */ | ||
289 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | ||
290 | { | ||
291 | *ptep = pteval; | ||
292 | /* Don't bother with hypercall before initial setup. */ | ||
293 | if (current_cr3) | ||
294 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | ||
295 | } | ||
296 | |||
297 | static void lguest_flush_tlb_single(unsigned long addr) | ||
298 | { | ||
299 | /* Simply set it to zero, and it will fault back in. */ | ||
300 | lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); | ||
301 | } | ||
302 | |||
303 | static void lguest_flush_tlb_user(void) | ||
304 | { | ||
305 | lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); | ||
306 | } | ||
307 | |||
308 | static void lguest_flush_tlb_kernel(void) | ||
309 | { | ||
310 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | ||
311 | } | ||
312 | |||
313 | static void disable_lguest_irq(unsigned int irq) | ||
314 | { | ||
315 | set_bit(irq, lguest_data.blocked_interrupts); | ||
316 | } | ||
317 | |||
318 | static void enable_lguest_irq(unsigned int irq) | ||
319 | { | ||
320 | clear_bit(irq, lguest_data.blocked_interrupts); | ||
321 | /* FIXME: If it's pending? */ | ||
322 | } | ||
323 | |||
324 | static struct irq_chip lguest_irq_controller = { | ||
325 | .name = "lguest", | ||
326 | .mask = disable_lguest_irq, | ||
327 | .mask_ack = disable_lguest_irq, | ||
328 | .unmask = enable_lguest_irq, | ||
329 | }; | ||
330 | |||
331 | static void __init lguest_init_IRQ(void) | ||
332 | { | ||
333 | unsigned int i; | ||
334 | |||
335 | for (i = 0; i < LGUEST_IRQS; i++) { | ||
336 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
337 | if (vector != SYSCALL_VECTOR) { | ||
338 | set_intr_gate(vector, interrupt[i]); | ||
339 | set_irq_chip_and_handler(i, &lguest_irq_controller, | ||
340 | handle_level_irq); | ||
341 | } | ||
342 | } | ||
343 | irq_ctx_init(smp_processor_id()); | ||
344 | } | ||
345 | |||
346 | static unsigned long lguest_get_wallclock(void) | ||
347 | { | ||
348 | return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); | ||
349 | } | ||
350 | |||
351 | static cycle_t lguest_clock_read(void) | ||
352 | { | ||
353 | if (lguest_data.tsc_khz) | ||
354 | return native_read_tsc(); | ||
355 | else | ||
356 | return jiffies; | ||
357 | } | ||
358 | |||
359 | /* This is what we tell the kernel is our clocksource. */ | ||
360 | static struct clocksource lguest_clock = { | ||
361 | .name = "lguest", | ||
362 | .rating = 400, | ||
363 | .read = lguest_clock_read, | ||
364 | }; | ||
365 | |||
366 | /* We also need a "struct clock_event_device": Linux asks us to set it to go | ||
367 | * off some time in the future. Actually, James Morris figured all this out, I | ||
368 | * just applied the patch. */ | ||
369 | static int lguest_clockevent_set_next_event(unsigned long delta, | ||
370 | struct clock_event_device *evt) | ||
371 | { | ||
372 | if (delta < LG_CLOCK_MIN_DELTA) { | ||
373 | if (printk_ratelimit()) | ||
374 | printk(KERN_DEBUG "%s: small delta %lu ns\n", | ||
375 | __FUNCTION__, delta); | ||
376 | return -ETIME; | ||
377 | } | ||
378 | hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); | ||
379 | return 0; | ||
380 | } | ||
381 | |||
382 | static void lguest_clockevent_set_mode(enum clock_event_mode mode, | ||
383 | struct clock_event_device *evt) | ||
384 | { | ||
385 | switch (mode) { | ||
386 | case CLOCK_EVT_MODE_UNUSED: | ||
387 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
388 | /* A 0 argument shuts the clock down. */ | ||
389 | hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0); | ||
390 | break; | ||
391 | case CLOCK_EVT_MODE_ONESHOT: | ||
392 | /* This is what we expect. */ | ||
393 | break; | ||
394 | case CLOCK_EVT_MODE_PERIODIC: | ||
395 | BUG(); | ||
396 | } | ||
397 | } | ||
398 | |||
399 | /* This describes our primitive timer chip. */ | ||
400 | static struct clock_event_device lguest_clockevent = { | ||
401 | .name = "lguest", | ||
402 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
403 | .set_next_event = lguest_clockevent_set_next_event, | ||
404 | .set_mode = lguest_clockevent_set_mode, | ||
405 | .rating = INT_MAX, | ||
406 | .mult = 1, | ||
407 | .shift = 0, | ||
408 | .min_delta_ns = LG_CLOCK_MIN_DELTA, | ||
409 | .max_delta_ns = LG_CLOCK_MAX_DELTA, | ||
410 | }; | ||
411 | |||
412 | /* This is the Guest timer interrupt handler (hardware interrupt 0). We just | ||
413 | * call the clockevent infrastructure and it does whatever needs doing. */ | ||
414 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | ||
415 | { | ||
416 | unsigned long flags; | ||
417 | |||
418 | /* Don't interrupt us while this is running. */ | ||
419 | local_irq_save(flags); | ||
420 | lguest_clockevent.event_handler(&lguest_clockevent); | ||
421 | local_irq_restore(flags); | ||
422 | } | ||
423 | |||
424 | static void lguest_time_init(void) | ||
425 | { | ||
426 | set_irq_handler(0, lguest_time_irq); | ||
427 | |||
428 | /* We use the TSC if the Host tells us we can, otherwise a dumb | ||
429 | * jiffies-based clock. */ | ||
430 | if (lguest_data.tsc_khz) { | ||
431 | lguest_clock.shift = 22; | ||
432 | lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, | ||
433 | lguest_clock.shift); | ||
434 | lguest_clock.mask = CLOCKSOURCE_MASK(64); | ||
435 | lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; | ||
436 | } else { | ||
437 | /* To understand this, start at kernel/time/jiffies.c... */ | ||
438 | lguest_clock.shift = 8; | ||
439 | lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8; | ||
440 | lguest_clock.mask = CLOCKSOURCE_MASK(32); | ||
441 | } | ||
442 | clocksource_register(&lguest_clock); | ||
443 | |||
444 | /* We can't set cpumask in the initializer: damn C limitations! */ | ||
445 | lguest_clockevent.cpumask = cpumask_of_cpu(0); | ||
446 | clockevents_register_device(&lguest_clockevent); | ||
447 | |||
448 | enable_lguest_irq(0); | ||
449 | } | ||
450 | |||
451 | static void lguest_load_esp0(struct tss_struct *tss, | ||
452 | struct thread_struct *thread) | ||
453 | { | ||
454 | lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, | ||
455 | THREAD_SIZE/PAGE_SIZE); | ||
456 | } | ||
457 | |||
/* Debug registers are not supported yet: writes are dropped. */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	/* FIXME: Implement */
}

/* "wbinvd" is a no-op for the Guest. */
static void lguest_wbinvd(void)
{
}
466 | |||
#ifdef CONFIG_X86_LOCAL_APIC
/* Dummy local APIC accessors: writes are discarded and reads return 0. */
static void lguest_apic_write(unsigned long reg, unsigned long v)
{
}

static unsigned long lguest_apic_read(unsigned long reg)
{
	return 0;
}
#endif
477 | |||
478 | static void lguest_safe_halt(void) | ||
479 | { | ||
480 | hcall(LHCALL_HALT, 0, 0, 0); | ||
481 | } | ||
482 | |||
483 | static void lguest_power_off(void) | ||
484 | { | ||
485 | hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); | ||
486 | } | ||
487 | |||
488 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) | ||
489 | { | ||
490 | hcall(LHCALL_CRASH, __pa(p), 0, 0); | ||
491 | return NOTIFY_DONE; | ||
492 | } | ||
493 | |||
494 | static struct notifier_block paniced = { | ||
495 | .notifier_call = lguest_panic | ||
496 | }; | ||
497 | |||
498 | static __init char *lguest_memory_setup(void) | ||
499 | { | ||
500 | /* We do this here because lockcheck barfs if before start_kernel */ | ||
501 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | ||
502 | |||
503 | add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); | ||
504 | return "LGUEST"; | ||
505 | } | ||
506 | |||
507 | static const struct lguest_insns | ||
508 | { | ||
509 | const char *start, *end; | ||
510 | } lguest_insns[] = { | ||
511 | [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, | ||
512 | [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, | ||
513 | [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, | ||
514 | [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, | ||
515 | }; | ||
516 | static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) | ||
517 | { | ||
518 | unsigned int insn_len; | ||
519 | |||
520 | /* Don't touch it if we don't have a replacement */ | ||
521 | if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) | ||
522 | return paravirt_patch_default(type, clobber, insns, len); | ||
523 | |||
524 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | ||
525 | |||
526 | /* Similarly if we can't fit replacement. */ | ||
527 | if (len < insn_len) | ||
528 | return paravirt_patch_default(type, clobber, insns, len); | ||
529 | |||
530 | memcpy(insns, lguest_insns[type].start, insn_len); | ||
531 | return insn_len; | ||
532 | } | ||
533 | |||
534 | __init void lguest_init(void *boot) | ||
535 | { | ||
536 | /* Copy boot parameters first. */ | ||
537 | memcpy(&boot_params, boot, PARAM_SIZE); | ||
538 | memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), | ||
539 | COMMAND_LINE_SIZE); | ||
540 | |||
541 | paravirt_ops.name = "lguest"; | ||
542 | paravirt_ops.paravirt_enabled = 1; | ||
543 | paravirt_ops.kernel_rpl = 1; | ||
544 | |||
545 | paravirt_ops.save_fl = save_fl; | ||
546 | paravirt_ops.restore_fl = restore_fl; | ||
547 | paravirt_ops.irq_disable = irq_disable; | ||
548 | paravirt_ops.irq_enable = irq_enable; | ||
549 | paravirt_ops.load_gdt = lguest_load_gdt; | ||
550 | paravirt_ops.memory_setup = lguest_memory_setup; | ||
551 | paravirt_ops.cpuid = lguest_cpuid; | ||
552 | paravirt_ops.write_cr3 = lguest_write_cr3; | ||
553 | paravirt_ops.flush_tlb_user = lguest_flush_tlb_user; | ||
554 | paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; | ||
555 | paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; | ||
556 | paravirt_ops.set_pte = lguest_set_pte; | ||
557 | paravirt_ops.set_pte_at = lguest_set_pte_at; | ||
558 | paravirt_ops.set_pmd = lguest_set_pmd; | ||
559 | #ifdef CONFIG_X86_LOCAL_APIC | ||
560 | paravirt_ops.apic_write = lguest_apic_write; | ||
561 | paravirt_ops.apic_write_atomic = lguest_apic_write; | ||
562 | paravirt_ops.apic_read = lguest_apic_read; | ||
563 | #endif | ||
564 | paravirt_ops.load_idt = lguest_load_idt; | ||
565 | paravirt_ops.iret = lguest_iret; | ||
566 | paravirt_ops.load_esp0 = lguest_load_esp0; | ||
567 | paravirt_ops.load_tr_desc = lguest_load_tr_desc; | ||
568 | paravirt_ops.set_ldt = lguest_set_ldt; | ||
569 | paravirt_ops.load_tls = lguest_load_tls; | ||
570 | paravirt_ops.set_debugreg = lguest_set_debugreg; | ||
571 | paravirt_ops.clts = lguest_clts; | ||
572 | paravirt_ops.read_cr0 = lguest_read_cr0; | ||
573 | paravirt_ops.write_cr0 = lguest_write_cr0; | ||
574 | paravirt_ops.init_IRQ = lguest_init_IRQ; | ||
575 | paravirt_ops.read_cr2 = lguest_read_cr2; | ||
576 | paravirt_ops.read_cr3 = lguest_read_cr3; | ||
577 | paravirt_ops.read_cr4 = lguest_read_cr4; | ||
578 | paravirt_ops.write_cr4 = lguest_write_cr4; | ||
579 | paravirt_ops.write_gdt_entry = lguest_write_gdt_entry; | ||
580 | paravirt_ops.write_idt_entry = lguest_write_idt_entry; | ||
581 | paravirt_ops.patch = lguest_patch; | ||
582 | paravirt_ops.safe_halt = lguest_safe_halt; | ||
583 | paravirt_ops.get_wallclock = lguest_get_wallclock; | ||
584 | paravirt_ops.time_init = lguest_time_init; | ||
585 | paravirt_ops.set_lazy_mode = lguest_lazy_mode; | ||
586 | paravirt_ops.wbinvd = lguest_wbinvd; | ||
587 | |||
588 | hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); | ||
589 | |||
590 | /* We use top of mem for initial pagetables. */ | ||
591 | init_pg_tables_end = __pa(pg0); | ||
592 | |||
593 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); | ||
594 | |||
595 | reserve_top_address(lguest_data.reserve_mem); | ||
596 | |||
597 | lockdep_init(); | ||
598 | |||
599 | paravirt_disable_iospace(); | ||
600 | |||
601 | cpu_detect(&new_cpu_data); | ||
602 | /* head.S usually sets up the first capability word, so do it here. */ | ||
603 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | ||
604 | |||
605 | /* Math is always hard! */ | ||
606 | new_cpu_data.hard_math = 1; | ||
607 | |||
608 | #ifdef CONFIG_X86_MCE | ||
609 | mce_disabled = 1; | ||
610 | #endif | ||
611 | |||
612 | #ifdef CONFIG_ACPI | ||
613 | acpi_disabled = 1; | ||
614 | acpi_ht = 0; | ||
615 | #endif | ||
616 | |||
617 | add_preferred_console("hvc", 0, NULL); | ||
618 | |||
619 | pm_power_off = lguest_power_off; | ||
620 | start_kernel(); | ||
621 | } | ||
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S new file mode 100644 index 000000000000..00046c57b5ba --- /dev/null +++ b/drivers/lguest/lguest_asm.S | |||
@@ -0,0 +1,56 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/lguest.h> | ||
3 | #include <asm/asm-offsets.h> | ||
4 | #include <asm/thread_info.h> | ||
5 | |||
6 | /* FIXME: Once asm/processor-flags.h goes in, include that */ | ||
7 | #define X86_EFLAGS_IF 0x00000200 | ||
8 | |||
/*
 * This is where we begin: we have a magic signature which the launcher looks
 * for.  The plan is that the Linux boot protocol will be extended with a
 * "platform type" field which will guide us here from the normal entry point,
 * but for the moment this suffices.  We pass the virtual address of the boot
 * info to lguest_init().
 *
 * We put it in .init.text so that it will be discarded after boot.
 */
.section .init.text, "ax", @progbits
.ascii "GenuineLguest"
	/* Set up initial stack. */
	movl $(init_thread_union+THREAD_SIZE),%esp
	/* Take the boot info address from %esi and add __PAGE_OFFSET to turn
	 * it into a virtual address in %eax for lguest_init(). */
	movl %esi, %eax
	addl $__PAGE_OFFSET, %eax
	jmp lguest_init
25 | |||
/* The templates for inline patching.  Each use of LGUEST_PATCH emits the
 * given instruction(s) between the global labels lgstart_<name> and
 * lgend_<name>, which is how the lguest_insns[] table finds them. */
#define LGUEST_PATCH(name, insns...)			\
	lgstart_##name:	insns; lgend_##name:;		\
	.globl lgstart_##name; .globl lgend_##name

LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
35 | |||
.text
/* These demarcate the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start
.global lguest_noirq_end

/*
 * We move eflags word to lguest_data.irq_enabled to restore interrupt state.
 * For page faults, gpfs and virtual interrupts, the hypervisor has saved
 * eflags manually, otherwise it was delivered directly and so eflags reflects
 * the real machine IF state, ie. interrupts on.  Since the kernel always dies
 * if it takes such a trap with interrupts disabled anyway, turning interrupts
 * back on unconditionally here is OK.
 */
ENTRY(lguest_iret)
	pushl	%eax
	/* With %eax pushed, the eflags word of the iret frame is at 12(%esp). */
	movl	12(%esp), %eax
lguest_noirq_start:
	movl	%eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
	popl	%eax
	iret
lguest_noirq_end:
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c new file mode 100644 index 000000000000..18d6ab21a43b --- /dev/null +++ b/drivers/lguest/lguest_bus.c | |||
@@ -0,0 +1,148 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/bootmem.h> | ||
3 | #include <linux/lguest_bus.h> | ||
4 | #include <asm/io.h> | ||
5 | |||
6 | static ssize_t type_show(struct device *_dev, | ||
7 | struct device_attribute *attr, char *buf) | ||
8 | { | ||
9 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
10 | return sprintf(buf, "%hu", lguest_devices[dev->index].type); | ||
11 | } | ||
12 | static ssize_t features_show(struct device *_dev, | ||
13 | struct device_attribute *attr, char *buf) | ||
14 | { | ||
15 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
16 | return sprintf(buf, "%hx", lguest_devices[dev->index].features); | ||
17 | } | ||
18 | static ssize_t pfn_show(struct device *_dev, | ||
19 | struct device_attribute *attr, char *buf) | ||
20 | { | ||
21 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
22 | return sprintf(buf, "%u", lguest_devices[dev->index].pfn); | ||
23 | } | ||
24 | static ssize_t status_show(struct device *_dev, | ||
25 | struct device_attribute *attr, char *buf) | ||
26 | { | ||
27 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
28 | return sprintf(buf, "%hx", lguest_devices[dev->index].status); | ||
29 | } | ||
30 | static ssize_t status_store(struct device *_dev, struct device_attribute *attr, | ||
31 | const char *buf, size_t count) | ||
32 | { | ||
33 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
34 | if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) | ||
35 | return -EINVAL; | ||
36 | return count; | ||
37 | } | ||
38 | static struct device_attribute lguest_dev_attrs[] = { | ||
39 | __ATTR_RO(type), | ||
40 | __ATTR_RO(features), | ||
41 | __ATTR_RO(pfn), | ||
42 | __ATTR(status, 0644, status_show, status_store), | ||
43 | __ATTR_NULL | ||
44 | }; | ||
45 | |||
46 | static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) | ||
47 | { | ||
48 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
49 | struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); | ||
50 | |||
51 | return (drv->device_type == lguest_devices[dev->index].type); | ||
52 | } | ||
53 | |||
54 | struct lguest_bus { | ||
55 | struct bus_type bus; | ||
56 | struct device dev; | ||
57 | }; | ||
58 | |||
59 | static struct lguest_bus lguest_bus = { | ||
60 | .bus = { | ||
61 | .name = "lguest", | ||
62 | .match = lguest_dev_match, | ||
63 | .dev_attrs = lguest_dev_attrs, | ||
64 | }, | ||
65 | .dev = { | ||
66 | .parent = NULL, | ||
67 | .bus_id = "lguest", | ||
68 | } | ||
69 | }; | ||
70 | |||
71 | static int lguest_dev_probe(struct device *_dev) | ||
72 | { | ||
73 | int ret; | ||
74 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
75 | struct lguest_driver *drv = container_of(dev->dev.driver, | ||
76 | struct lguest_driver, drv); | ||
77 | |||
78 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; | ||
79 | ret = drv->probe(dev); | ||
80 | if (ret == 0) | ||
81 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; | ||
82 | return ret; | ||
83 | } | ||
84 | |||
85 | int register_lguest_driver(struct lguest_driver *drv) | ||
86 | { | ||
87 | if (!lguest_devices) | ||
88 | return 0; | ||
89 | |||
90 | drv->drv.bus = &lguest_bus.bus; | ||
91 | drv->drv.name = drv->name; | ||
92 | drv->drv.owner = drv->owner; | ||
93 | drv->drv.probe = lguest_dev_probe; | ||
94 | |||
95 | return driver_register(&drv->drv); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(register_lguest_driver); | ||
98 | |||
99 | static void add_lguest_device(unsigned int index) | ||
100 | { | ||
101 | struct lguest_device *new; | ||
102 | |||
103 | lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; | ||
104 | new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); | ||
105 | if (!new) { | ||
106 | printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); | ||
107 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; | ||
108 | return; | ||
109 | } | ||
110 | |||
111 | new->index = index; | ||
112 | new->private = NULL; | ||
113 | memset(&new->dev, 0, sizeof(new->dev)); | ||
114 | new->dev.parent = &lguest_bus.dev; | ||
115 | new->dev.bus = &lguest_bus.bus; | ||
116 | sprintf(new->dev.bus_id, "%u", index); | ||
117 | if (device_register(&new->dev) != 0) { | ||
118 | printk(KERN_EMERG "Cannot register lguest device %u\n", index); | ||
119 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; | ||
120 | kfree(new); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | static void scan_devices(void) | ||
125 | { | ||
126 | unsigned int i; | ||
127 | |||
128 | for (i = 0; i < LGUEST_MAX_DEVICES; i++) | ||
129 | if (lguest_devices[i].type) | ||
130 | add_lguest_device(i); | ||
131 | } | ||
132 | |||
133 | static int __init lguest_bus_init(void) | ||
134 | { | ||
135 | if (strcmp(paravirt_ops.name, "lguest") != 0) | ||
136 | return 0; | ||
137 | |||
138 | /* Devices are in page above top of "normal" mem. */ | ||
139 | lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); | ||
140 | |||
141 | if (bus_register(&lguest_bus.bus) != 0 | ||
142 | || device_register(&lguest_bus.dev) != 0) | ||
143 | panic("lguest bus registration failed"); | ||
144 | |||
145 | scan_devices(); | ||
146 | return 0; | ||
147 | } | ||
148 | postcore_initcall(lguest_bus_init); | ||
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c new file mode 100644 index 000000000000..e90d7a783daf --- /dev/null +++ b/drivers/lguest/lguest_user.c | |||
@@ -0,0 +1,236 @@ | |||
1 | /* Userspace control of the guest, via /dev/lguest. */ | ||
2 | #include <linux/uaccess.h> | ||
3 | #include <linux/miscdevice.h> | ||
4 | #include <linux/fs.h> | ||
5 | #include "lg.h" | ||
6 | |||
7 | static void setup_regs(struct lguest_regs *regs, unsigned long start) | ||
8 | { | ||
9 | /* Write out stack in format lguest expects, so we can switch to it. */ | ||
10 | regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; | ||
11 | regs->cs = __KERNEL_CS|GUEST_PL; | ||
12 | regs->eflags = 0x202; /* Interrupts enabled. */ | ||
13 | regs->eip = start; | ||
14 | /* esi points to our boot information (physical address 0) */ | ||
15 | } | ||
16 | |||
17 | /* + addr */ | ||
18 | static long user_get_dma(struct lguest *lg, const u32 __user *input) | ||
19 | { | ||
20 | unsigned long key, udma, irq; | ||
21 | |||
22 | if (get_user(key, input) != 0) | ||
23 | return -EFAULT; | ||
24 | udma = get_dma_buffer(lg, key, &irq); | ||
25 | if (!udma) | ||
26 | return -ENOENT; | ||
27 | |||
28 | /* We put irq number in udma->used_len. */ | ||
29 | lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); | ||
30 | return udma; | ||
31 | } | ||
32 | |||
33 | /* To force the Guest to stop running and return to the Launcher, the | ||
34 | * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The | ||
35 | * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ | ||
36 | static int break_guest_out(struct lguest *lg, const u32 __user *input) | ||
37 | { | ||
38 | unsigned long on; | ||
39 | |||
40 | /* Fetch whether they're turning break on or off.. */ | ||
41 | if (get_user(on, input) != 0) | ||
42 | return -EFAULT; | ||
43 | |||
44 | if (on) { | ||
45 | lg->break_out = 1; | ||
46 | /* Pop it out (may be running on different CPU) */ | ||
47 | wake_up_process(lg->tsk); | ||
48 | /* Wait for them to reset it */ | ||
49 | return wait_event_interruptible(lg->break_wq, !lg->break_out); | ||
50 | } else { | ||
51 | lg->break_out = 0; | ||
52 | wake_up(&lg->break_wq); | ||
53 | return 0; | ||
54 | } | ||
55 | } | ||
56 | |||
57 | /* + irq */ | ||
58 | static int user_send_irq(struct lguest *lg, const u32 __user *input) | ||
59 | { | ||
60 | u32 irq; | ||
61 | |||
62 | if (get_user(irq, input) != 0) | ||
63 | return -EFAULT; | ||
64 | if (irq >= LGUEST_IRQS) | ||
65 | return -EINVAL; | ||
66 | set_bit(irq, lg->irqs_pending); | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | ||
71 | { | ||
72 | struct lguest *lg = file->private_data; | ||
73 | |||
74 | if (!lg) | ||
75 | return -EINVAL; | ||
76 | |||
77 | /* If you're not the task which owns the guest, go away. */ | ||
78 | if (current != lg->tsk) | ||
79 | return -EPERM; | ||
80 | |||
81 | if (lg->dead) { | ||
82 | size_t len; | ||
83 | |||
84 | if (IS_ERR(lg->dead)) | ||
85 | return PTR_ERR(lg->dead); | ||
86 | |||
87 | len = min(size, strlen(lg->dead)+1); | ||
88 | if (copy_to_user(user, lg->dead, len) != 0) | ||
89 | return -EFAULT; | ||
90 | return len; | ||
91 | } | ||
92 | |||
93 | if (lg->dma_is_pending) | ||
94 | lg->dma_is_pending = 0; | ||
95 | |||
96 | return run_guest(lg, (unsigned long __user *)user); | ||
97 | } | ||
98 | |||
99 | /* Take: pfnlimit, pgdir, start, pageoffset. */ | ||
100 | static int initialize(struct file *file, const u32 __user *input) | ||
101 | { | ||
102 | struct lguest *lg; | ||
103 | int err, i; | ||
104 | u32 args[4]; | ||
105 | |||
106 | /* We grab the Big Lguest lock, which protects the global array | ||
107 | * "lguests" and multiple simultaneous initializations. */ | ||
108 | mutex_lock(&lguest_lock); | ||
109 | |||
110 | if (file->private_data) { | ||
111 | err = -EBUSY; | ||
112 | goto unlock; | ||
113 | } | ||
114 | |||
115 | if (copy_from_user(args, input, sizeof(args)) != 0) { | ||
116 | err = -EFAULT; | ||
117 | goto unlock; | ||
118 | } | ||
119 | |||
120 | i = find_free_guest(); | ||
121 | if (i < 0) { | ||
122 | err = -ENOSPC; | ||
123 | goto unlock; | ||
124 | } | ||
125 | lg = &lguests[i]; | ||
126 | lg->guestid = i; | ||
127 | lg->pfn_limit = args[0]; | ||
128 | lg->page_offset = args[3]; | ||
129 | lg->regs_page = get_zeroed_page(GFP_KERNEL); | ||
130 | if (!lg->regs_page) { | ||
131 | err = -ENOMEM; | ||
132 | goto release_guest; | ||
133 | } | ||
134 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); | ||
135 | |||
136 | err = init_guest_pagetable(lg, args[1]); | ||
137 | if (err) | ||
138 | goto free_regs; | ||
139 | |||
140 | setup_regs(lg->regs, args[2]); | ||
141 | setup_guest_gdt(lg); | ||
142 | init_clockdev(lg); | ||
143 | lg->tsk = current; | ||
144 | lg->mm = get_task_mm(lg->tsk); | ||
145 | init_waitqueue_head(&lg->break_wq); | ||
146 | lg->last_pages = NULL; | ||
147 | file->private_data = lg; | ||
148 | |||
149 | mutex_unlock(&lguest_lock); | ||
150 | |||
151 | return sizeof(args); | ||
152 | |||
153 | free_regs: | ||
154 | free_page(lg->regs_page); | ||
155 | release_guest: | ||
156 | memset(lg, 0, sizeof(*lg)); | ||
157 | unlock: | ||
158 | mutex_unlock(&lguest_lock); | ||
159 | return err; | ||
160 | } | ||
161 | |||
162 | static ssize_t write(struct file *file, const char __user *input, | ||
163 | size_t size, loff_t *off) | ||
164 | { | ||
165 | struct lguest *lg = file->private_data; | ||
166 | u32 req; | ||
167 | |||
168 | if (get_user(req, input) != 0) | ||
169 | return -EFAULT; | ||
170 | input += sizeof(req); | ||
171 | |||
172 | if (req != LHREQ_INITIALIZE && !lg) | ||
173 | return -EINVAL; | ||
174 | if (lg && lg->dead) | ||
175 | return -ENOENT; | ||
176 | |||
177 | /* If you're not the task which owns the Guest, you can only break */ | ||
178 | if (lg && current != lg->tsk && req != LHREQ_BREAK) | ||
179 | return -EPERM; | ||
180 | |||
181 | switch (req) { | ||
182 | case LHREQ_INITIALIZE: | ||
183 | return initialize(file, (const u32 __user *)input); | ||
184 | case LHREQ_GETDMA: | ||
185 | return user_get_dma(lg, (const u32 __user *)input); | ||
186 | case LHREQ_IRQ: | ||
187 | return user_send_irq(lg, (const u32 __user *)input); | ||
188 | case LHREQ_BREAK: | ||
189 | return break_guest_out(lg, (const u32 __user *)input); | ||
190 | default: | ||
191 | return -EINVAL; | ||
192 | } | ||
193 | } | ||
194 | |||
195 | static int close(struct inode *inode, struct file *file) | ||
196 | { | ||
197 | struct lguest *lg = file->private_data; | ||
198 | |||
199 | if (!lg) | ||
200 | return 0; | ||
201 | |||
202 | mutex_lock(&lguest_lock); | ||
203 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ | ||
204 | hrtimer_cancel(&lg->hrt); | ||
205 | release_all_dma(lg); | ||
206 | free_guest_pagetable(lg); | ||
207 | mmput(lg->mm); | ||
208 | if (!IS_ERR(lg->dead)) | ||
209 | kfree(lg->dead); | ||
210 | free_page(lg->regs_page); | ||
211 | memset(lg, 0, sizeof(*lg)); | ||
212 | mutex_unlock(&lguest_lock); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static struct file_operations lguest_fops = { | ||
217 | .owner = THIS_MODULE, | ||
218 | .release = close, | ||
219 | .write = write, | ||
220 | .read = read, | ||
221 | }; | ||
222 | static struct miscdevice lguest_dev = { | ||
223 | .minor = MISC_DYNAMIC_MINOR, | ||
224 | .name = "lguest", | ||
225 | .fops = &lguest_fops, | ||
226 | }; | ||
227 | |||
228 | int __init lguest_device_init(void) | ||
229 | { | ||
230 | return misc_register(&lguest_dev); | ||
231 | } | ||
232 | |||
233 | void __exit lguest_device_remove(void) | ||
234 | { | ||
235 | misc_deregister(&lguest_dev); | ||
236 | } | ||
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c new file mode 100644 index 000000000000..1b0ba09b1269 --- /dev/null +++ b/drivers/lguest/page_tables.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* Shadow page table operations. | ||
2 | * Copyright (C) Rusty Russell IBM Corporation 2006. | ||
3 | * GPL v2 and any later version */ | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/random.h> | ||
8 | #include <linux/percpu.h> | ||
9 | #include <asm/tlbflush.h> | ||
10 | #include "lg.h" | ||
11 | |||
/* A page-table page holds 2^10 = 1024 entries. */
#define PTES_PER_PAGE_SHIFT 10
#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
/* The last top-level slot is the one used for the switcher mapping. */
#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
15 | |||
/* One page of PTEs per CPU, used to map the switcher for that CPU. */
static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
/* Accessor for a given CPU's switcher PTE page pointer (usable as an lvalue). */
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
18 | |||
19 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) | ||
20 | { | ||
21 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | ||
22 | } | ||
23 | |||
24 | /* These access the shadow versions (ie. the ones used by the CPU). */ | ||
25 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | ||
26 | { | ||
27 | unsigned int index = vaddr_to_pgd_index(vaddr); | ||
28 | |||
29 | if (index >= SWITCHER_PGD_INDEX) { | ||
30 | kill_guest(lg, "attempt to access switcher pages"); | ||
31 | index = 0; | ||
32 | } | ||
33 | return &lg->pgdirs[i].pgdir[index]; | ||
34 | } | ||
35 | |||
36 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) | ||
37 | { | ||
38 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); | ||
39 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); | ||
40 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; | ||
41 | } | ||
42 | |||
43 | /* These access the guest versions. */ | ||
44 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | ||
45 | { | ||
46 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | ||
47 | return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); | ||
48 | } | ||
49 | |||
50 | static unsigned long gpte_addr(struct lguest *lg, | ||
51 | gpgd_t gpgd, unsigned long vaddr) | ||
52 | { | ||
53 | unsigned long gpage = gpgd.pfn << PAGE_SHIFT; | ||
54 | BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); | ||
55 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); | ||
56 | } | ||
57 | |||
58 | /* Do a virtual -> physical mapping on a user page. */ | ||
59 | static unsigned long get_pfn(unsigned long virtpfn, int write) | ||
60 | { | ||
61 | struct page *page; | ||
62 | unsigned long ret = -1UL; | ||
63 | |||
64 | down_read(¤t->mm->mmap_sem); | ||
65 | if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, | ||
66 | 1, write, 1, &page, NULL) == 1) | ||
67 | ret = page_to_pfn(page); | ||
68 | up_read(¤t->mm->mmap_sem); | ||
69 | return ret; | ||
70 | } | ||
71 | |||
72 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | ||
73 | { | ||
74 | spte_t spte; | ||
75 | unsigned long pfn; | ||
76 | |||
77 | /* We ignore the global flag. */ | ||
78 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); | ||
79 | pfn = get_pfn(gpte.pfn, write); | ||
80 | if (pfn == -1UL) { | ||
81 | kill_guest(lg, "failed to get page %u", gpte.pfn); | ||
82 | /* Must not put_page() bogus page on cleanup. */ | ||
83 | spte.flags = 0; | ||
84 | } | ||
85 | spte.pfn = pfn; | ||
86 | return spte; | ||
87 | } | ||
88 | |||
89 | static void release_pte(spte_t pte) | ||
90 | { | ||
91 | if (pte.flags & _PAGE_PRESENT) | ||
92 | put_page(pfn_to_page(pte.pfn)); | ||
93 | } | ||
94 | |||
95 | static void check_gpte(struct lguest *lg, gpte_t gpte) | ||
96 | { | ||
97 | if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) | ||
98 | kill_guest(lg, "bad page table entry"); | ||
99 | } | ||
100 | |||
101 | static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | ||
102 | { | ||
103 | if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) | ||
104 | kill_guest(lg, "bad page directory entry"); | ||
105 | } | ||
106 | |||
107 | /* FIXME: We hold reference to pages, which prevents them from being | ||
108 | swapped. It'd be nice to have a callback when Linux wants to swap out. */ | ||
109 | |||
110 | /* We fault pages in, which allows us to update accessed/dirty bits. | ||
111 | * Return true if we got page. */ | ||
112 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | ||
113 | { | ||
114 | gpgd_t gpgd; | ||
115 | spgd_t *spgd; | ||
116 | unsigned long gpte_ptr; | ||
117 | gpte_t gpte; | ||
118 | spte_t *spte; | ||
119 | |||
120 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | ||
121 | if (!(gpgd.flags & _PAGE_PRESENT)) | ||
122 | return 0; | ||
123 | |||
124 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | ||
125 | if (!(spgd->flags & _PAGE_PRESENT)) { | ||
126 | /* Get a page of PTEs for them. */ | ||
127 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | ||
128 | /* FIXME: Steal from self in this case? */ | ||
129 | if (!ptepage) { | ||
130 | kill_guest(lg, "out of memory allocating pte page"); | ||
131 | return 0; | ||
132 | } | ||
133 | check_gpgd(lg, gpgd); | ||
134 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); | ||
135 | } | ||
136 | |||
137 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | ||
138 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); | ||
139 | |||
140 | /* No page? */ | ||
141 | if (!(gpte.flags & _PAGE_PRESENT)) | ||
142 | return 0; | ||
143 | |||
144 | /* Write to read-only page? */ | ||
145 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) | ||
146 | return 0; | ||
147 | |||
148 | /* User access to a non-user page? */ | ||
149 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) | ||
150 | return 0; | ||
151 | |||
152 | check_gpte(lg, gpte); | ||
153 | gpte.flags |= _PAGE_ACCESSED; | ||
154 | if (errcode & 2) | ||
155 | gpte.flags |= _PAGE_DIRTY; | ||
156 | |||
157 | /* We're done with the old pte. */ | ||
158 | spte = spte_addr(lg, *spgd, vaddr); | ||
159 | release_pte(*spte); | ||
160 | |||
161 | /* We don't make it writable if this isn't a write: later | ||
162 | * write will fault so we can set dirty bit in guest. */ | ||
163 | if (gpte.flags & _PAGE_DIRTY) | ||
164 | *spte = gpte_to_spte(lg, gpte, 1); | ||
165 | else { | ||
166 | gpte_t ro_gpte = gpte; | ||
167 | ro_gpte.flags &= ~_PAGE_RW; | ||
168 | *spte = gpte_to_spte(lg, ro_gpte, 0); | ||
169 | } | ||
170 | |||
171 | /* Now we update dirty/accessed on guest. */ | ||
172 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); | ||
173 | return 1; | ||
174 | } | ||
175 | |||
176 | /* This is much faster than the full demand_page logic. */ | ||
177 | static int page_writable(struct lguest *lg, unsigned long vaddr) | ||
178 | { | ||
179 | spgd_t *spgd; | ||
180 | unsigned long flags; | ||
181 | |||
182 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | ||
183 | if (!(spgd->flags & _PAGE_PRESENT)) | ||
184 | return 0; | ||
185 | |||
186 | flags = spte_addr(lg, *spgd, vaddr)->flags; | ||
187 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | ||
188 | } | ||
189 | |||
/* Make sure @vaddr is mapped writable in the shadow page tables, faulting
 * it in as a write (error code 2) if it is not already.  The guest is
 * killed if that fails. */
void pin_page(struct lguest *lg, unsigned long vaddr)
{
	if (page_writable(lg, vaddr))
		return;

	if (!demand_page(lg, vaddr, 2))
		kill_guest(lg, "bad stack page %#lx", vaddr);
}
195 | |||
196 | static void release_pgd(struct lguest *lg, spgd_t *spgd) | ||
197 | { | ||
198 | if (spgd->flags & _PAGE_PRESENT) { | ||
199 | unsigned int i; | ||
200 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); | ||
201 | for (i = 0; i < PTES_PER_PAGE; i++) | ||
202 | release_pte(ptepage[i]); | ||
203 | free_page((long)ptepage); | ||
204 | spgd->raw.val = 0; | ||
205 | } | ||
206 | } | ||
207 | |||
208 | static void flush_user_mappings(struct lguest *lg, int idx) | ||
209 | { | ||
210 | unsigned int i; | ||
211 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) | ||
212 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | ||
213 | } | ||
214 | |||
/* Drop the non-kernel shadow mappings of the currently active page
 * directory (lg->pgdidx). */
void guest_pagetable_flush_user(struct lguest *lg)
{
	flush_user_mappings(lg, lg->pgdidx);
}
219 | |||
220 | static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | ||
221 | { | ||
222 | unsigned int i; | ||
223 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | ||
224 | if (lg->pgdirs[i].cr3 == pgtable) | ||
225 | break; | ||
226 | return i; | ||
227 | } | ||
228 | |||
229 | static unsigned int new_pgdir(struct lguest *lg, | ||
230 | unsigned long cr3, | ||
231 | int *blank_pgdir) | ||
232 | { | ||
233 | unsigned int next; | ||
234 | |||
235 | next = random32() % ARRAY_SIZE(lg->pgdirs); | ||
236 | if (!lg->pgdirs[next].pgdir) { | ||
237 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); | ||
238 | if (!lg->pgdirs[next].pgdir) | ||
239 | next = lg->pgdidx; | ||
240 | else | ||
241 | /* There are no mappings: you'll need to re-pin */ | ||
242 | *blank_pgdir = 1; | ||
243 | } | ||
244 | lg->pgdirs[next].cr3 = cr3; | ||
245 | /* Release all the non-kernel mappings. */ | ||
246 | flush_user_mappings(lg, next); | ||
247 | |||
248 | return next; | ||
249 | } | ||
250 | |||
251 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | ||
252 | { | ||
253 | int newpgdir, repin = 0; | ||
254 | |||
255 | newpgdir = find_pgdir(lg, pgtable); | ||
256 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) | ||
257 | newpgdir = new_pgdir(lg, pgtable, &repin); | ||
258 | lg->pgdidx = newpgdir; | ||
259 | if (repin) | ||
260 | pin_stack_pages(lg); | ||
261 | } | ||
262 | |||
263 | static void release_all_pagetables(struct lguest *lg) | ||
264 | { | ||
265 | unsigned int i, j; | ||
266 | |||
267 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | ||
268 | if (lg->pgdirs[i].pgdir) | ||
269 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | ||
270 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | ||
271 | } | ||
272 | |||
/* Throw away all shadow mappings in all page directories, then pin the
 * stack pages again. */
void guest_pagetable_clear_all(struct lguest *lg)
{
	release_all_pagetables(lg);
	pin_stack_pages(lg);
}
278 | |||
279 | static void do_set_pte(struct lguest *lg, int idx, | ||
280 | unsigned long vaddr, gpte_t gpte) | ||
281 | { | ||
282 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); | ||
283 | if (spgd->flags & _PAGE_PRESENT) { | ||
284 | spte_t *spte = spte_addr(lg, *spgd, vaddr); | ||
285 | release_pte(*spte); | ||
286 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | ||
287 | check_gpte(lg, gpte); | ||
288 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); | ||
289 | } else | ||
290 | spte->raw.val = 0; | ||
291 | } | ||
292 | } | ||
293 | |||
294 | void guest_set_pte(struct lguest *lg, | ||
295 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) | ||
296 | { | ||
297 | /* Kernel mappings must be changed on all top levels. */ | ||
298 | if (vaddr >= lg->page_offset) { | ||
299 | unsigned int i; | ||
300 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | ||
301 | if (lg->pgdirs[i].pgdir) | ||
302 | do_set_pte(lg, i, vaddr, gpte); | ||
303 | } else { | ||
304 | int pgdir = find_pgdir(lg, cr3); | ||
305 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) | ||
306 | do_set_pte(lg, pgdir, vaddr, gpte); | ||
307 | } | ||
308 | } | ||
309 | |||
310 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | ||
311 | { | ||
312 | int pgdir; | ||
313 | |||
314 | if (idx >= SWITCHER_PGD_INDEX) | ||
315 | return; | ||
316 | |||
317 | pgdir = find_pgdir(lg, cr3); | ||
318 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | ||
319 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | ||
320 | } | ||
321 | |||
322 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | ||
323 | { | ||
324 | /* We assume this in flush_user_mappings, so check now */ | ||
325 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | ||
326 | return -EINVAL; | ||
327 | lg->pgdidx = 0; | ||
328 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; | ||
329 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); | ||
330 | if (!lg->pgdirs[lg->pgdidx].pgdir) | ||
331 | return -ENOMEM; | ||
332 | return 0; | ||
333 | } | ||
334 | |||
335 | void free_guest_pagetable(struct lguest *lg) | ||
336 | { | ||
337 | unsigned int i; | ||
338 | |||
339 | release_all_pagetables(lg); | ||
340 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | ||
341 | free_page((long)lg->pgdirs[i].pgdir); | ||
342 | } | ||
343 | |||
344 | /* Caller must be preempt-safe */ | ||
345 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | ||
346 | { | ||
347 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | ||
348 | spgd_t switcher_pgd; | ||
349 | spte_t regs_pte; | ||
350 | |||
351 | /* Since switcher less that 4MB, we simply mug top pte page. */ | ||
352 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; | ||
353 | switcher_pgd.flags = _PAGE_KERNEL; | ||
354 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | ||
355 | |||
356 | /* Map our regs page over stack page. */ | ||
357 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; | ||
358 | regs_pte.flags = _PAGE_KERNEL; | ||
359 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] | ||
360 | = regs_pte; | ||
361 | } | ||
362 | |||
363 | static void free_switcher_pte_pages(void) | ||
364 | { | ||
365 | unsigned int i; | ||
366 | |||
367 | for_each_possible_cpu(i) | ||
368 | free_page((long)switcher_pte_page(i)); | ||
369 | } | ||
370 | |||
371 | static __init void populate_switcher_pte_page(unsigned int cpu, | ||
372 | struct page *switcher_page[], | ||
373 | unsigned int pages) | ||
374 | { | ||
375 | unsigned int i; | ||
376 | spte_t *pte = switcher_pte_page(cpu); | ||
377 | |||
378 | for (i = 0; i < pages; i++) { | ||
379 | pte[i].pfn = page_to_pfn(switcher_page[i]); | ||
380 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | ||
381 | } | ||
382 | |||
383 | /* We only map this CPU's pages, so guest can't see others. */ | ||
384 | i = pages + cpu*2; | ||
385 | |||
386 | /* First page (regs) is rw, second (state) is ro. */ | ||
387 | pte[i].pfn = page_to_pfn(switcher_page[i]); | ||
388 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; | ||
389 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); | ||
390 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | ||
391 | } | ||
392 | |||
393 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | ||
394 | { | ||
395 | unsigned int i; | ||
396 | |||
397 | for_each_possible_cpu(i) { | ||
398 | switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); | ||
399 | if (!switcher_pte_page(i)) { | ||
400 | free_switcher_pte_pages(); | ||
401 | return -ENOMEM; | ||
402 | } | ||
403 | populate_switcher_pte_page(i, switcher_page, pages); | ||
404 | } | ||
405 | return 0; | ||
406 | } | ||
407 | |||
/* Counterpart of init_pagetables(): free the per-CPU switcher PTE pages. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c new file mode 100644 index 000000000000..1b2cfe89dcd5 --- /dev/null +++ b/drivers/lguest/segments.c | |||
@@ -0,0 +1,125 @@ | |||
1 | #include "lg.h" | ||
2 | |||
3 | static int desc_ok(const struct desc_struct *gdt) | ||
4 | { | ||
5 | /* MBZ=0, P=1, DT=1 */ | ||
6 | return ((gdt->b & 0x00209000) == 0x00009000); | ||
7 | } | ||
8 | |||
/* Nonzero if bit 0x8000 (the P bit checked by desc_ok()) is set in the
 * descriptor's high word. */
static int segment_present(const struct desc_struct *gdt)
{
	return gdt->b & 0x8000;
}
13 | |||
14 | static int ignored_gdt(unsigned int num) | ||
15 | { | ||
16 | return (num == GDT_ENTRY_TSS | ||
17 | || num == GDT_ENTRY_LGUEST_CS | ||
18 | || num == GDT_ENTRY_LGUEST_DS | ||
19 | || num == GDT_ENTRY_DOUBLEFAULT_TSS); | ||
20 | } | ||
21 | |||
22 | /* We don't allow removal of CS, DS or SS; it doesn't make sense. */ | ||
23 | static void check_segment_use(struct lguest *lg, unsigned int desc) | ||
24 | { | ||
25 | if (lg->regs->gs / 8 == desc) | ||
26 | lg->regs->gs = 0; | ||
27 | if (lg->regs->fs / 8 == desc) | ||
28 | lg->regs->fs = 0; | ||
29 | if (lg->regs->es / 8 == desc) | ||
30 | lg->regs->es = 0; | ||
31 | if (lg->regs->ds / 8 == desc | ||
32 | || lg->regs->cs / 8 == desc | ||
33 | || lg->regs->ss / 8 == desc) | ||
34 | kill_guest(lg, "Removed live GDT entry %u", desc); | ||
35 | } | ||
36 | |||
37 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | ||
38 | { | ||
39 | unsigned int i; | ||
40 | |||
41 | for (i = start; i < end; i++) { | ||
42 | /* We never copy these ones to real gdt */ | ||
43 | if (ignored_gdt(i)) | ||
44 | continue; | ||
45 | |||
46 | /* We could fault in switch_to_guest if they are using | ||
47 | * a removed segment. */ | ||
48 | if (!segment_present(&lg->gdt[i])) { | ||
49 | check_segment_use(lg, i); | ||
50 | continue; | ||
51 | } | ||
52 | |||
53 | if (!desc_ok(&lg->gdt[i])) | ||
54 | kill_guest(lg, "Bad GDT descriptor %i", i); | ||
55 | |||
56 | /* DPL 0 presumably means "for use by guest". */ | ||
57 | if ((lg->gdt[i].b & 0x00006000) == 0) | ||
58 | lg->gdt[i].b |= (GUEST_PL << 13); | ||
59 | |||
60 | /* Set accessed bit, since gdt isn't writable. */ | ||
61 | lg->gdt[i].b |= 0x00000100; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | void setup_default_gdt_entries(struct lguest_ro_state *state) | ||
66 | { | ||
67 | struct desc_struct *gdt = state->guest_gdt; | ||
68 | unsigned long tss = (unsigned long)&state->guest_tss; | ||
69 | |||
70 | /* Hypervisor segments. */ | ||
71 | gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | ||
72 | gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | ||
73 | |||
74 | /* This is the one which we *cannot* copy from guest, since tss | ||
75 | is depended on this lguest_ro_state, ie. this cpu. */ | ||
76 | gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); | ||
77 | gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ||
78 | | ((tss >> 16) & 0x000000FF); | ||
79 | } | ||
80 | |||
81 | void setup_guest_gdt(struct lguest *lg) | ||
82 | { | ||
83 | lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; | ||
84 | lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; | ||
85 | lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); | ||
86 | lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); | ||
87 | } | ||
88 | |||
89 | /* This is a fast version for the common case where only the three TLS entries | ||
90 | * have changed. */ | ||
91 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | ||
92 | { | ||
93 | unsigned int i; | ||
94 | |||
95 | for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) | ||
96 | gdt[i] = lg->gdt[i]; | ||
97 | } | ||
98 | |||
99 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) | ||
100 | { | ||
101 | unsigned int i; | ||
102 | |||
103 | for (i = 0; i < GDT_ENTRIES; i++) | ||
104 | if (!ignored_gdt(i)) | ||
105 | gdt[i] = lg->gdt[i]; | ||
106 | } | ||
107 | |||
108 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) | ||
109 | { | ||
110 | if (num > ARRAY_SIZE(lg->gdt)) | ||
111 | kill_guest(lg, "too many gdt entries %i", num); | ||
112 | |||
113 | lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); | ||
114 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); | ||
115 | lg->changed |= CHANGED_GDT; | ||
116 | } | ||
117 | |||
118 | void guest_load_tls(struct lguest *lg, unsigned long gtls) | ||
119 | { | ||
120 | struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; | ||
121 | |||
122 | lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); | ||
123 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); | ||
124 | lg->changed |= CHANGED_GDT_TLS; | ||
125 | } | ||
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S new file mode 100644 index 000000000000..eadd4cc299d2 --- /dev/null +++ b/drivers/lguest/switcher.S | |||
@@ -0,0 +1,159 @@ | |||
1 | /* This code sits at 0xFFC00000 to do the low-level guest<->host switch. | ||
2 | |||
3 | There are two pages above us for this CPU (struct lguest_pages). | ||
4 | The second page (struct lguest_ro_state) becomes read-only after the | ||
5 | context switch. The first page (the stack for traps) remains writable, | ||
6 | but while we're in here, the guest cannot be running. | ||
7 | */ | ||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/asm-offsets.h> | ||
10 | #include "lg.h" | ||
11 | |||
.text
/* Marks the start of the switcher code. */
ENTRY(start_switcher_text)

/* switch_to_guest: leave the host and run the guest.

   %eax points to lguest pages for this CPU.  %ebx contains cr3 value.
   All normal registers can be clobbered!

   The host's segment registers, %ebp and stack pointer are saved so that
   SWITCH_TO_HOST can restore them later. */
ENTRY(switch_to_guest)
	/* Save host segments on host stack. */
	pushl %es
	pushl %ds
	pushl %gs
	pushl %fs
	/* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
	pushl %ebp
	/* Save host stack. */
	movl %esp, LGUEST_PAGES_host_sp(%eax)
	/* Switch to guest stack: if we get NMI we expect to be there. */
	movl %eax, %edx
	addl $LGUEST_PAGES_regs, %edx
	movl %edx, %esp
	/* Switch to guest's GDT, IDT. */
	lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
	lidt LGUEST_PAGES_guest_idt_desc(%eax)
	/* Switch to guest's TSS while GDT still writable. */
	movl $(GDT_ENTRY_TSS*8), %edx
	ltr %dx
	/* Set host's TSS GDT entry to available (clear byte 5 bit 2).
	   The GDT address is read from 2 bytes into the host GDT
	   descriptor; mask 0xFD clears the second-lowest bit. */
	movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
	andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
	/* Switch to guest page tables: lguest_pages->state now read-only. */
	movl %ebx, %cr3
	/* Restore guest regs: popped in the reverse of the order in which
	   SWITCH_TO_HOST pushes them. */
	popl %ebx
	popl %ecx
	popl %edx
	popl %esi
	popl %edi
	popl %ebp
	popl %gs
	popl %eax
	popl %fs
	popl %ds
	popl %es
	/* Skip error code and trap number */
	addl $8, %esp
	iret
57 | |||
/* SWITCH_TO_HOST: common path from a guest trap/interrupt stub back to
   host context.  Entered on the guest regs stack with the trap number (and
   error code) already pushed by the stub.  It saves the guest registers,
   then restores the host page tables, GDT, IDT, stack and TSS, and finally
   pops the host %ebp and segment registers that switch_to_guest pushed.
   On exit %eax holds the address of this CPU's lguest pages and %ebx the
   trap number. */
#define SWITCH_TO_HOST \
	/* Save guest state */ \
	pushl %es; \
	pushl %ds; \
	pushl %fs; \
	pushl %eax; \
	pushl %gs; \
	pushl %ebp; \
	pushl %edi; \
	pushl %esi; \
	pushl %edx; \
	pushl %ecx; \
	pushl %ebx; \
	/* Load lguest ds segment for convenience. */ \
	movl $(LGUEST_DS), %eax; \
	movl %eax, %ds; \
	/* Figure out where we are, based on stack (at top of regs). */ \
	movl %esp, %eax; \
	subl $LGUEST_PAGES_regs, %eax; \
	/* Put trap number in %ebx before we switch cr3 and lose it. */ \
	movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
	/* Switch to host page tables (host GDT, IDT and stack are in host \
	   mem, so need this first) */ \
	movl LGUEST_PAGES_host_cr3(%eax), %edx; \
	movl %edx, %cr3; \
	/* Set guest's TSS to available (clear byte 5 bit 2). */ \
	andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
	/* Switch to host's GDT & IDT. */ \
	lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
	lidt LGUEST_PAGES_host_idt_desc(%eax); \
	/* Switch to host's stack. */ \
	movl LGUEST_PAGES_host_sp(%eax), %esp; \
	/* Switch to host's TSS */ \
	movl $(GDT_ENTRY_TSS*8), %edx; \
	ltr %dx; \
	popl %ebp; \
	popl %fs; \
	popl %gs; \
	popl %ds; \
	popl %es
98 | |||
/* Return to run_guest_once: switch back to host context, then iret on the
   restored host stack. */
return_to_host:
	SWITCH_TO_HOST
	iret
103 | |||
/* Switch back to the host and hand the interrupt to the host's own
   handler.  After SWITCH_TO_HOST, %eax is the lguest pages address and
   %ebx the trap number. */
deliver_to_host:
	SWITCH_TO_HOST
	/* Decode IDT and jump to host's irq handler.  When that does iret, it
	 * will return to run_guest_once.  This is a feature. */
	/* %edx = base address of the host IDT (2 bytes into its descriptor). */
	movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
	/* %eax = address of IDT entry number %ebx (8 bytes per entry). */
	leal (%edx,%ebx,8), %eax
	/* Low 16 bits of the handler address come from the entry's first word... */
	movzwl (%eax),%edx
	/* ...and the high 16 bits from the top half of its second dword. */
	movl 4(%eax), %eax
	xorw %ax, %ax
	orl %eax, %edx
	jmp *%edx
115 | |||
/* Real hardware interrupts are delivered straight to the host.  Others
   cause us to return to run_guest_once so it can decide what to do.  Note
   that some of these are overridden by the guest to deliver directly, and
   never enter here (see load_guest_idt_entry).

   IRQ_STUB N TARGET emits one entry point for trap/interrupt number N: it
   records the stub's address as a .long in .data, then (in .text) pushes
   the trap number and jumps to TARGET. */
.macro IRQ_STUB N TARGET
	.data; .long 1f; .text; 1:
 /* Make an error number for most traps, which don't have one.  Traps 8,
    10-14 and 17 are excluded from the dummy push. */
 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
	pushl $0
 .endif
	pushl $\N
	jmp \TARGET
	ALIGN
.endm
130 | |||
/* IRQ_STUBS FIRST LAST TARGET: emit an IRQ_STUB for every number from
   FIRST to LAST inclusive, all jumping to TARGET. */
.macro IRQ_STUBS FIRST LAST TARGET
 irq=\FIRST
 .rept \LAST-\FIRST+1
	IRQ_STUB irq \TARGET
  irq=irq+1
 .endr
.endm
138 | |||
/* We intercept every interrupt, because we may need to switch back to
 * host.  Unfortunately we can't tell them apart except by entry
 * point, so we need 256 entry points.
 *
 * default_idt_entries labels the start of the table in .data; each
 * IRQ_STUB below appends the address of its entry point to it, so the
 * table ends up indexed by trap number.
 */
.data
.global default_idt_entries
default_idt_entries:
.text
	IRQ_STUBS 0 1 return_to_host		/* First two traps */
	IRQ_STUB 2 handle_nmi			/* NMI */
	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
	IRQ_STUB 128 return_to_host		/* System call (overridden) */
	IRQ_STUBS 129 255 deliver_to_host	/* Other real interrupts */
153 | |||
/* We ignore NMI and return: discard the 8 bytes the stub pushed (dummy
   error code and trap number), then iret. */
handle_nmi:
	addl $8, %esp
	iret

/* Marks the end of the switcher code. */
ENTRY(end_switcher_text)