diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-07-19 04:49:22 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-19 13:04:52 -0400 |
commit | 07ad157f6e5d228be78acd5cea0291e5d0360398 (patch) | |
tree | 87180c2d1aa53857f46d1dc293e08c0fbea0608a | |
parent | 5992b6dac0d23a2b51a1ccbaf8f1a2e62097b12b (diff) |
lguest: the guest code
lguest is a simple hypervisor for Linux on Linux. Unlike kvm it doesn't need
VT/SVM hardware. Unlike Xen it's simply "modprobe and go". Unlike both, it's
5000 lines and self-contained.
Performance is ok, but not great (-30% on kernel compile). But given its
hackability, I expect this to improve, along with the paravirt_ops code which
it supplies a complete example for. There's also a 64-bit version being
worked on and other craziness.
But most of all, lguest is awesome fun! Too much of the kernel is a big ball
of hair. lguest is simple enough to dive into and hack, plus has some warts
which scream "fork me!".
This patch:
This is the code and headers required to make an i386 kernel an lguest guest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | drivers/lguest/lguest.c | 544 | ||||
-rw-r--r-- | drivers/lguest/lguest_asm.S | 53 | ||||
-rw-r--r-- | drivers/lguest/lguest_bus.c | 148 | ||||
-rw-r--r-- | include/linux/lguest.h | 85 | ||||
-rw-r--r-- | include/linux/lguest_bus.h | 48 |
5 files changed, 878 insertions, 0 deletions
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c new file mode 100644 index 000000000000..b3a72bd8d6f5 --- /dev/null +++ b/drivers/lguest/lguest.c | |||
@@ -0,0 +1,544 @@ | |||
1 | /* | ||
2 | * Lguest specific paravirt-ops implementation | ||
3 | * | ||
4 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/start_kernel.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/console.h> | ||
25 | #include <linux/screen_info.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/lguest.h> | ||
29 | #include <linux/lguest_launcher.h> | ||
30 | #include <linux/lguest_bus.h> | ||
31 | #include <asm/paravirt.h> | ||
32 | #include <asm/param.h> | ||
33 | #include <asm/page.h> | ||
34 | #include <asm/pgtable.h> | ||
35 | #include <asm/desc.h> | ||
36 | #include <asm/setup.h> | ||
37 | #include <asm/e820.h> | ||
38 | #include <asm/mce.h> | ||
39 | #include <asm/io.h> | ||
40 | |||
41 | /* Declarations for definitions in lguest_asm.S */ | ||
42 | extern char lguest_noirq_start[], lguest_noirq_end[]; | ||
43 | extern const char lgstart_cli[], lgend_cli[]; | ||
44 | extern const char lgstart_sti[], lgend_sti[]; | ||
45 | extern const char lgstart_popf[], lgend_popf[]; | ||
46 | extern const char lgstart_pushf[], lgend_pushf[]; | ||
47 | extern const char lgstart_iret[], lgend_iret[]; | ||
48 | extern void lguest_iret(void); | ||
49 | |||
50 | struct lguest_data lguest_data = { | ||
51 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | ||
52 | .noirq_start = (u32)lguest_noirq_start, | ||
53 | .noirq_end = (u32)lguest_noirq_end, | ||
54 | .blocked_interrupts = { 1 }, /* Block timer interrupts */ | ||
55 | }; | ||
56 | struct lguest_device_desc *lguest_devices; | ||
57 | static __initdata const struct lguest_boot_info *boot = __va(0); | ||
58 | |||
59 | static enum paravirt_lazy_mode lazy_mode; | ||
60 | static void lguest_lazy_mode(enum paravirt_lazy_mode mode) | ||
61 | { | ||
62 | if (mode == PARAVIRT_LAZY_FLUSH) { | ||
63 | if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE)) | ||
64 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); | ||
65 | } else { | ||
66 | lazy_mode = mode; | ||
67 | if (mode == PARAVIRT_LAZY_NONE) | ||
68 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | static void lazy_hcall(unsigned long call, | ||
73 | unsigned long arg1, | ||
74 | unsigned long arg2, | ||
75 | unsigned long arg3) | ||
76 | { | ||
77 | if (lazy_mode == PARAVIRT_LAZY_NONE) | ||
78 | hcall(call, arg1, arg2, arg3); | ||
79 | else | ||
80 | async_hcall(call, arg1, arg2, arg3); | ||
81 | } | ||
82 | |||
83 | void async_hcall(unsigned long call, | ||
84 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
85 | { | ||
86 | /* Note: This code assumes we're uniprocessor. */ | ||
87 | static unsigned int next_call; | ||
88 | unsigned long flags; | ||
89 | |||
90 | local_irq_save(flags); | ||
91 | if (lguest_data.hcall_status[next_call] != 0xFF) { | ||
92 | /* Table full, so do normal hcall which will flush table. */ | ||
93 | hcall(call, arg1, arg2, arg3); | ||
94 | } else { | ||
95 | lguest_data.hcalls[next_call].eax = call; | ||
96 | lguest_data.hcalls[next_call].edx = arg1; | ||
97 | lguest_data.hcalls[next_call].ebx = arg2; | ||
98 | lguest_data.hcalls[next_call].ecx = arg3; | ||
99 | /* Make sure host sees arguments before "valid" flag. */ | ||
100 | wmb(); | ||
101 | lguest_data.hcall_status[next_call] = 0; | ||
102 | if (++next_call == LHCALL_RING_SIZE) | ||
103 | next_call = 0; | ||
104 | } | ||
105 | local_irq_restore(flags); | ||
106 | } | ||
107 | |||
108 | void lguest_send_dma(unsigned long key, struct lguest_dma *dma) | ||
109 | { | ||
110 | dma->used_len = 0; | ||
111 | hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); | ||
112 | } | ||
113 | |||
114 | int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, | ||
115 | unsigned int num, u8 irq) | ||
116 | { | ||
117 | if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) | ||
118 | return -ENOMEM; | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) | ||
123 | { | ||
124 | hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); | ||
125 | } | ||
126 | |||
127 | /* For guests, device memory can be used as normal memory, so we cast away the | ||
128 | * __iomem to quieten sparse. */ | ||
129 | void *lguest_map(unsigned long phys_addr, unsigned long pages) | ||
130 | { | ||
131 | return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); | ||
132 | } | ||
133 | |||
134 | void lguest_unmap(void *addr) | ||
135 | { | ||
136 | iounmap((__force void __iomem *)addr); | ||
137 | } | ||
138 | |||
139 | static unsigned long save_fl(void) | ||
140 | { | ||
141 | return lguest_data.irq_enabled; | ||
142 | } | ||
143 | |||
144 | static void restore_fl(unsigned long flags) | ||
145 | { | ||
146 | /* FIXME: Check if interrupt pending... */ | ||
147 | lguest_data.irq_enabled = flags; | ||
148 | } | ||
149 | |||
150 | static void irq_disable(void) | ||
151 | { | ||
152 | lguest_data.irq_enabled = 0; | ||
153 | } | ||
154 | |||
155 | static void irq_enable(void) | ||
156 | { | ||
157 | /* FIXME: Check if interrupt pending... */ | ||
158 | lguest_data.irq_enabled = X86_EFLAGS_IF; | ||
159 | } | ||
160 | |||
161 | static void lguest_write_idt_entry(struct desc_struct *dt, | ||
162 | int entrynum, u32 low, u32 high) | ||
163 | { | ||
164 | write_dt_entry(dt, entrynum, low, high); | ||
165 | hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); | ||
166 | } | ||
167 | |||
168 | static void lguest_load_idt(const struct Xgt_desc_struct *desc) | ||
169 | { | ||
170 | unsigned int i; | ||
171 | struct desc_struct *idt = (void *)desc->address; | ||
172 | |||
173 | for (i = 0; i < (desc->size+1)/8; i++) | ||
174 | hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); | ||
175 | } | ||
176 | |||
177 | static void lguest_load_gdt(const struct Xgt_desc_struct *desc) | ||
178 | { | ||
179 | BUG_ON((desc->size+1)/8 != GDT_ENTRIES); | ||
180 | hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); | ||
181 | } | ||
182 | |||
183 | static void lguest_write_gdt_entry(struct desc_struct *dt, | ||
184 | int entrynum, u32 low, u32 high) | ||
185 | { | ||
186 | write_dt_entry(dt, entrynum, low, high); | ||
187 | hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); | ||
188 | } | ||
189 | |||
190 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | ||
191 | { | ||
192 | lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); | ||
193 | } | ||
194 | |||
/* paravirt set_ldt: intentionally a no-op in this guest. */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
}
198 | |||
/* paravirt load_tr_desc: intentionally a no-op in this guest. */
static void lguest_load_tr_desc(void)
{
}
202 | |||
/* paravirt cpuid: run the native cpuid, then filter the result so the
 * kernel only sees features this guest supports. */
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/* Remember which leaf was requested: native_cpuid overwrites *eax. */
	int function = *eax;

	native_cpuid(eax, ebx, ecx, edx);

	if (function == 1) {
		/* Basic feature request. */
		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
		*ecx &= 0x00002201;
		/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
		*edx &= 0x07808101;
		/* Host wants to know when we flush kernel pages: set PGE. */
		*edx |= 0x00002000;
	} else if (function == 0x80000000) {
		/* Futureproof this a little: if they ask how much extended
		 * processor information, limit it to known fields. */
		if (*eax > 0x80000008)
			*eax = 0x80000008;
	}
}
226 | |||
227 | static unsigned long current_cr0, current_cr3; | ||
228 | static void lguest_write_cr0(unsigned long val) | ||
229 | { | ||
230 | lazy_hcall(LHCALL_TS, val & 8, 0, 0); | ||
231 | current_cr0 = val; | ||
232 | } | ||
233 | |||
234 | static unsigned long lguest_read_cr0(void) | ||
235 | { | ||
236 | return current_cr0; | ||
237 | } | ||
238 | |||
239 | static void lguest_clts(void) | ||
240 | { | ||
241 | lazy_hcall(LHCALL_TS, 0, 0, 0); | ||
242 | current_cr0 &= ~8U; | ||
243 | } | ||
244 | |||
245 | static unsigned long lguest_read_cr2(void) | ||
246 | { | ||
247 | return lguest_data.cr2; | ||
248 | } | ||
249 | |||
250 | static void lguest_write_cr3(unsigned long cr3) | ||
251 | { | ||
252 | lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); | ||
253 | current_cr3 = cr3; | ||
254 | } | ||
255 | |||
256 | static unsigned long lguest_read_cr3(void) | ||
257 | { | ||
258 | return current_cr3; | ||
259 | } | ||
260 | |||
/* cr4 is used to enable/disable PGE, but we don't care: reads always
 * return 0. */
static unsigned long lguest_read_cr4(void)
{
	return 0;
}
266 | |||
/* paravirt write_cr4: writes are ignored. */
static void lguest_write_cr4(unsigned long val)
{
}
270 | |||
271 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
272 | pte_t *ptep, pte_t pteval) | ||
273 | { | ||
274 | *ptep = pteval; | ||
275 | lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); | ||
276 | } | ||
277 | |||
278 | /* We only support two-level pagetables at the moment. */ | ||
279 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
280 | { | ||
281 | *pmdp = pmdval; | ||
282 | lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, | ||
283 | (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); | ||
284 | } | ||
285 | |||
286 | /* FIXME: Eliminate all callers of this. */ | ||
287 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | ||
288 | { | ||
289 | *ptep = pteval; | ||
290 | /* Don't bother with hypercall before initial setup. */ | ||
291 | if (current_cr3) | ||
292 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | ||
293 | } | ||
294 | |||
295 | static void lguest_flush_tlb_single(unsigned long addr) | ||
296 | { | ||
297 | /* Simply set it to zero, and it will fault back in. */ | ||
298 | lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); | ||
299 | } | ||
300 | |||
301 | static void lguest_flush_tlb_user(void) | ||
302 | { | ||
303 | lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); | ||
304 | } | ||
305 | |||
306 | static void lguest_flush_tlb_kernel(void) | ||
307 | { | ||
308 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | ||
309 | } | ||
310 | |||
311 | static void disable_lguest_irq(unsigned int irq) | ||
312 | { | ||
313 | set_bit(irq, lguest_data.blocked_interrupts); | ||
314 | } | ||
315 | |||
316 | static void enable_lguest_irq(unsigned int irq) | ||
317 | { | ||
318 | clear_bit(irq, lguest_data.blocked_interrupts); | ||
319 | /* FIXME: If it's pending? */ | ||
320 | } | ||
321 | |||
322 | static struct irq_chip lguest_irq_controller = { | ||
323 | .name = "lguest", | ||
324 | .mask = disable_lguest_irq, | ||
325 | .mask_ack = disable_lguest_irq, | ||
326 | .unmask = enable_lguest_irq, | ||
327 | }; | ||
328 | |||
329 | static void __init lguest_init_IRQ(void) | ||
330 | { | ||
331 | unsigned int i; | ||
332 | |||
333 | for (i = 0; i < LGUEST_IRQS; i++) { | ||
334 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
335 | if (vector != SYSCALL_VECTOR) { | ||
336 | set_intr_gate(vector, interrupt[i]); | ||
337 | set_irq_chip_and_handler(i, &lguest_irq_controller, | ||
338 | handle_level_irq); | ||
339 | } | ||
340 | } | ||
341 | irq_ctx_init(smp_processor_id()); | ||
342 | } | ||
343 | |||
344 | static unsigned long lguest_get_wallclock(void) | ||
345 | { | ||
346 | return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); | ||
347 | } | ||
348 | |||
349 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | ||
350 | { | ||
351 | do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0)); | ||
352 | update_process_times(user_mode_vm(get_irq_regs())); | ||
353 | } | ||
354 | |||
355 | static u64 sched_clock_base; | ||
356 | static void lguest_time_init(void) | ||
357 | { | ||
358 | set_irq_handler(0, lguest_time_irq); | ||
359 | hcall(LHCALL_TIMER_READ, 0, 0, 0); | ||
360 | sched_clock_base = jiffies_64; | ||
361 | enable_lguest_irq(0); | ||
362 | } | ||
363 | |||
364 | static unsigned long long lguest_sched_clock(void) | ||
365 | { | ||
366 | return (jiffies_64 - sched_clock_base) * (1000000000 / HZ); | ||
367 | } | ||
368 | |||
369 | static void lguest_load_esp0(struct tss_struct *tss, | ||
370 | struct thread_struct *thread) | ||
371 | { | ||
372 | lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, | ||
373 | THREAD_SIZE/PAGE_SIZE); | ||
374 | } | ||
375 | |||
/* paravirt set_debugreg: currently ignores the write. */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	/* FIXME: Implement */
}
380 | |||
/* paravirt wbinvd: intentionally a no-op in this guest. */
static void lguest_wbinvd(void)
{
}
384 | |||
385 | #ifdef CONFIG_X86_LOCAL_APIC | ||
/* paravirt apic_write / apic_write_atomic: writes are dropped. */
static void lguest_apic_write(unsigned long reg, unsigned long v)
{
}
389 | |||
/* paravirt apic_read: every register reads as 0. */
static unsigned long lguest_apic_read(unsigned long reg)
{
	return 0;
}
394 | #endif | ||
395 | |||
396 | static void lguest_safe_halt(void) | ||
397 | { | ||
398 | hcall(LHCALL_HALT, 0, 0, 0); | ||
399 | } | ||
400 | |||
401 | static void lguest_power_off(void) | ||
402 | { | ||
403 | hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); | ||
404 | } | ||
405 | |||
406 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) | ||
407 | { | ||
408 | hcall(LHCALL_CRASH, __pa(p), 0, 0); | ||
409 | return NOTIFY_DONE; | ||
410 | } | ||
411 | |||
412 | static struct notifier_block paniced = { | ||
413 | .notifier_call = lguest_panic | ||
414 | }; | ||
415 | |||
416 | static __init char *lguest_memory_setup(void) | ||
417 | { | ||
418 | /* We do this here because lockcheck barfs if before start_kernel */ | ||
419 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | ||
420 | |||
421 | e820.nr_map = 0; | ||
422 | add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM); | ||
423 | return "LGUEST"; | ||
424 | } | ||
425 | |||
426 | static const struct lguest_insns | ||
427 | { | ||
428 | const char *start, *end; | ||
429 | } lguest_insns[] = { | ||
430 | [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, | ||
431 | [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, | ||
432 | [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, | ||
433 | [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, | ||
434 | }; | ||
435 | static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) | ||
436 | { | ||
437 | unsigned int insn_len; | ||
438 | |||
439 | /* Don't touch it if we don't have a replacement */ | ||
440 | if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) | ||
441 | return paravirt_patch_default(type, clobber, insns, len); | ||
442 | |||
443 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | ||
444 | |||
445 | /* Similarly if we can't fit replacement. */ | ||
446 | if (len < insn_len) | ||
447 | return paravirt_patch_default(type, clobber, insns, len); | ||
448 | |||
449 | memcpy(insns, lguest_insns[type].start, insn_len); | ||
450 | return insn_len; | ||
451 | } | ||
452 | |||
453 | __init void lguest_init(void) | ||
454 | { | ||
455 | paravirt_ops.name = "lguest"; | ||
456 | paravirt_ops.paravirt_enabled = 1; | ||
457 | paravirt_ops.kernel_rpl = 1; | ||
458 | |||
459 | paravirt_ops.save_fl = save_fl; | ||
460 | paravirt_ops.restore_fl = restore_fl; | ||
461 | paravirt_ops.irq_disable = irq_disable; | ||
462 | paravirt_ops.irq_enable = irq_enable; | ||
463 | paravirt_ops.load_gdt = lguest_load_gdt; | ||
464 | paravirt_ops.memory_setup = lguest_memory_setup; | ||
465 | paravirt_ops.cpuid = lguest_cpuid; | ||
466 | paravirt_ops.write_cr3 = lguest_write_cr3; | ||
467 | paravirt_ops.flush_tlb_user = lguest_flush_tlb_user; | ||
468 | paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; | ||
469 | paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; | ||
470 | paravirt_ops.set_pte = lguest_set_pte; | ||
471 | paravirt_ops.set_pte_at = lguest_set_pte_at; | ||
472 | paravirt_ops.set_pmd = lguest_set_pmd; | ||
473 | #ifdef CONFIG_X86_LOCAL_APIC | ||
474 | paravirt_ops.apic_write = lguest_apic_write; | ||
475 | paravirt_ops.apic_write_atomic = lguest_apic_write; | ||
476 | paravirt_ops.apic_read = lguest_apic_read; | ||
477 | #endif | ||
478 | paravirt_ops.load_idt = lguest_load_idt; | ||
479 | paravirt_ops.iret = lguest_iret; | ||
480 | paravirt_ops.load_esp0 = lguest_load_esp0; | ||
481 | paravirt_ops.load_tr_desc = lguest_load_tr_desc; | ||
482 | paravirt_ops.set_ldt = lguest_set_ldt; | ||
483 | paravirt_ops.load_tls = lguest_load_tls; | ||
484 | paravirt_ops.set_debugreg = lguest_set_debugreg; | ||
485 | paravirt_ops.clts = lguest_clts; | ||
486 | paravirt_ops.read_cr0 = lguest_read_cr0; | ||
487 | paravirt_ops.write_cr0 = lguest_write_cr0; | ||
488 | paravirt_ops.init_IRQ = lguest_init_IRQ; | ||
489 | paravirt_ops.read_cr2 = lguest_read_cr2; | ||
490 | paravirt_ops.read_cr3 = lguest_read_cr3; | ||
491 | paravirt_ops.read_cr4 = lguest_read_cr4; | ||
492 | paravirt_ops.write_cr4 = lguest_write_cr4; | ||
493 | paravirt_ops.write_gdt_entry = lguest_write_gdt_entry; | ||
494 | paravirt_ops.write_idt_entry = lguest_write_idt_entry; | ||
495 | paravirt_ops.patch = lguest_patch; | ||
496 | paravirt_ops.safe_halt = lguest_safe_halt; | ||
497 | paravirt_ops.get_wallclock = lguest_get_wallclock; | ||
498 | paravirt_ops.time_init = lguest_time_init; | ||
499 | paravirt_ops.set_lazy_mode = lguest_lazy_mode; | ||
500 | paravirt_ops.wbinvd = lguest_wbinvd; | ||
501 | paravirt_ops.sched_clock = lguest_sched_clock; | ||
502 | |||
503 | hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); | ||
504 | strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE); | ||
505 | |||
506 | /* We use top of mem for initial pagetables. */ | ||
507 | init_pg_tables_end = __pa(pg0); | ||
508 | |||
509 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); | ||
510 | |||
511 | reserve_top_address(lguest_data.reserve_mem); | ||
512 | |||
513 | lockdep_init(); | ||
514 | |||
515 | paravirt_disable_iospace(); | ||
516 | |||
517 | cpu_detect(&new_cpu_data); | ||
518 | /* head.S usually sets up the first capability word, so do it here. */ | ||
519 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | ||
520 | |||
521 | /* Math is always hard! */ | ||
522 | new_cpu_data.hard_math = 1; | ||
523 | |||
524 | #ifdef CONFIG_X86_MCE | ||
525 | mce_disabled = 1; | ||
526 | #endif | ||
527 | |||
528 | #ifdef CONFIG_ACPI | ||
529 | acpi_disabled = 1; | ||
530 | acpi_ht = 0; | ||
531 | #endif | ||
532 | |||
533 | add_preferred_console("hvc", 0, NULL); | ||
534 | |||
535 | if (boot->initrd_size) { | ||
536 | /* We stash this at top of memory. */ | ||
537 | INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size; | ||
538 | INITRD_SIZE = boot->initrd_size; | ||
539 | LOADER_TYPE = 0xFF; | ||
540 | } | ||
541 | |||
542 | pm_power_off = lguest_power_off; | ||
543 | start_kernel(); | ||
544 | } | ||
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S new file mode 100644 index 000000000000..5ac3d20bb184 --- /dev/null +++ b/drivers/lguest/lguest_asm.S | |||
@@ -0,0 +1,53 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/lguest.h> | ||
3 | #include <asm/asm-offsets.h> | ||
4 | #include <asm/thread_info.h> | ||
5 | |||
6 | /* FIXME: Once asm/processor-flags.h goes in, include that */ | ||
7 | #define X86_EFLAGS_IF 0x00000200 | ||
8 | |||
9 | /* | ||
10 | * This is where we begin: we have a magic signature which the launcher looks | ||
11 | * for. The plan is that the Linux boot protocol will be extended with a | ||
12 | * "platform type" field which will guide us here from the normal entry point, | ||
13 | * but for the moment this suffices. | ||
14 | * | ||
15 | * We put it in .init.text so that it will be discarded after boot. | ||
16 | */ | ||
17 | .section .init.text, "ax", @progbits | ||
18 | .ascii "GenuineLguest" | ||
19 | /* Set up initial stack. */ | ||
20 | movl $(init_thread_union+THREAD_SIZE),%esp | ||
21 | jmp lguest_init | ||
22 | |||
23 | /* The templates for inline patching. */ | ||
24 | #define LGUEST_PATCH(name, insns...) \ | ||
25 | lgstart_##name: insns; lgend_##name:; \ | ||
26 | .globl lgstart_##name; .globl lgend_##name | ||
27 | |||
28 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | ||
29 | LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) | ||
30 | LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) | ||
31 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | ||
32 | |||
33 | .text | ||
34 | /* These demark the EIP range where host should never deliver interrupts. */ | ||
35 | .global lguest_noirq_start | ||
36 | .global lguest_noirq_end | ||
37 | |||
38 | /* | ||
39 | * We move eflags word to lguest_data.irq_enabled to restore interrupt state. | ||
40 | * For page faults, gpfs and virtual interrupts, the hypervisor has saved | ||
41 | * eflags manually, otherwise it was delivered directly and so eflags reflects | ||
42 | * the real machine IF state, ie. interrupts on. Since the kernel always dies | ||
43 | * if it takes such a trap with interrupts disabled anyway, turning interrupts | ||
44 | * back on unconditionally here is OK. | ||
45 | */ | ||
46 | ENTRY(lguest_iret) | ||
47 | pushl %eax | ||
48 | movl 12(%esp), %eax | ||
49 | lguest_noirq_start: | ||
50 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled | ||
51 | popl %eax | ||
52 | iret | ||
53 | lguest_noirq_end: | ||
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c new file mode 100644 index 000000000000..18d6ab21a43b --- /dev/null +++ b/drivers/lguest/lguest_bus.c | |||
@@ -0,0 +1,148 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/bootmem.h> | ||
3 | #include <linux/lguest_bus.h> | ||
4 | #include <asm/io.h> | ||
5 | |||
6 | static ssize_t type_show(struct device *_dev, | ||
7 | struct device_attribute *attr, char *buf) | ||
8 | { | ||
9 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
10 | return sprintf(buf, "%hu", lguest_devices[dev->index].type); | ||
11 | } | ||
12 | static ssize_t features_show(struct device *_dev, | ||
13 | struct device_attribute *attr, char *buf) | ||
14 | { | ||
15 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
16 | return sprintf(buf, "%hx", lguest_devices[dev->index].features); | ||
17 | } | ||
18 | static ssize_t pfn_show(struct device *_dev, | ||
19 | struct device_attribute *attr, char *buf) | ||
20 | { | ||
21 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
22 | return sprintf(buf, "%u", lguest_devices[dev->index].pfn); | ||
23 | } | ||
24 | static ssize_t status_show(struct device *_dev, | ||
25 | struct device_attribute *attr, char *buf) | ||
26 | { | ||
27 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
28 | return sprintf(buf, "%hx", lguest_devices[dev->index].status); | ||
29 | } | ||
30 | static ssize_t status_store(struct device *_dev, struct device_attribute *attr, | ||
31 | const char *buf, size_t count) | ||
32 | { | ||
33 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
34 | if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) | ||
35 | return -EINVAL; | ||
36 | return count; | ||
37 | } | ||
38 | static struct device_attribute lguest_dev_attrs[] = { | ||
39 | __ATTR_RO(type), | ||
40 | __ATTR_RO(features), | ||
41 | __ATTR_RO(pfn), | ||
42 | __ATTR(status, 0644, status_show, status_store), | ||
43 | __ATTR_NULL | ||
44 | }; | ||
45 | |||
46 | static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) | ||
47 | { | ||
48 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
49 | struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); | ||
50 | |||
51 | return (drv->device_type == lguest_devices[dev->index].type); | ||
52 | } | ||
53 | |||
54 | struct lguest_bus { | ||
55 | struct bus_type bus; | ||
56 | struct device dev; | ||
57 | }; | ||
58 | |||
59 | static struct lguest_bus lguest_bus = { | ||
60 | .bus = { | ||
61 | .name = "lguest", | ||
62 | .match = lguest_dev_match, | ||
63 | .dev_attrs = lguest_dev_attrs, | ||
64 | }, | ||
65 | .dev = { | ||
66 | .parent = NULL, | ||
67 | .bus_id = "lguest", | ||
68 | } | ||
69 | }; | ||
70 | |||
71 | static int lguest_dev_probe(struct device *_dev) | ||
72 | { | ||
73 | int ret; | ||
74 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | ||
75 | struct lguest_driver *drv = container_of(dev->dev.driver, | ||
76 | struct lguest_driver, drv); | ||
77 | |||
78 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; | ||
79 | ret = drv->probe(dev); | ||
80 | if (ret == 0) | ||
81 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; | ||
82 | return ret; | ||
83 | } | ||
84 | |||
85 | int register_lguest_driver(struct lguest_driver *drv) | ||
86 | { | ||
87 | if (!lguest_devices) | ||
88 | return 0; | ||
89 | |||
90 | drv->drv.bus = &lguest_bus.bus; | ||
91 | drv->drv.name = drv->name; | ||
92 | drv->drv.owner = drv->owner; | ||
93 | drv->drv.probe = lguest_dev_probe; | ||
94 | |||
95 | return driver_register(&drv->drv); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(register_lguest_driver); | ||
98 | |||
99 | static void add_lguest_device(unsigned int index) | ||
100 | { | ||
101 | struct lguest_device *new; | ||
102 | |||
103 | lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; | ||
104 | new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); | ||
105 | if (!new) { | ||
106 | printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); | ||
107 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; | ||
108 | return; | ||
109 | } | ||
110 | |||
111 | new->index = index; | ||
112 | new->private = NULL; | ||
113 | memset(&new->dev, 0, sizeof(new->dev)); | ||
114 | new->dev.parent = &lguest_bus.dev; | ||
115 | new->dev.bus = &lguest_bus.bus; | ||
116 | sprintf(new->dev.bus_id, "%u", index); | ||
117 | if (device_register(&new->dev) != 0) { | ||
118 | printk(KERN_EMERG "Cannot register lguest device %u\n", index); | ||
119 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; | ||
120 | kfree(new); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | static void scan_devices(void) | ||
125 | { | ||
126 | unsigned int i; | ||
127 | |||
128 | for (i = 0; i < LGUEST_MAX_DEVICES; i++) | ||
129 | if (lguest_devices[i].type) | ||
130 | add_lguest_device(i); | ||
131 | } | ||
132 | |||
133 | static int __init lguest_bus_init(void) | ||
134 | { | ||
135 | if (strcmp(paravirt_ops.name, "lguest") != 0) | ||
136 | return 0; | ||
137 | |||
138 | /* Devices are in page above top of "normal" mem. */ | ||
139 | lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); | ||
140 | |||
141 | if (bus_register(&lguest_bus.bus) != 0 | ||
142 | || device_register(&lguest_bus.dev) != 0) | ||
143 | panic("lguest bus registration failed"); | ||
144 | |||
145 | scan_devices(); | ||
146 | return 0; | ||
147 | } | ||
148 | postcore_initcall(lguest_bus_init); | ||
diff --git a/include/linux/lguest.h b/include/linux/lguest.h new file mode 100644 index 000000000000..f30c04fc22b7 --- /dev/null +++ b/include/linux/lguest.h | |||
@@ -0,0 +1,85 @@ | |||
1 | /* Things the lguest guest needs to know. Note: like all lguest interfaces, | ||
2 | * this is subject to wild and random change between versions. */ | ||
3 | #ifndef _ASM_LGUEST_H | ||
4 | #define _ASM_LGUEST_H | ||
5 | |||
6 | /* These are randomly chosen numbers which indicate we're an lguest at boot */ | ||
7 | #define LGUEST_MAGIC_EBP 0x4C687970 | ||
8 | #define LGUEST_MAGIC_EDI 0x652D4D65 | ||
9 | #define LGUEST_MAGIC_ESI 0xFFFFFFFF | ||
10 | |||
11 | #ifndef __ASSEMBLY__ | ||
12 | #include <asm/irq.h> | ||
13 | |||
14 | #define LHCALL_FLUSH_ASYNC 0 | ||
15 | #define LHCALL_LGUEST_INIT 1 | ||
16 | #define LHCALL_CRASH 2 | ||
17 | #define LHCALL_LOAD_GDT 3 | ||
18 | #define LHCALL_NEW_PGTABLE 4 | ||
19 | #define LHCALL_FLUSH_TLB 5 | ||
20 | #define LHCALL_LOAD_IDT_ENTRY 6 | ||
21 | #define LHCALL_SET_STACK 7 | ||
22 | #define LHCALL_TS 8 | ||
23 | #define LHCALL_TIMER_READ 9 | ||
24 | #define LHCALL_HALT 10 | ||
25 | #define LHCALL_GET_WALLCLOCK 11 | ||
26 | #define LHCALL_BIND_DMA 12 | ||
27 | #define LHCALL_SEND_DMA 13 | ||
28 | #define LHCALL_SET_PTE 14 | ||
29 | #define LHCALL_SET_PMD 15 | ||
30 | #define LHCALL_LOAD_TLS 16 | ||
31 | |||
32 | #define LGUEST_TRAP_ENTRY 0x1F | ||
33 | |||
34 | static inline unsigned long | ||
35 | hcall(unsigned long call, | ||
36 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
37 | { | ||
38 | asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) | ||
39 | : "=a"(call) | ||
40 | : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) | ||
41 | : "memory"); | ||
42 | return call; | ||
43 | } | ||
44 | |||
45 | void async_hcall(unsigned long call, | ||
46 | unsigned long arg1, unsigned long arg2, unsigned long arg3); | ||
47 | |||
48 | /* Can't use our min() macro here: needs to be a constant */ | ||
49 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) | ||
50 | |||
51 | #define LHCALL_RING_SIZE 64 | ||
52 | struct hcall_ring | ||
53 | { | ||
54 | u32 eax, edx, ebx, ecx; | ||
55 | }; | ||
56 | |||
57 | /* All the good stuff happens here: guest registers it with LGUEST_INIT */ | ||
58 | struct lguest_data | ||
59 | { | ||
60 | /* Fields which change during running: */ | ||
61 | /* 512 == enabled (same as eflags) */ | ||
62 | unsigned int irq_enabled; | ||
63 | /* Interrupts blocked by guest. */ | ||
64 | DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); | ||
65 | |||
66 | /* Virtual address of page fault. */ | ||
67 | unsigned long cr2; | ||
68 | |||
69 | /* Async hypercall ring. 0xFF == done, 0 == pending. */ | ||
70 | u8 hcall_status[LHCALL_RING_SIZE]; | ||
71 | struct hcall_ring hcalls[LHCALL_RING_SIZE]; | ||
72 | |||
73 | /* Fields initialized by the hypervisor at boot: */ | ||
74 | /* Memory not to try to access */ | ||
75 | unsigned long reserve_mem; | ||
76 | /* ID of this guest (used by network driver to set ethernet address) */ | ||
77 | u16 guestid; | ||
78 | |||
79 | /* Fields initialized by the guest at boot: */ | ||
80 | /* Instruction range to suppress interrupts even if enabled */ | ||
81 | unsigned long noirq_start, noirq_end; | ||
82 | }; | ||
83 | extern struct lguest_data lguest_data; | ||
84 | #endif /* __ASSEMBLY__ */ | ||
85 | #endif /* _ASM_LGUEST_H */ | ||
diff --git a/include/linux/lguest_bus.h b/include/linux/lguest_bus.h new file mode 100644 index 000000000000..c9b4e05fee49 --- /dev/null +++ b/include/linux/lguest_bus.h | |||
@@ -0,0 +1,48 @@ | |||
1 | #ifndef _ASM_LGUEST_DEVICE_H | ||
2 | #define _ASM_LGUEST_DEVICE_H | ||
3 | /* Everything you need to know about lguest devices. */ | ||
4 | #include <linux/device.h> | ||
5 | #include <linux/lguest.h> | ||
6 | #include <linux/lguest_launcher.h> | ||
7 | |||
8 | struct lguest_device { | ||
9 | /* Unique busid, and index into lguest_devices[] */ | ||
10 | unsigned int index; | ||
11 | |||
12 | struct device dev; | ||
13 | |||
14 | /* Driver can hang data off here. */ | ||
15 | void *private; | ||
16 | }; | ||
17 | |||
18 | /* By convention, each device can use irq index+1 if it wants to. */ | ||
19 | static inline int lgdev_irq(const struct lguest_device *dev) | ||
20 | { | ||
21 | return dev->index + 1; | ||
22 | } | ||
23 | |||
24 | /* dma args must not be vmalloced! */ | ||
25 | void lguest_send_dma(unsigned long key, struct lguest_dma *dma); | ||
26 | int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, | ||
27 | unsigned int num, u8 irq); | ||
28 | void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas); | ||
29 | |||
30 | /* Map the virtual device space */ | ||
31 | void *lguest_map(unsigned long phys_addr, unsigned long pages); | ||
32 | void lguest_unmap(void *); | ||
33 | |||
34 | struct lguest_driver { | ||
35 | const char *name; | ||
36 | struct module *owner; | ||
37 | u16 device_type; | ||
38 | int (*probe)(struct lguest_device *dev); | ||
39 | void (*remove)(struct lguest_device *dev); | ||
40 | |||
41 | struct device_driver drv; | ||
42 | }; | ||
43 | |||
44 | extern int register_lguest_driver(struct lguest_driver *drv); | ||
45 | extern void unregister_lguest_driver(struct lguest_driver *drv); | ||
46 | |||
47 | extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ | ||
48 | #endif /* _ASM_LGUEST_DEVICE_H */ | ||