diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-10-21 21:03:36 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2007-10-23 01:49:54 -0400 |
commit | 47436aa4ad054c1c7c8231618e86ebd9305308dc (patch) | |
tree | a9ba6e0521f9116442144a86e781a3164ec86094 /drivers | |
parent | c18acd73ffc209def08003a1927473096f66c5ad (diff) |
Boot with virtual == physical to get closer to native Linux.
1) This allows us to get a lot closer to booting bzImages.
2) It means we don't have to know page_offset.
3) The Guest needs to modify the boot pagetables to create the
PAGE_OFFSET mapping before jumping to C code.
4) guest_pa() walks the page tables rather than using page_offset.
5) We don't use page_offset to figure out whether to emulate: it was
always kinda questionable, and won't work for instructions done
before remapping (bzImage unpacking in particular).
6) We still want the kernel address for tlb flushing: have the initial
hypercall give us that, too.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/lguest/hypercalls.c | 8 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 13 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 8 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 11 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 47 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 7 |
6 files changed, 62 insertions, 32 deletions
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 02d0ae268267..13b5f2f813de 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -181,15 +181,15 @@ static void initialize(struct lguest *lg) | |||
181 | /* The Guest tells us where we're not to deliver interrupts by putting | 181 | /* The Guest tells us where we're not to deliver interrupts by putting |
182 | * the range of addresses into "struct lguest_data". */ | 182 | * the range of addresses into "struct lguest_data". */ |
183 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) | 183 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) |
184 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) | 184 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) |
185 | /* We tell the Guest that it can't use the top 4MB of virtual | ||
186 | * addresses used by the Switcher. */ | ||
187 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)) | ||
188 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 185 | kill_guest(lg, "bad guest page %p", lg->lguest_data); |
189 | 186 | ||
190 | /* We write the current time into the Guest's data page once now. */ | 187 | /* We write the current time into the Guest's data page once now. */ |
191 | write_timestamp(lg); | 188 | write_timestamp(lg); |
192 | 189 | ||
190 | /* page_tables.c will also do some setup. */ | ||
191 | page_table_guest_data_init(lg); | ||
192 | |||
193 | /* This is the one case where the above accesses might have been the | 193 | /* This is the one case where the above accesses might have been the |
194 | * first write to a Guest page. This may have caused a copy-on-write | 194 | * first write to a Guest page. This may have caused a copy-on-write |
195 | * fault, but the Guest might be referring to the old (read-only) | 195 | * fault, but the Guest might be referring to the old (read-only) |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index a57d757eab6e..3271c0031a1b 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -62,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | |||
62 | * it). */ | 62 | * it). */ |
63 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | 63 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) |
64 | { | 64 | { |
65 | unsigned long gstack; | 65 | unsigned long gstack, origstack; |
66 | u32 eflags, ss, irq_enable; | 66 | u32 eflags, ss, irq_enable; |
67 | unsigned long virtstack; | ||
67 | 68 | ||
68 | /* There are two cases for interrupts: one where the Guest is already | 69 | /* There are two cases for interrupts: one where the Guest is already |
69 | * in the kernel, and a more complex one where the Guest is in | 70 | * in the kernel, and a more complex one where the Guest is in |
@@ -71,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | |||
71 | if ((lg->regs->ss&0x3) != GUEST_PL) { | 72 | if ((lg->regs->ss&0x3) != GUEST_PL) { |
72 | /* The Guest told us their kernel stack with the SET_STACK | 73 | /* The Guest told us their kernel stack with the SET_STACK |
73 | * hypercall: both the virtual address and the segment */ | 74 | * hypercall: both the virtual address and the segment */ |
74 | gstack = guest_pa(lg, lg->esp1); | 75 | virtstack = lg->esp1; |
75 | ss = lg->ss1; | 76 | ss = lg->ss1; |
77 | |||
78 | origstack = gstack = guest_pa(lg, virtstack); | ||
76 | /* We push the old stack segment and pointer onto the new | 79 | /* We push the old stack segment and pointer onto the new |
77 | * stack: when the Guest does an "iret" back from the interrupt | 80 | * stack: when the Guest does an "iret" back from the interrupt |
78 | * handler the CPU will notice they're dropping privilege | 81 | * handler the CPU will notice they're dropping privilege |
@@ -81,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | |||
81 | push_guest_stack(lg, &gstack, lg->regs->esp); | 84 | push_guest_stack(lg, &gstack, lg->regs->esp); |
82 | } else { | 85 | } else { |
83 | /* We're staying on the same Guest (kernel) stack. */ | 86 | /* We're staying on the same Guest (kernel) stack. */ |
84 | gstack = guest_pa(lg, lg->regs->esp); | 87 | virtstack = lg->regs->esp; |
85 | ss = lg->regs->ss; | 88 | ss = lg->regs->ss; |
89 | |||
90 | origstack = gstack = guest_pa(lg, virtstack); | ||
86 | } | 91 | } |
87 | 92 | ||
88 | /* Remember that we never let the Guest actually disable interrupts, so | 93 | /* Remember that we never let the Guest actually disable interrupts, so |
@@ -108,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | |||
108 | /* Now we've pushed all the old state, we change the stack, the code | 113 | /* Now we've pushed all the old state, we change the stack, the code |
109 | * segment and the address to execute. */ | 114 | * segment and the address to execute. */ |
110 | lg->regs->ss = ss; | 115 | lg->regs->ss = ss; |
111 | lg->regs->esp = gstack + lg->page_offset; | 116 | lg->regs->esp = virtstack + (gstack - origstack); |
112 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); | 117 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); |
113 | lg->regs->eip = idt_address(lo, hi); | 118 | lg->regs->eip = idt_address(lo, hi); |
114 | 119 | ||
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 7408cebe995e..e4845d7f0688 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -63,7 +63,7 @@ struct lguest | |||
63 | /* This provides the offset to the base of guest-physical | 63 | /* This provides the offset to the base of guest-physical |
64 | * memory in the Launcher. */ | 64 | * memory in the Launcher. */ |
65 | void __user *mem_base; | 65 | void __user *mem_base; |
66 | u32 page_offset; | 66 | unsigned long kernel_address; |
67 | u32 cr2; | 67 | u32 cr2; |
68 | int halted; | 68 | int halted; |
69 | int ts; | 69 | int ts; |
@@ -165,6 +165,8 @@ void guest_set_pte(struct lguest *lg, unsigned long gpgdir, | |||
165 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); | 165 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); |
166 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); | 166 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); |
167 | void pin_page(struct lguest *lg, unsigned long vaddr); | 167 | void pin_page(struct lguest *lg, unsigned long vaddr); |
168 | unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); | ||
169 | void page_table_guest_data_init(struct lguest *lg); | ||
168 | 170 | ||
169 | /* <arch>/core.c: */ | 171 | /* <arch>/core.c: */ |
170 | void lguest_arch_host_init(void); | 172 | void lguest_arch_host_init(void); |
@@ -229,9 +231,5 @@ do { \ | |||
229 | } while(0) | 231 | } while(0) |
230 | /* (End of aside) :*/ | 232 | /* (End of aside) :*/ |
231 | 233 | ||
232 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | ||
233 | { | ||
234 | return vaddr - lg->page_offset; | ||
235 | } | ||
236 | #endif /* __ASSEMBLY__ */ | 234 | #endif /* __ASSEMBLY__ */ |
237 | #endif /* _LGUEST_H */ | 235 | #endif /* _LGUEST_H */ |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b184652e45d7..61b177e1e649 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -111,7 +111,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
111 | return run_guest(lg, (unsigned long __user *)user); | 111 | return run_guest(lg, (unsigned long __user *)user); |
112 | } | 112 | } |
113 | 113 | ||
114 | /*L:020 The initialization write supplies 5 pointer sized (32 or 64 bit) | 114 | /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) |
115 | * values (in addition to the LHREQ_INITIALIZE value). These are: | 115 | * values (in addition to the LHREQ_INITIALIZE value). These are: |
116 | * | 116 | * |
117 | * base: The start of the Guest-physical memory inside the Launcher memory. | 117 | * base: The start of the Guest-physical memory inside the Launcher memory. |
@@ -124,12 +124,6 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
124 | * pagetables (which are set up by the Launcher). | 124 | * pagetables (which are set up by the Launcher). |
125 | * | 125 | * |
126 | * start: The first instruction to execute ("eip" in x86-speak). | 126 | * start: The first instruction to execute ("eip" in x86-speak). |
127 | * | ||
128 | * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should | ||
129 | * probably wean the code off this, but it's a very useful constant! Any | ||
130 | * address above this is within the Guest kernel, and any kernel address can | ||
131 | * quickly converted from physical to virtual by adding PAGE_OFFSET. It's | ||
132 | * 0xC0000000 (3G) by default, but it's configurable at kernel build time. | ||
133 | */ | 127 | */ |
134 | static int initialize(struct file *file, const unsigned long __user *input) | 128 | static int initialize(struct file *file, const unsigned long __user *input) |
135 | { | 129 | { |
@@ -137,7 +131,7 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
137 | * Guest. */ | 131 | * Guest. */ |
138 | struct lguest *lg; | 132 | struct lguest *lg; |
139 | int err; | 133 | int err; |
140 | unsigned long args[5]; | 134 | unsigned long args[4]; |
141 | 135 | ||
142 | /* We grab the Big Lguest lock, which protects against multiple | 136 | /* We grab the Big Lguest lock, which protects against multiple |
143 | * simultaneous initializations. */ | 137 | * simultaneous initializations. */ |
@@ -162,7 +156,6 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
162 | /* Populate the easy fields of our "struct lguest" */ | 156 | /* Populate the easy fields of our "struct lguest" */ |
163 | lg->mem_base = (void __user *)(long)args[0]; | 157 | lg->mem_base = (void __user *)(long)args[0]; |
164 | lg->pfn_limit = args[1]; | 158 | lg->pfn_limit = args[1]; |
165 | lg->page_offset = args[4]; | ||
166 | 159 | ||
167 | /* We need a complete page for the Guest registers: they are accessible | 160 | /* We need a complete page for the Guest registers: they are accessible |
168 | * to the Guest and we can only grant it access to whole pages. */ | 161 | * to the Guest and we can only grant it access to whole pages. */ |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index bfe3650b28d6..fe3c7575647b 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/random.h> | 13 | #include <linux/random.h> |
14 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
15 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
16 | #include <asm/uaccess.h> | ||
16 | #include "lg.h" | 17 | #include "lg.h" |
17 | 18 | ||
18 | /*M:008 We hold reference to pages, which prevents them from being swapped. | 19 | /*M:008 We hold reference to pages, which prevents them from being swapped. |
@@ -345,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
345 | { | 346 | { |
346 | unsigned int i; | 347 | unsigned int i; |
347 | /* Release every pgd entry up to the kernel's address. */ | 348 | /* Release every pgd entry up to the kernel's address. */ |
348 | for (i = 0; i < pgd_index(lg->page_offset); i++) | 349 | for (i = 0; i < pgd_index(lg->kernel_address); i++) |
349 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 350 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
350 | } | 351 | } |
351 | 352 | ||
@@ -358,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg) | |||
358 | } | 359 | } |
359 | /*:*/ | 360 | /*:*/ |
360 | 361 | ||
362 | /* We walk down the guest page tables to get a guest-physical address */ | ||
363 | unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | ||
364 | { | ||
365 | pgd_t gpgd; | ||
366 | pte_t gpte; | ||
367 | |||
368 | /* First step: get the top-level Guest page table entry. */ | ||
369 | gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | ||
370 | /* Toplevel not present? We can't map it in. */ | ||
371 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
372 | kill_guest(lg, "Bad address %#lx", vaddr); | ||
373 | |||
374 | gpte = __pte(lgread_u32(lg, gpte_addr(lg, gpgd, vaddr))); | ||
375 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | ||
376 | kill_guest(lg, "Bad address %#lx", vaddr); | ||
377 | |||
378 | return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); | ||
379 | } | ||
380 | |||
361 | /* We keep several page tables. This is a simple routine to find the page | 381 | /* We keep several page tables. This is a simple routine to find the page |
362 | * table (if any) corresponding to this top-level address the Guest has given | 382 | * table (if any) corresponding to this top-level address the Guest has given |
363 | * us. */ | 383 | * us. */ |
@@ -500,7 +520,7 @@ void guest_set_pte(struct lguest *lg, | |||
500 | { | 520 | { |
501 | /* Kernel mappings must be changed on all top levels. Slow, but | 521 | /* Kernel mappings must be changed on all top levels. Slow, but |
502 | * doesn't happen often. */ | 522 | * doesn't happen often. */ |
503 | if (vaddr >= lg->page_offset) { | 523 | if (vaddr >= lg->kernel_address) { |
504 | unsigned int i; | 524 | unsigned int i; |
505 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 525 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
506 | if (lg->pgdirs[i].pgdir) | 526 | if (lg->pgdirs[i].pgdir) |
@@ -550,11 +570,6 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
550 | * its first page table is. We set some things up here: */ | 570 | * its first page table is. We set some things up here: */ |
551 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | 571 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) |
552 | { | 572 | { |
553 | /* In flush_user_mappings() we loop from 0 to | ||
554 | * "pgd_index(lg->page_offset)". This assumes it won't hit | ||
555 | * the Switcher mappings, so check that now. */ | ||
556 | if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | ||
557 | return -EINVAL; | ||
558 | /* We start on the first shadow page table, and give it a blank PGD | 573 | /* We start on the first shadow page table, and give it a blank PGD |
559 | * page. */ | 574 | * page. */ |
560 | lg->pgdidx = 0; | 575 | lg->pgdidx = 0; |
@@ -565,6 +580,24 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | |||
565 | return 0; | 580 | return 0; |
566 | } | 581 | } |
567 | 582 | ||
583 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | ||
584 | void page_table_guest_data_init(struct lguest *lg) | ||
585 | { | ||
586 | /* We get the kernel address: above this is all kernel memory. */ | ||
587 | if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) | ||
588 | /* We tell the Guest that it can't use the top 4MB of virtual | ||
589 | * addresses used by the Switcher. */ | ||
590 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | ||
591 | || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) | ||
592 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | ||
593 | |||
594 | /* In flush_user_mappings() we loop from 0 to | ||
595 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | ||
596 | * Switcher mappings, so check that now. */ | ||
597 | if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) | ||
598 | kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); | ||
599 | } | ||
600 | |||
568 | /* When a Guest dies, our cleanup is fairly simple. */ | 601 | /* When a Guest dies, our cleanup is fairly simple. */ |
569 | void free_guest_pagetable(struct lguest *lg) | 602 | void free_guest_pagetable(struct lguest *lg) |
570 | { | 603 | { |
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index a125109446dc..39f64c95de18 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -216,9 +216,10 @@ static int emulate_insn(struct lguest *lg) | |||
216 | * guest_pa just subtracts the Guest's page_offset. */ | 216 | * guest_pa just subtracts the Guest's page_offset. */ |
217 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); | 217 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); |
218 | 218 | ||
219 | /* The guest_pa() function only works for Guest kernel addresses, but | 219 | /* This must be the Guest kernel trying to do something, not userspace! |
220 | * that's all we're trying to do anyway. */ | 220 | * The bottom two bits of the CS segment register are the privilege |
221 | if (lg->regs->eip < lg->page_offset) | 221 | * level. */ |
222 | if ((lg->regs->cs & 3) != GUEST_PL) | ||
222 | return 0; | 223 | return 0; |
223 | 224 | ||
224 | /* Decoding x86 instructions is icky. */ | 225 | /* Decoding x86 instructions is icky. */ |