diff options
author | Matias Zabaljauregui <zabaljauregui@gmail.com> | 2009-06-13 00:27:07 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-06-12 08:57:08 -0400 |
commit | acdd0b6292b282c4511897ac2691a47befbf1c6a (patch) | |
tree | 1bfcfc32b11d35e99fec5bbf52b19d6ee038f25e /drivers/lguest | |
parent | cefcad1773197523e11e18b669f245e6a8d32058 (diff) |
lguest: PAE support
This version requires that the host and the guest have the same PAE status.
The NX capability is not offered to the guest yet.
Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- | drivers/lguest/Kconfig | 2 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 10 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 5 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 351 |
4 files changed, 329 insertions, 39 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index a3d3cbab359a..8f63845db830 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config LGUEST | 1 | config LGUEST |
2 | tristate "Linux hypervisor example code" | 2 | tristate "Linux hypervisor example code" |
3 | depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX | 3 | depends on X86_32 && EXPERIMENTAL && FUTEX |
4 | select HVC_DRIVER | 4 | select HVC_DRIVER |
5 | ---help--- | 5 | ---help--- |
6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 51149ca14617..c29ffa19cb74 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
77 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); | 77 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); |
78 | break; | 78 | break; |
79 | case LHCALL_SET_PTE: | 79 | case LHCALL_SET_PTE: |
80 | #ifdef CONFIG_X86_PAE | ||
81 | guest_set_pte(cpu, args->arg1, args->arg2, | ||
82 | __pte(args->arg3 | (u64)args->arg4 << 32)); | ||
83 | #else | ||
80 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); | 84 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); |
85 | #endif | ||
81 | break; | 86 | break; |
82 | case LHCALL_SET_PGD: | 87 | case LHCALL_SET_PGD: |
83 | guest_set_pgd(cpu->lg, args->arg1, args->arg2); | 88 | guest_set_pgd(cpu->lg, args->arg1, args->arg2); |
84 | break; | 89 | break; |
90 | #ifdef CONFIG_X86_PAE | ||
91 | case LHCALL_SET_PMD: | ||
92 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); | ||
93 | break; | ||
94 | #endif | ||
85 | case LHCALL_SET_CLOCKEVENT: | 95 | case LHCALL_SET_CLOCKEVENT: |
86 | guest_set_clockevent(cpu, args->arg1); | 96 | guest_set_clockevent(cpu, args->arg1); |
87 | break; | 97 | break; |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index cacc2da2058d..6201ce59e886 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); | |||
137 | * in the kernel. */ | 137 | * in the kernel. */ |
138 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) | 138 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) |
139 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) | 139 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) |
140 | #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) | ||
141 | #define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) | ||
140 | 142 | ||
141 | /* interrupts_and_traps.c: */ | 143 | /* interrupts_and_traps.c: */ |
142 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); | 144 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); |
@@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg); | |||
170 | void free_guest_pagetable(struct lguest *lg); | 172 | void free_guest_pagetable(struct lguest *lg); |
171 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); | 173 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); |
172 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); | 174 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); |
175 | #ifdef CONFIG_X86_PAE | ||
176 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); | ||
177 | #endif | ||
173 | void guest_pagetable_clear_all(struct lg_cpu *cpu); | 178 | void guest_pagetable_clear_all(struct lg_cpu *cpu); |
174 | void guest_pagetable_flush_user(struct lg_cpu *cpu); | 179 | void guest_pagetable_flush_user(struct lg_cpu *cpu); |
175 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, | 180 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 6a54d76b6236..5e2c26adcf06 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -53,6 +53,17 @@ | |||
53 | * page. */ | 53 | * page. */ |
54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
55 | 55 | ||
56 | /* For PAE we need the PMD index as well. We use the last 2MB, so we | ||
57 | * will need the last pmd entry of the last pmd page. */ | ||
58 | #ifdef CONFIG_X86_PAE | ||
59 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
60 | #define RESERVE_MEM 2U | ||
61 | #define CHECK_GPGD_MASK _PAGE_PRESENT | ||
62 | #else | ||
63 | #define RESERVE_MEM 4U | ||
64 | #define CHECK_GPGD_MASK _PAGE_TABLE | ||
65 | #endif | ||
66 | |||
56 | /* We actually need a separate PTE page for each CPU. Remember that after the | 67 | /* We actually need a separate PTE page for each CPU. Remember that after the |
57 | * Switcher code itself comes two pages for each CPU, and we don't want this | 68 | * Switcher code itself comes two pages for each CPU, and we don't want this |
58 | * CPU's guest to see the pages of any other CPU. */ | 69 | * CPU's guest to see the pages of any other CPU. */ |
@@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
73 | { | 84 | { |
74 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
75 | 86 | ||
87 | #ifndef CONFIG_X86_PAE | ||
76 | /* We kill any Guest trying to touch the Switcher addresses. */ | 88 | /* We kill any Guest trying to touch the Switcher addresses. */ |
77 | if (index >= SWITCHER_PGD_INDEX) { | 89 | if (index >= SWITCHER_PGD_INDEX) { |
78 | kill_guest(cpu, "attempt to access switcher pages"); | 90 | kill_guest(cpu, "attempt to access switcher pages"); |
79 | index = 0; | 91 | index = 0; |
80 | } | 92 | } |
93 | #endif | ||
81 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 94 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
82 | return &cpu->lg->pgdirs[i].pgdir[index]; | 95 | return &cpu->lg->pgdirs[i].pgdir[index]; |
83 | } | 96 | } |
84 | 97 | ||
98 | #ifdef CONFIG_X86_PAE | ||
99 | /* This routine then takes the PGD entry given above, which contains the | ||
100 | * address of the PMD page. It then returns a pointer to the PMD entry for the | ||
101 | * given address. */ | ||
102 | static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | ||
103 | { | ||
104 | unsigned int index = pmd_index(vaddr); | ||
105 | pmd_t *page; | ||
106 | |||
107 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
108 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
109 | index >= SWITCHER_PMD_INDEX) { | ||
110 | kill_guest(cpu, "attempt to access switcher pages"); | ||
111 | index = 0; | ||
112 | } | ||
113 | |||
114 | /* You should never call this if the PGD entry wasn't valid */ | ||
115 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | ||
116 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | ||
117 | |||
118 | return &page[index]; | ||
119 | } | ||
120 | #endif | ||
121 | |||
85 | /* This routine then takes the page directory entry returned above, which | 122 | /* This routine then takes the page directory entry returned above, which |
86 | * contains the address of the page table entry (PTE) page. It then returns a | 123 | * contains the address of the page table entry (PTE) page. It then returns a |
87 | * pointer to the PTE entry for the given address. */ | 124 | * pointer to the PTE entry for the given address. */ |
88 | static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) | 125 | static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) |
89 | { | 126 | { |
127 | #ifdef CONFIG_X86_PAE | ||
128 | pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); | ||
129 | pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); | ||
130 | |||
131 | /* You should never call this if the PMD entry wasn't valid */ | ||
132 | BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); | ||
133 | #else | ||
90 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 134 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
91 | /* You should never call this if the PGD entry wasn't valid */ | 135 | /* You should never call this if the PGD entry wasn't valid */ |
92 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 136 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
137 | #endif | ||
138 | |||
93 | return &page[pte_index(vaddr)]; | 139 | return &page[pte_index(vaddr)]; |
94 | } | 140 | } |
95 | 141 | ||
@@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
101 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); | 147 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
102 | } | 148 | } |
103 | 149 | ||
104 | static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | 150 | #ifdef CONFIG_X86_PAE |
151 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | ||
105 | { | 152 | { |
106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 153 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 154 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
155 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | ||
156 | } | ||
157 | #endif | ||
158 | |||
159 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
160 | pgd_t gpgd, unsigned long vaddr) | ||
161 | { | ||
162 | #ifdef CONFIG_X86_PAE | ||
163 | pmd_t gpmd; | ||
164 | #endif | ||
165 | unsigned long gpage; | ||
166 | |||
167 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | ||
168 | #ifdef CONFIG_X86_PAE | ||
169 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
170 | gpage = pmd_pfn(gpmd) << PAGE_SHIFT; | ||
171 | BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); | ||
172 | #else | ||
173 | gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | ||
174 | #endif | ||
108 | return gpage + pte_index(vaddr) * sizeof(pte_t); | 175 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
109 | } | 176 | } |
110 | /*:*/ | 177 | /*:*/ |
@@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | |||
184 | 251 | ||
185 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 252 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
186 | { | 253 | { |
187 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || | 254 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
188 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 255 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) |
189 | kill_guest(cpu, "bad page directory entry"); | 256 | kill_guest(cpu, "bad page directory entry"); |
190 | } | 257 | } |
191 | 258 | ||
259 | #ifdef CONFIG_X86_PAE | ||
260 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | ||
261 | { | ||
262 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | ||
263 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | ||
264 | kill_guest(cpu, "bad page middle directory entry"); | ||
265 | } | ||
266 | #endif | ||
267 | |||
192 | /*H:330 | 268 | /*H:330 |
193 | * (i) Looking up a page table entry when the Guest faults. | 269 | * (i) Looking up a page table entry when the Guest faults. |
194 | * | 270 | * |
@@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
207 | pte_t gpte; | 283 | pte_t gpte; |
208 | pte_t *spte; | 284 | pte_t *spte; |
209 | 285 | ||
286 | #ifdef CONFIG_X86_PAE | ||
287 | pmd_t *spmd; | ||
288 | pmd_t gpmd; | ||
289 | #endif | ||
290 | |||
210 | /* First step: get the top-level Guest page table entry. */ | 291 | /* First step: get the top-level Guest page table entry. */ |
211 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 292 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
212 | /* Toplevel not present? We can't map it in. */ | 293 | /* Toplevel not present? We can't map it in. */ |
@@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
228 | check_gpgd(cpu, gpgd); | 309 | check_gpgd(cpu, gpgd); |
229 | /* And we copy the flags to the shadow PGD entry. The page | 310 | /* And we copy the flags to the shadow PGD entry. The page |
230 | * number in the shadow PGD is the page we just allocated. */ | 311 | * number in the shadow PGD is the page we just allocated. */ |
231 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); | 312 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); |
232 | } | 313 | } |
233 | 314 | ||
315 | #ifdef CONFIG_X86_PAE | ||
316 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
317 | /* middle level not present? We can't map it in. */ | ||
318 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
319 | return false; | ||
320 | |||
321 | /* Now look at the matching shadow entry. */ | ||
322 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
323 | |||
324 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | ||
325 | /* No shadow entry: allocate a new shadow PTE page. */ | ||
326 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | ||
327 | |||
328 | /* This is not really the Guest's fault, but killing it is | ||
329 | * simple for this corner case. */ | ||
330 | if (!ptepage) { | ||
331 | kill_guest(cpu, "out of memory allocating pte page"); | ||
332 | return false; | ||
333 | } | ||
334 | |||
335 | /* We check that the Guest pmd is OK. */ | ||
336 | check_gpmd(cpu, gpmd); | ||
337 | |||
338 | /* And we copy the flags to the shadow PMD entry. The page | ||
339 | * number in the shadow PMD is the page we just allocated. */ | ||
340 | native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | ||
341 | } | ||
342 | #endif | ||
234 | /* OK, now we look at the lower level in the Guest page table: keep its | 343 | /* OK, now we look at the lower level in the Guest page table: keep its |
235 | * address, because we might update it later. */ | 344 | * address, because we might update it later. */ |
236 | gpte_ptr = gpte_addr(gpgd, vaddr); | 345 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
237 | gpte = lgread(cpu, gpte_ptr, pte_t); | 346 | gpte = lgread(cpu, gpte_ptr, pte_t); |
238 | 347 | ||
239 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 348 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
@@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
259 | gpte = pte_mkdirty(gpte); | 368 | gpte = pte_mkdirty(gpte); |
260 | 369 | ||
261 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 370 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
262 | spte = spte_addr(*spgd, vaddr); | 371 | spte = spte_addr(cpu, *spgd, vaddr); |
263 | /* If there was a valid shadow PTE entry here before, we release it. | 372 | /* If there was a valid shadow PTE entry here before, we release it. |
264 | * This can happen with a write to a previously read-only entry. */ | 373 | * This can happen with a write to a previously read-only entry. */ |
265 | release_pte(*spte); | 374 | release_pte(*spte); |
@@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | |||
301 | pgd_t *spgd; | 410 | pgd_t *spgd; |
302 | unsigned long flags; | 411 | unsigned long flags; |
303 | 412 | ||
413 | #ifdef CONFIG_X86_PAE | ||
414 | pmd_t *spmd; | ||
415 | #endif | ||
304 | /* Look at the current top level entry: is it present? */ | 416 | /* Look at the current top level entry: is it present? */ |
305 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 417 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
306 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 418 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
307 | return false; | 419 | return false; |
308 | 420 | ||
421 | #ifdef CONFIG_X86_PAE | ||
422 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
423 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | ||
424 | return false; | ||
425 | #endif | ||
426 | |||
309 | /* Check the flags on the pte entry itself: it must be present and | 427 | /* Check the flags on the pte entry itself: it must be present and |
310 | * writable. */ | 428 | * writable. */ |
311 | flags = pte_flags(*(spte_addr(*spgd, vaddr))); | 429 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); |
312 | 430 | ||
313 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 431 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
314 | } | 432 | } |
@@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
322 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 440 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
323 | } | 441 | } |
324 | 442 | ||
443 | #ifdef CONFIG_X86_PAE | ||
444 | static void release_pmd(pmd_t *spmd) | ||
445 | { | ||
446 | /* If the entry's not present, there's nothing to release. */ | ||
447 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { | ||
448 | unsigned int i; | ||
449 | pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); | ||
450 | /* For each entry in the page, we might need to release it. */ | ||
451 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
452 | release_pte(ptepage[i]); | ||
453 | /* Now we can free the page of PTEs */ | ||
454 | free_page((long)ptepage); | ||
455 | /* And zero out the PMD entry so we never release it twice. */ | ||
456 | native_set_pmd(spmd, __pmd(0)); | ||
457 | } | ||
458 | } | ||
459 | |||
460 | static void release_pgd(pgd_t *spgd) | ||
461 | { | ||
462 | /* If the entry's not present, there's nothing to release. */ | ||
463 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | ||
464 | unsigned int i; | ||
465 | pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
466 | |||
467 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
468 | release_pmd(&pmdpage[i]); | ||
469 | |||
470 | /* Now we can free the page of PMDs */ | ||
471 | free_page((long)pmdpage); | ||
472 | /* And zero out the PGD entry so we never release it twice. */ | ||
473 | set_pgd(spgd, __pgd(0)); | ||
474 | } | ||
475 | } | ||
476 | |||
477 | #else /* !CONFIG_X86_PAE */ | ||
325 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 478 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
326 | static void release_pgd(pgd_t *spgd) | 479 | static void release_pgd(pgd_t *spgd) |
327 | { | 480 | { |
@@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd) | |||
341 | *spgd = __pgd(0); | 494 | *spgd = __pgd(0); |
342 | } | 495 | } |
343 | } | 496 | } |
344 | 497 | #endif | |
345 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() | 498 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() |
346 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. | 499 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
347 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ | 500 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
@@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
370 | pgd_t gpgd; | 523 | pgd_t gpgd; |
371 | pte_t gpte; | 524 | pte_t gpte; |
372 | 525 | ||
526 | #ifdef CONFIG_X86_PAE | ||
527 | pmd_t gpmd; | ||
528 | #endif | ||
373 | /* First step: get the top-level Guest page table entry. */ | 529 | /* First step: get the top-level Guest page table entry. */ |
374 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 530 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
375 | /* Toplevel not present? We can't map it in. */ | 531 | /* Toplevel not present? We can't map it in. */ |
@@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
378 | return -1UL; | 534 | return -1UL; |
379 | } | 535 | } |
380 | 536 | ||
381 | gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); | 537 | gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); |
538 | #ifdef CONFIG_X86_PAE | ||
539 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
540 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
541 | kill_guest(cpu, "Bad address %#lx", vaddr); | ||
542 | #endif | ||
543 | gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); | ||
382 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 544 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
383 | kill_guest(cpu, "Bad address %#lx", vaddr); | 545 | kill_guest(cpu, "Bad address %#lx", vaddr); |
384 | 546 | ||
@@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
405 | int *blank_pgdir) | 567 | int *blank_pgdir) |
406 | { | 568 | { |
407 | unsigned int next; | 569 | unsigned int next; |
570 | #ifdef CONFIG_X86_PAE | ||
571 | pmd_t *pmd_table; | ||
572 | #endif | ||
408 | 573 | ||
409 | /* We pick one entry at random to throw out. Choosing the Least | 574 | /* We pick one entry at random to throw out. Choosing the Least |
410 | * Recently Used might be better, but this is easy. */ | 575 | * Recently Used might be better, but this is easy. */ |
@@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
416 | /* If the allocation fails, just keep using the one we have */ | 581 | /* If the allocation fails, just keep using the one we have */ |
417 | if (!cpu->lg->pgdirs[next].pgdir) | 582 | if (!cpu->lg->pgdirs[next].pgdir) |
418 | next = cpu->cpu_pgd; | 583 | next = cpu->cpu_pgd; |
419 | else | 584 | else { |
420 | /* This is a blank page, so there are no kernel | 585 | #ifdef CONFIG_X86_PAE |
421 | * mappings: caller must map the stack! */ | 586 | /* In PAE mode, allocate a pmd page and populate the |
587 | * last pgd entry. */ | ||
588 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
589 | if (!pmd_table) { | ||
590 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
591 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
592 | next = cpu->cpu_pgd; | ||
593 | } else { | ||
594 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
595 | SWITCHER_PGD_INDEX, | ||
596 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
597 | /* This is a blank page, so there are no kernel | ||
598 | * mappings: caller must map the stack! */ | ||
599 | *blank_pgdir = 1; | ||
600 | } | ||
601 | #else | ||
422 | *blank_pgdir = 1; | 602 | *blank_pgdir = 1; |
603 | #endif | ||
604 | } | ||
423 | } | 605 | } |
424 | /* Record which Guest toplevel this shadows. */ | 606 | /* Record which Guest toplevel this shadows. */ |
425 | cpu->lg->pgdirs[next].gpgdir = gpgdir; | 607 | cpu->lg->pgdirs[next].gpgdir = gpgdir; |
@@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg) | |||
460 | 642 | ||
461 | /* Every shadow pagetable this Guest has */ | 643 | /* Every shadow pagetable this Guest has */ |
462 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 644 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
463 | if (lg->pgdirs[i].pgdir) | 645 | if (lg->pgdirs[i].pgdir) { |
646 | #ifdef CONFIG_X86_PAE | ||
647 | pgd_t *spgd; | ||
648 | pmd_t *pmdpage; | ||
649 | unsigned int k; | ||
650 | |||
651 | /* Get the last pmd page. */ | ||
652 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | ||
653 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
654 | |||
655 | /* And release the pmd entries of that pmd page, | ||
656 | * except for the switcher pmd. */ | ||
657 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
658 | release_pmd(&pmdpage[k]); | ||
659 | #endif | ||
464 | /* Every PGD entry except the Switcher at the top */ | 660 | /* Every PGD entry except the Switcher at the top */ |
465 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 661 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
466 | release_pgd(lg->pgdirs[i].pgdir + j); | 662 | release_pgd(lg->pgdirs[i].pgdir + j); |
663 | } | ||
467 | } | 664 | } |
468 | 665 | ||
469 | /* We also throw away everything when a Guest tells us it's changed a kernel | 666 | /* We also throw away everything when a Guest tells us it's changed a kernel |
@@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
504 | { | 701 | { |
505 | /* Look up the matching shadow page directory entry. */ | 702 | /* Look up the matching shadow page directory entry. */ |
506 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); | 703 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); |
704 | #ifdef CONFIG_X86_PAE | ||
705 | pmd_t *spmd; | ||
706 | #endif | ||
507 | 707 | ||
508 | /* If the top level isn't present, there's no entry to update. */ | 708 | /* If the top level isn't present, there's no entry to update. */ |
509 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 709 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
510 | /* Otherwise, we start by releasing the existing entry. */ | 710 | #ifdef CONFIG_X86_PAE |
511 | pte_t *spte = spte_addr(*spgd, vaddr); | 711 | spmd = spmd_addr(cpu, *spgd, vaddr); |
512 | release_pte(*spte); | 712 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { |
513 | 713 | #endif | |
514 | /* If they're setting this entry as dirty or accessed, we might | 714 | /* Otherwise, we start by releasing |
515 | * as well put that entry they've given us in now. This shaves | 715 | * the existing entry. */ |
516 | * 10% off a copy-on-write micro-benchmark. */ | 716 | pte_t *spte = spte_addr(cpu, *spgd, vaddr); |
517 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 717 | release_pte(*spte); |
518 | check_gpte(cpu, gpte); | 718 | |
519 | *spte = gpte_to_spte(cpu, gpte, | 719 | /* If they're setting this entry as dirty or accessed, |
520 | pte_flags(gpte) & _PAGE_DIRTY); | 720 | * we might as well put that entry they've given us |
521 | } else | 721 | * in now. This shaves 10% off a |
522 | /* Otherwise kill it and we can demand_page() it in | 722 | * copy-on-write micro-benchmark. */ |
523 | * later. */ | 723 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
524 | *spte = __pte(0); | 724 | check_gpte(cpu, gpte); |
725 | native_set_pte(spte, | ||
726 | gpte_to_spte(cpu, gpte, | ||
727 | pte_flags(gpte) & _PAGE_DIRTY)); | ||
728 | } else | ||
729 | /* Otherwise kill it and we can demand_page() | ||
730 | * it in later. */ | ||
731 | native_set_pte(spte, __pte(0)); | ||
732 | #ifdef CONFIG_X86_PAE | ||
733 | } | ||
734 | #endif | ||
525 | } | 735 | } |
526 | } | 736 | } |
527 | 737 | ||
@@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
572 | { | 782 | { |
573 | int pgdir; | 783 | int pgdir; |
574 | 784 | ||
575 | /* The kernel seems to try to initialize this early on: we ignore its | ||
576 | * attempts to map over the Switcher. */ | ||
577 | if (idx >= SWITCHER_PGD_INDEX) | 785 | if (idx >= SWITCHER_PGD_INDEX) |
578 | return; | 786 | return; |
579 | 787 | ||
@@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
583 | /* ... throw it away. */ | 791 | /* ... throw it away. */ |
584 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 792 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
585 | } | 793 | } |
794 | #ifdef CONFIG_X86_PAE | ||
795 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | ||
796 | { | ||
797 | guest_pagetable_clear_all(&lg->cpus[0]); | ||
798 | } | ||
799 | #endif | ||
586 | 800 | ||
587 | /* Once we know how much memory we have we can construct simple identity | 801 | /* Once we know how much memory we have we can construct simple identity |
588 | * (which set virtual == physical) and linear mappings | 802 | * (which set virtual == physical) and linear mappings |
@@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
596 | { | 810 | { |
597 | pgd_t __user *pgdir; | 811 | pgd_t __user *pgdir; |
598 | pte_t __user *linear; | 812 | pte_t __user *linear; |
599 | unsigned int mapped_pages, i, linear_pages, phys_linear; | ||
600 | unsigned long mem_base = (unsigned long)lg->mem_base; | 813 | unsigned long mem_base = (unsigned long)lg->mem_base; |
814 | unsigned int mapped_pages, i, linear_pages; | ||
815 | #ifdef CONFIG_X86_PAE | ||
816 | pmd_t __user *pmds; | ||
817 | unsigned int j; | ||
818 | pgd_t pgd; | ||
819 | pmd_t pmd; | ||
820 | #else | ||
821 | unsigned int phys_linear; | ||
822 | #endif | ||
601 | 823 | ||
602 | /* We have mapped_pages frames to map, so we need | 824 | /* We have mapped_pages frames to map, so we need |
603 | * linear_pages page tables to map them. */ | 825 | * linear_pages page tables to map them. */ |
@@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
610 | /* Now we use the next linear_pages pages as pte pages */ | 832 | /* Now we use the next linear_pages pages as pte pages */ |
611 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 833 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
612 | 834 | ||
835 | #ifdef CONFIG_X86_PAE | ||
836 | pmds = (void *)linear - PAGE_SIZE; | ||
837 | #endif | ||
613 | /* Linear mapping is easy: put every page's address into the | 838 | /* Linear mapping is easy: put every page's address into the |
614 | * mapping in order. */ | 839 | * mapping in order. */ |
615 | for (i = 0; i < mapped_pages; i++) { | 840 | for (i = 0; i < mapped_pages; i++) { |
@@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
621 | 846 | ||
622 | /* The top level points to the linear page table pages above. | 847 | /* The top level points to the linear page table pages above. |
623 | * We setup the identity and linear mappings here. */ | 848 | * We setup the identity and linear mappings here. */ |
849 | #ifdef CONFIG_X86_PAE | ||
850 | for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD; | ||
851 | i += PTRS_PER_PTE, j++) { | ||
852 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | ||
853 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
854 | |||
855 | if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) | ||
856 | return -EFAULT; | ||
857 | } | ||
858 | |||
859 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | ||
860 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | ||
861 | return -EFAULT; | ||
862 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | ||
863 | return -EFAULT; | ||
864 | #else | ||
624 | phys_linear = (unsigned long)linear - mem_base; | 865 | phys_linear = (unsigned long)linear - mem_base; |
625 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 866 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
626 | pgd_t pgd; | 867 | pgd_t pgd; |
@@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
633 | &pgd, sizeof(pgd))) | 874 | &pgd, sizeof(pgd))) |
634 | return -EFAULT; | 875 | return -EFAULT; |
635 | } | 876 | } |
877 | #endif | ||
636 | 878 | ||
637 | /* We return the top level (guest-physical) address: remember where | 879 | /* We return the top level (guest-physical) address: remember where |
638 | * this is. */ | 880 | * this is. */ |
@@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg) | |||
648 | u64 mem; | 890 | u64 mem; |
649 | u32 initrd_size; | 891 | u32 initrd_size; |
650 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; | 892 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; |
651 | 893 | #ifdef CONFIG_X86_PAE | |
894 | pgd_t *pgd; | ||
895 | pmd_t *pmd_table; | ||
896 | #endif | ||
652 | /* Get the Guest memory size and the ramdisk size from the boot header | 897 | /* Get the Guest memory size and the ramdisk size from the boot header |
653 | * located at lg->mem_base (Guest address 0). */ | 898 | * located at lg->mem_base (Guest address 0). */ |
654 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) | 899 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) |
@@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg) | |||
663 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 908 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
664 | if (!lg->pgdirs[0].pgdir) | 909 | if (!lg->pgdirs[0].pgdir) |
665 | return -ENOMEM; | 910 | return -ENOMEM; |
911 | #ifdef CONFIG_X86_PAE | ||
912 | pgd = lg->pgdirs[0].pgdir; | ||
913 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | ||
914 | if (!pmd_table) | ||
915 | return -ENOMEM; | ||
916 | |||
917 | set_pgd(pgd + SWITCHER_PGD_INDEX, | ||
918 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
919 | #endif | ||
666 | lg->cpus[0].cpu_pgd = 0; | 920 | lg->cpus[0].cpu_pgd = 0; |
667 | return 0; | 921 | return 0; |
668 | } | 922 | } |
@@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
672 | { | 926 | { |
673 | /* We get the kernel address: above this is all kernel memory. */ | 927 | /* We get the kernel address: above this is all kernel memory. */ |
674 | if (get_user(cpu->lg->kernel_address, | 928 | if (get_user(cpu->lg->kernel_address, |
675 | &cpu->lg->lguest_data->kernel_address) | 929 | &cpu->lg->lguest_data->kernel_address) |
676 | /* We tell the Guest that it can't use the top 4MB of virtual | 930 | /* We tell the Guest that it can't use the top 2 or 4 MB |
677 | * addresses used by the Switcher. */ | 931 | * of virtual addresses used by the Switcher. */ |
678 | || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) | 932 | || put_user(RESERVE_MEM * 1024 * 1024, |
679 | || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) | 933 | &cpu->lg->lguest_data->reserve_mem) |
934 | || put_user(cpu->lg->pgdirs[0].gpgdir, | ||
935 | &cpu->lg->lguest_data->pgdir)) | ||
680 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 936 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
681 | 937 | ||
682 | /* In flush_user_mappings() we loop from 0 to | 938 | /* In flush_user_mappings() we loop from 0 to |
683 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 939 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
684 | * Switcher mappings, so check that now. */ | 940 | * Switcher mappings, so check that now. */ |
941 | #ifdef CONFIG_X86_PAE | ||
942 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
943 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
944 | #else | ||
685 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | 945 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) |
946 | #endif | ||
686 | kill_guest(cpu, "bad kernel address %#lx", | 947 | kill_guest(cpu, "bad kernel address %#lx", |
687 | cpu->lg->kernel_address); | 948 | cpu->lg->kernel_address); |
688 | } | 949 | } |
@@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg) | |||
708 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 969 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
709 | { | 970 | { |
710 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 971 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
711 | pgd_t switcher_pgd; | ||
712 | pte_t regs_pte; | 972 | pte_t regs_pte; |
713 | unsigned long pfn; | 973 | unsigned long pfn; |
714 | 974 | ||
975 | #ifdef CONFIG_X86_PAE | ||
976 | pmd_t switcher_pmd; | ||
977 | pmd_t *pmd_table; | ||
978 | |||
979 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | ||
980 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | ||
981 | |||
982 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
983 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
984 | << PAGE_SHIFT); | ||
985 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
986 | #else | ||
987 | pgd_t switcher_pgd; | ||
988 | |||
715 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 989 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
716 | * page for this CPU (with appropriate flags). */ | 990 | * page for this CPU (with appropriate flags). */ |
717 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); | 991 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); |
718 | 992 | ||
719 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 993 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
720 | 994 | ||
995 | #endif | ||
721 | /* We also change the Switcher PTE page. When we're running the Guest, | 996 | /* We also change the Switcher PTE page. When we're running the Guest, |
722 | * we want the Guest's "regs" page to appear where the first Switcher | 997 | * we want the Guest's "regs" page to appear where the first Switcher |
723 | * page for this CPU is. This is an optimization: when the Switcher | 998 | * page for this CPU is. This is an optimization: when the Switcher |