| author | Matias Zabaljauregui <zabaljauregui@gmail.com> | 2009-06-13 00:27:07 -0400 |
|---|---|---|
| committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-06-12 08:57:08 -0400 |
| commit | acdd0b6292b282c4511897ac2691a47befbf1c6a | |
| tree | 1bfcfc32b11d35e99fec5bbf52b19d6ee038f25e | |
| parent | cefcad1773197523e11e18b669f245e6a8d32058 | |
lguest: PAE support
This version requires that host and guest have the same PAE status.
NX cap is not offered to the guest, yet.
Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
| file | lines changed |
|---|---|
| Documentation/lguest/lguest.txt | 1 |
| arch/x86/include/asm/lguest.h | 7 |
| arch/x86/include/asm/lguest_hcall.h | 3 |
| arch/x86/lguest/Kconfig | 1 |
| arch/x86/lguest/boot.c | 71 |
| drivers/lguest/Kconfig | 2 |
| drivers/lguest/hypercalls.c | 10 |
| drivers/lguest/lg.h | 5 |
| drivers/lguest/page_tables.c | 351 |

9 files changed, 403 insertions(+), 48 deletions(-)
```diff
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c747362f9..efb3a6a045a 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
 "Paravirtualized guest support" = Y
 "Lguest guest support" = Y
 "High Memory Support" = off/4GB
-"PAE (Physical Address Extension) Support" = N
 "Alignment value to which kernel should be aligned" = 0x100000
 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
 CONFIG_PHYSICAL_ALIGN=0x100000)
```
```diff
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9..313389cd50d 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
```
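Why those two constants: the Switcher must be reachable through a single PTE page, and one PTE page maps 4 MB without PAE (1024 four-byte entries, each covering 4 KB) but only 2 MB with PAE (512 eight-byte entries). A quick sanity check of that arithmetic in plain C (nothing here is lguest API):

```c
#include <stdio.h>

int main(void)
{
	/* A page table page is 4096 bytes; PTEs are 4 bytes without PAE
	 * and 8 bytes with PAE, each mapping one 4 KB page. */
	unsigned long long page = 4096, top = 1ULL << 32;
	unsigned long long span_nonpae = (page / 4) * page;	/* 4 MB */
	unsigned long long span_pae = (page / 8) * page;	/* 2 MB */

	printf("non-PAE: 0x%llX\n", top - span_nonpae);	/* 0xFFC00000 */
	printf("PAE:     0x%llX\n", top - span_pae);	/* 0xFFE00000 */
	return 0;
}
```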
```diff
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b14b3552a4d..d31c4a68407 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,6 +12,7 @@
 #define LHCALL_TS		8
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
+#define LHCALL_SET_PMD		13
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
@@ -33,7 +34,7 @@
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
  * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
  * If a return value makes sense, it's returned in %eax.
```
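The comment above describes the register convention the hypercalls use. As an illustration only, a wrapper honouring that convention might look like the sketch below; this is not the kernel's actual implementation (lguest goes through the patchable kvm_hypercall machinery, and the `vmcall` here merely stands in for whatever instruction the Host patches in):

```c
/* Illustrative sketch: hypercall number in %eax, arguments in
 * %ebx, %ecx, %edx and %esi, result returned in %eax. */
static inline unsigned long hcall(unsigned long call, unsigned long arg1,
				  unsigned long arg2, unsigned long arg3,
				  unsigned long arg4)
{
	asm volatile("vmcall"		/* placeholder instruction */
		     : "=a"(call)
		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
		     : "memory");
	return call;
}
```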
```diff
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d..38718041efc 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on X86_32
-	depends on !X86_PAE
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
```
```diff
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d12f554e5f6..7bc65f0f62c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call,
 		async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
 static void lazy_hcall4(unsigned long call,
 			unsigned long arg1,
 			unsigned long arg2,
@@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call,
 	else
 		async_hcall(call, arg1, arg2, arg3, arg4);
 }
+#endif
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
@@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-		*dx &= 0x07808111;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+		*dx &= 0x07808151;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		if (*ax > 0x80000008)
 			*ax = 0x80000008;
 		break;
+	case 0x80000001:
+		/* Here we should fix nx cap depending on host. */
+		/* For this version of PAE, we just clear NX bit. */
+		*dx &= ~(1 << 20);
+		break;
 	}
 }
 
```
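Both CPUID tweaks are single-bit edits. PAE is bit 6 of leaf 1's EDX, so the allow-mask grows from 0x07808111 to 0x07808151 (0x40 added); NX is bit 20 of leaf 0x80000001's EDX, cleared by the `~(1 << 20)`. A trivial check:

```c
#include <assert.h>

int main(void)
{
	unsigned int old_mask = 0x07808111;	/* pre-PAE CPUID.01h:EDX mask */
	unsigned int pae = 1u << 6;	/* CPUID.01h:EDX bit 6 = PAE */
	unsigned int nx = 1u << 20;	/* CPUID.80000001h:EDX bit 20 = NX */

	assert((old_mask | pae) == 0x07808151);		/* the new mask */
	assert((0xFFFFFFFFu & ~nx) == 0xFFEFFFFF);	/* NX stripped */
	return 0;
}
```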
```diff
@@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			       pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+		    ptep->pte_low, ptep->pte_high);
+#else
 	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
```
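A PAE PTE is 64 bits wide but hypercall arguments travel in 32-bit registers, so lguest_pte_update passes ptep->pte_low and ptep->pte_high separately and the Host (in the hypercalls.c hunk later in this patch) rebuilds the entry as `arg3 | (u64)arg4 << 32`. A minimal round-trip of that split, with an invented entry value:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t pte = 0x8000000012345067ULL;	/* hypothetical PAE entry */
	uint32_t low = (uint32_t)pte;		/* pte_low */
	uint32_t high = (uint32_t)(pte >> 32);	/* pte_high */

	/* The Host's reconstruction, as in do_hcall() below. */
	assert((low | (uint64_t)high << 32) == pte);
	return 0;
}
```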
```diff
@@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+	native_set_pud(pudp, pudval);
+
+	/* 32 bytes aligned pdpt address and the index. */
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+		    (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	native_set_pmd(pmdp, pmdval);
+	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+		    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	native_set_pmd(pmdp, pmdval);
 	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
 		    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
  * don't know the top level any more.  This is useless for us, since we don't
```
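The masks in lguest_set_pud reflect the PAE top level's odd shape: the PDPT is just four 8-byte entries, 32 bytes in total, so `& 0xFFFFFFE0` recovers its 32-byte-aligned base and `(& 0x1F) / sizeof(pud_t)` yields an index of 0-3. A worked instance with a made-up address:

```c
#include <assert.h>

int main(void)
{
	/* Hypothetical physical address of the third PDPT entry:
	 * a 32-byte-aligned base plus 2 entries of 8 bytes. */
	unsigned long pa_pudp = 0x1234560UL + 2 * 8;

	assert((pa_pudp & 0xFFFFFFE0) == 0x1234560);	/* pdpt base */
	assert((pa_pudp & 0x1F) / 8 == 2);		/* entry index */
	return 0;
}
```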
```diff
@@ -569,6 +603,26 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	native_set_pte_atomic(ptep, pte);
+	if (cr3_changed)
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	native_pte_clear(mm, addr, ptep);
+	lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+	lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations.  On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -1035,6 +1089,7 @@ __init void lguest_init(void)
 	pv_info.name = "lguest";
 	pv_info.paravirt_enabled = 1;
 	pv_info.kernel_rpl = 1;
+	pv_info.shared_kernel_pmd = 1;
 
 	/* We set up all the lguest overrides for sensitive operations.  These
 	 * are detailed with the operations themselves. */
@@ -1080,6 +1135,12 @@ __init void lguest_init(void)
 	pv_mmu_ops.set_pte = lguest_set_pte;
 	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
 	pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+	pv_mmu_ops.pte_clear = lguest_pte_clear;
+	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+	pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
```
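For readers unfamiliar with the pattern: pv_mmu_ops is a table of function pointers that starts out pointing at the native MMU helpers, and lguest_init() swaps in wrappers that perform the native update and then tell the Host. A stripped-down sketch of the idea (a hypothetical one-member table, not the kernel's much larger struct):

```c
/* Hypothetical miniature of the paravirt override pattern. */
struct mmu_ops {
	void (*set_pte)(unsigned long *ptep, unsigned long pte);
};

static void my_native_set_pte(unsigned long *ptep, unsigned long pte)
{
	*ptep = pte;	/* plain store, as on bare metal */
}

static void my_guest_set_pte(unsigned long *ptep, unsigned long pte)
{
	my_native_set_pte(ptep, pte);
	/* ...then notify the host, e.g. hcall(LHCALL_SET_PTE, ...) */
}

static struct mmu_ops mmu_ops = { my_native_set_pte };

static void my_guest_init(void)
{
	mmu_ops.set_pte = my_guest_set_pte;	/* as lguest_init() does */
}
```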
```diff
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359..8f63845db83 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && FUTEX
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
```
```diff
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 51149ca1461..c29ffa19cb7 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+				__pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
 		break;
 	case LHCALL_SET_PGD:
 		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
+	case LHCALL_SET_PMD:
+		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;
```
```diff
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index cacc2da2058..6201ce59e88 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
  * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)	(pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
@@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
```
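The new pmd_flags/pmd_pfn macros mirror their pgd counterparts: the low 12 bits of an entry (below PAGE_MASK) are flags and the rest is the page frame number. With PAGE_SHIFT = 12:

```c
#include <assert.h>

int main(void)
{
	/* Hypothetical entry: frame 0x1234 plus PRESENT|RW flags (0x3). */
	unsigned long long entry = (0x1234ULL << 12) | 0x3;

	assert((entry & 0xFFF) == 0x3);		/* pmd_flags() */
	assert((entry >> 12) == 0x1234);	/* pmd_pfn() */
	return 0;
}
```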
```diff
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 6a54d76b623..5e2c26adcf0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -53,6 +53,17 @@
  * page. */
 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
 
+/* For PAE we need the PMD index as well. We use the last 2MB, so we
+ * will need the last pmd entry of the last pmd page. */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_PMD_INDEX	(PTRS_PER_PMD - 1)
+#define RESERVE_MEM	2U
+#define CHECK_GPGD_MASK	_PAGE_PRESENT
+#else
+#define RESERVE_MEM	4U
+#define CHECK_GPGD_MASK	_PAGE_TABLE
+#endif
+
 /* We actually need a separate PTE page for each CPU.  Remember that after the
  * Switcher code itself comes two pages for each CPU, and we don't want this
  * CPU's guest to see the pages of any other CPU. */
```
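Concretely, with the usual 32-bit constants: without PAE, PTRS_PER_PGD = 1024 and each PGD entry covers 4 MB, so the Switcher's 0xFFC00000 lands in entry 1023 and the top 4 MB is reserved (RESERVE_MEM = 4). With PAE, the top level has 4 entries of 1 GB each and PTRS_PER_PMD = 512 entries of 2 MB, so 0xFFE00000 is PMD entry 511 of PGD entry 3 and only 2 MB is reserved. A worked check:

```c
#include <assert.h>

int main(void)
{
	/* non-PAE: 1024 PGD entries, 4 MB each. */
	assert(0xFFC00000u / (4u << 20) == 1023);	/* SWITCHER_PGD_INDEX */

	/* PAE: 4 top-level entries of 1 GB, 512 PMD entries of 2 MB. */
	assert(0xFFE00000u / (1u << 30) == 3);		/* last pgd entry */
	assert((0xFFE00000u % (1u << 30)) / (2u << 20) == 511);	/* SWITCHER_PMD_INDEX */
	return 0;
}
```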
```diff
@@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
 	unsigned int index = pgd_index(vaddr);
 
+#ifndef CONFIG_X86_PAE
 	/* We kill any Guest trying to touch the Switcher addresses. */
 	if (index >= SWITCHER_PGD_INDEX) {
 		kill_guest(cpu, "attempt to access switcher pages");
 		index = 0;
 	}
+#endif
 	/* Return a pointer index'th pgd entry for the i'th page table. */
 	return &cpu->lg->pgdirs[i].pgdir[index];
 }
 
+#ifdef CONFIG_X86_PAE
+/* This routine then takes the PGD entry given above, which contains the
+ * address of the PMD page.  It then returns a pointer to the PMD entry for the
+ * given address. */
+static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
+{
+	unsigned int index = pmd_index(vaddr);
+	pmd_t *page;
+
+	/* We kill any Guest trying to touch the Switcher addresses. */
+	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
+					index >= SWITCHER_PMD_INDEX) {
+		kill_guest(cpu, "attempt to access switcher pages");
+		index = 0;
+	}
+
+	/* You should never call this if the PGD entry wasn't valid */
+	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
+
+	return &page[index];
+}
+#endif
+
 /* This routine then takes the page directory entry returned above, which
  * contains the address of the page table entry (PTE) page.  It then returns a
  * pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 {
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
+	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
+
+	/* You should never call this if the PMD entry wasn't valid */
+	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
+#else
 	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+#endif
+
 	return &page[pte_index(vaddr)];
 }
 
```
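Together, spgd_addr, spmd_addr and spte_addr perform an ordinary software walk of the shadow tables: index a level, follow the entry's frame number to the next page, repeat. A self-contained sketch of the PAE-shaped walk (flat arrays and an identity mapping instead of the kernel's pgd_t/pmd_t/pte_t and __va()):

```c
#include <stdint.h>
#include <stddef.h>

#define PRESENT 0x1ULL

/* Illustrative 2+9+9-bit walk (32-bit PAE shape). Entries hold the
 * next level's address plus flags; returns NULL if a level is absent. */
static uint64_t *walk(uint64_t *top, uint32_t vaddr)
{
	unsigned pgd_i = (vaddr >> 30) & 0x3;
	unsigned pmd_i = (vaddr >> 21) & 0x1FF;
	unsigned pte_i = (vaddr >> 12) & 0x1FF;
	uint64_t *pmd, *pte;

	if (!(top[pgd_i] & PRESENT))
		return NULL;
	pmd = (uint64_t *)(uintptr_t)(top[pgd_i] & ~0xFFFULL);
	if (!(pmd[pmd_i] & PRESENT))
		return NULL;
	pte = (uint64_t *)(uintptr_t)(pmd[pmd_i] & ~0xFFFULL);
	return &pte[pte_i];
}
```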
```diff
@@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
 }
 
-static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
+#ifdef CONFIG_X86_PAE
+static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
+}
+#endif
+
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+			       pgd_t gpgd, unsigned long vaddr)
+{
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
+	unsigned long gpage;
+
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
+	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+#else
+	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+#endif
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 /*:*/
```
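Note that gpte_addr returns a guest-physical address for lgread(), not a host pointer: the PTE-page base comes from the entry's pfn, and the byte offset is pte_index(vaddr) times the entry size (8 bytes under PAE). A worked instance with invented numbers:

```c
#include <assert.h>

int main(void)
{
	/* Say the guest PMD entry points at frame 0x500 and we translate
	 * vaddr 0xC0403000 under PAE (pte_index = bits 20:12). */
	unsigned long gpage = 0x500UL << 12;			/* 0x500000 */
	unsigned long pte_index = (0xC0403000UL >> 12) & 0x1FF;	/* 0x3 */

	assert(gpage + pte_index * 8 == 0x500018);	/* address lgread() uses */
	return 0;
}
```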
```diff
@@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
 
 static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
-	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
 	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
 		kill_guest(cpu, "bad page directory entry");
 }
 
+#ifdef CONFIG_X86_PAE
+static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+{
+	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
+	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+		kill_guest(cpu, "bad page middle directory entry");
+}
+#endif
+
 /*H:330
  * (i) Looking up a page table entry when the Guest faults.
  *
```
```diff
@@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pte_t gpte;
 	pte_t *spte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+	pmd_t gpmd;
+#endif
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		check_gpgd(cpu, gpgd);
 		/* And we copy the flags to the shadow PGD entry.  The page
 		 * number in the shadow PGD is the page we just allocated. */
-		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
+		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
 	}
 
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	/* middle level not present?  We can't map it in. */
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		return false;
+
+	/* Now look at the matching shadow entry. */
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
+		/* No shadow entry: allocate a new shadow PTE page. */
+		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+
+		/* This is not really the Guest's fault, but killing it is
+		 * simple for this corner case. */
+		if (!ptepage) {
+			kill_guest(cpu, "out of memory allocating pte page");
+			return false;
+		}
+
+		/* We check that the Guest pmd is OK. */
+		check_gpmd(cpu, gpmd);
+
+		/* And we copy the flags to the shadow PMD entry.  The page
+		 * number in the shadow PMD is the page we just allocated. */
+		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+	}
+#endif
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
-	gpte_ptr = gpte_addr(gpgd, vaddr);
+	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		gpte = pte_mkdirty(gpte);
 
 	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = spte_addr(*spgd, vaddr);
+	spte = spte_addr(cpu, *spgd, vaddr);
 	/* If there was a valid shadow PTE entry here before, we release it.
 	 * This can happen with a write to a previously read-only entry. */
 	release_pte(*spte);
```
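The PAE branch repeats the shadow-PGD trick one level down: when a fault finds a shadow level missing, it is backed by a fresh zeroed host page whose entry takes the guest entry's flags but the host page's frame. A compact host-side sketch of that populate step (plain C; calloc() merely stands in for get_zeroed_page()):

```c
#include <stdint.h>
#include <stdlib.h>

#define PRESENT    0x1ULL
#define FLAGS_MASK 0xFFFULL

/* Illustrative: ensure a shadow mid-level entry is present, copying the
 * guest entry's flags but pointing at a host-owned zeroed PTE page. */
static int shadow_populate(uint64_t *spmd, uint64_t gpmd)
{
	void *ptepage;

	if (*spmd & PRESENT)
		return 0;		/* already shadowed */
	ptepage = calloc(1, 4096);	/* get_zeroed_page() stand-in */
	if (!ptepage)
		return -1;		/* host OOM: the guest gets killed */
	*spmd = (uint64_t)(uintptr_t)ptepage | (gpmd & FLAGS_MASK);
	return 0;
}
```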
```diff
@@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t *spgd;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 	/* Look at the current top level entry: is it present? */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
 		return false;
 
+#ifdef CONFIG_X86_PAE
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+		return false;
+#endif
+
 	/* Check the flags on the pte entry itself: it must be present and
 	 * writable. */
-	flags = pte_flags(*(spte_addr(*spgd, vaddr)));
+	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
 
 	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
```
```diff
@@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 
+#ifdef CONFIG_X86_PAE
+static void release_pmd(pmd_t *spmd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
+		/* For each entry in the page, we might need to release it. */
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			release_pte(ptepage[i]);
+		/* Now we can free the page of PTEs */
+		free_page((long)ptepage);
+		/* And zero out the PMD entry so we never release it twice. */
+		native_set_pmd(spmd, __pmd(0));
+	}
+}
+
+static void release_pgd(pgd_t *spgd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			release_pmd(&pmdpage[i]);
+
+		/* Now we can free the page of PMDs */
+		free_page((long)pmdpage);
+		/* And zero out the PGD entry so we never release it twice. */
+		set_pgd(spgd, __pgd(0));
+	}
+}
+
+#else /* !CONFIG_X86_PAE */
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
 static void release_pgd(pgd_t *spgd)
 {
@@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd)
 		*spgd = __pgd(0);
 	}
 }
-
+#endif
 /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
  * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
  * It simply releases every PTE page from 0 up to the Guest's kernel address. */
```
```diff
@@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t gpgd;
 	pte_t gpte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 		return -1UL;
 	}
 
-	gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		kill_guest(cpu, "Bad address %#lx", vaddr);
+#endif
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
 
```
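The physical address guest_pa finally returns combines the PTE's frame with the page offset of the virtual address: pa = (pte_pfn << PAGE_SHIFT) | (vaddr & (PAGE_SIZE - 1)). For instance:

```c
#include <assert.h>

int main(void)
{
	/* Hypothetical: the PTE maps frame 0x2A5, vaddr is 0xC0105A3C. */
	unsigned long pfn = 0x2A5, vaddr = 0xC0105A3C;
	unsigned long pa = (pfn << 12) | (vaddr & 0xFFF);

	assert(pa == 0x2A5A3C);
	return 0;
}
```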
```diff
@@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 			      int *blank_pgdir)
 {
 	unsigned int next;
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd_table;
+#endif
 
 	/* We pick one entry at random to throw out.  Choosing the Least
 	 * Recently Used might be better, but this is easy. */
@@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 		/* If the allocation fails, just keep using the one we have */
 		if (!cpu->lg->pgdirs[next].pgdir)
 			next = cpu->cpu_pgd;
-		else
-			/* This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack! */
+		else {
+#ifdef CONFIG_X86_PAE
+			/* In PAE mode, allocate a pmd page and populate the
+			 * last pgd entry. */
+			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+			if (!pmd_table) {
+				free_page((long)cpu->lg->pgdirs[next].pgdir);
+				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
+				next = cpu->cpu_pgd;
+			} else {
+				set_pgd(cpu->lg->pgdirs[next].pgdir +
+					SWITCHER_PGD_INDEX,
+					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+				/* This is a blank page, so there are no kernel
+				 * mappings: caller must map the stack! */
+				*blank_pgdir = 1;
+			}
+#else
 			*blank_pgdir = 1;
+#endif
+		}
 	}
 	/* Record which Guest toplevel this shadows. */
 	cpu->lg->pgdirs[next].gpgdir = gpgdir;
```
```diff
@@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg)
 
 	/* Every shadow pagetable this Guest has */
 	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir)
+		if (lg->pgdirs[i].pgdir) {
+#ifdef CONFIG_X86_PAE
+			pgd_t *spgd;
+			pmd_t *pmdpage;
+			unsigned int k;
+
+			/* Get the last pmd page. */
+			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
+			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+			/* And release the pmd entries of that pmd page,
+			 * except for the switcher pmd. */
+			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
+				release_pmd(&pmdpage[k]);
+#endif
 			/* Every PGD entry except the Switcher at the top */
 			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
 				release_pgd(lg->pgdirs[i].pgdir + j);
+		}
 }
 
 /* We also throw away everything when a Guest tells us it's changed a kernel
```
```diff
@@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
 {
 	/* Look up the matching shadow page directory entry. */
 	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 
 	/* If the top level isn't present, there's no entry to update. */
 	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		/* Otherwise, we start by releasing the existing entry. */
-		pte_t *spte = spte_addr(*spgd, vaddr);
-		release_pte(*spte);
-
-		/* If they're setting this entry as dirty or accessed, we might
-		 * as well put that entry they've given us in now.  This shaves
-		 * 10% off a copy-on-write micro-benchmark. */
-		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
-			check_gpte(cpu, gpte);
-			*spte = gpte_to_spte(cpu, gpte,
-					     pte_flags(gpte) & _PAGE_DIRTY);
-		} else
-			/* Otherwise kill it and we can demand_page() it in
-			 * later. */
-			*spte = __pte(0);
+#ifdef CONFIG_X86_PAE
+		spmd = spmd_addr(cpu, *spgd, vaddr);
+		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+#endif
+			/* Otherwise, we start by releasing
+			 * the existing entry. */
+			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
+			release_pte(*spte);
+
+			/* If they're setting this entry as dirty or accessed,
+			 * we might as well put that entry they've given us
+			 * in now.  This shaves 10% off a
+			 * copy-on-write micro-benchmark. */
+			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+				check_gpte(cpu, gpte);
+				native_set_pte(spte,
+						gpte_to_spte(cpu, gpte,
+						pte_flags(gpte) & _PAGE_DIRTY));
+			} else
+				/* Otherwise kill it and we can demand_page()
+				 * it in later. */
+				native_set_pte(spte, __pte(0));
+#ifdef CONFIG_X86_PAE
+		}
+#endif
 	}
 }
 
```
```diff
@@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-	/* The kernel seems to try to initialize this early on: we ignore its
-	 * attempts to map over the Switcher. */
 	if (idx >= SWITCHER_PGD_INDEX)
 		return;
 
@@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 		/* ... throw it away. */
 		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 	}
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
+{
+	guest_pagetable_clear_all(&lg->cpus[0]);
+}
+#endif
 
 /* Once we know how much memory we have we can construct simple identity
  * (which set virtual == physical) and linear mappings
```
```diff
@@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
 {
 	pgd_t __user *pgdir;
 	pte_t __user *linear;
-	unsigned int mapped_pages, i, linear_pages, phys_linear;
 	unsigned long mem_base = (unsigned long)lg->mem_base;
+	unsigned int mapped_pages, i, linear_pages;
+#ifdef CONFIG_X86_PAE
+	pmd_t __user *pmds;
+	unsigned int j;
+	pgd_t pgd;
+	pmd_t pmd;
+#else
+	unsigned int phys_linear;
+#endif
 
 	/* We have mapped_pages frames to map, so we need
 	 * linear_pages page tables to map them. */
@@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	/* Now we use the next linear_pages pages as pte pages */
 	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
 
+#ifdef CONFIG_X86_PAE
+	pmds = (void *)linear - PAGE_SIZE;
+#endif
 	/* Linear mapping is easy: put every page's address into the
 	 * mapping in order. */
 	for (i = 0; i < mapped_pages; i++) {
@@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg,
 
 	/* The top level points to the linear page table pages above.
 	 * We setup the identity and linear mappings here. */
+#ifdef CONFIG_X86_PAE
+	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
+	     i += PTRS_PER_PTE, j++) {
+		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
+			- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
+
+		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
+			return -EFAULT;
+	}
+
+	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+#else
 	phys_linear = (unsigned long)linear - mem_base;
 	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
 		pgd_t pgd;
@@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
 			  &pgd, sizeof(pgd)))
 			return -EFAULT;
 	}
+#endif
 
 	/* We return the top level (guest-physical) address: remember where
 	 * this is. */
```
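The sizing mentioned in the context above ("we need linear_pages page tables to map them") is ceiling division: mapped_pages / PTRS_PER_PTE, rounded up. For a hypothetical 64 MB guest without PAE:

```c
#include <assert.h>

int main(void)
{
	unsigned int mapped_pages = 64u * 1024 * 1024 / 4096;	/* 16384 frames */
	unsigned int ptrs_per_pte = 1024;			/* non-PAE */
	unsigned int linear_pages =
		(mapped_pages + ptrs_per_pte - 1) / ptrs_per_pte;

	assert(linear_pages == 16);	/* sixteen PTE pages suffice */
	return 0;
}
```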
```diff
@@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg)
 	u64 mem;
 	u32 initrd_size;
 	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-
+#ifdef CONFIG_X86_PAE
+	pgd_t *pgd;
+	pmd_t *pmd_table;
+#endif
 	/* Get the Guest memory size and the ramdisk size from the boot header
 	 * located at lg->mem_base (Guest address 0). */
 	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
@@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg)
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
+#ifdef CONFIG_X86_PAE
+	pgd = lg->pgdirs[0].pgdir;
+	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
+	if (!pmd_table)
+		return -ENOMEM;
+
+	set_pgd(pgd + SWITCHER_PGD_INDEX,
+		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+#endif
 	lg->cpus[0].cpu_pgd = 0;
 	return 0;
 }
```
```diff
@@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 {
 	/* We get the kernel address: above this is all kernel memory. */
 	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-	    /* We tell the Guest that it can't use the top 4MB of virtual
-	     * addresses used by the Switcher. */
-	    || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
-	    || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+		&cpu->lg->lguest_data->kernel_address)
+		/* We tell the Guest that it can't use the top 2 or 4 MB
+		 * of virtual addresses used by the Switcher. */
+		|| put_user(RESERVE_MEM * 1024 * 1024,
+			&cpu->lg->lguest_data->reserve_mem)
+		|| put_user(cpu->lg->pgdirs[0].gpgdir,
+			&cpu->lg->lguest_data->pgdir))
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
 	/* In flush_user_mappings() we loop from 0 to
 	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
 	 * Switcher mappings, so check that now. */
+#ifdef CONFIG_X86_PAE
+	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
+		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
+#else
 	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+#endif
 		kill_guest(cpu, "bad kernel address %#lx",
 			   cpu->lg->kernel_address);
 }
```
```diff
@@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg)
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
 	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
-	pgd_t switcher_pgd;
 	pte_t regs_pte;
 	unsigned long pfn;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t switcher_pmd;
+	pmd_t *pmd_table;
+
+	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
+		       PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+	pmd_table = __va(pgd_pfn(cpu->lg->
+			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
+								<< PAGE_SHIFT);
+	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
+#else
+	pgd_t switcher_pgd;
+
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
 	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
 
 	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
+#endif
 	/* We also change the Switcher PTE page.  When we're running the Guest,
 	 * we want the Guest's "regs" page to appear where the first Switcher
 	 * page for this CPU is.  This is an optimization: when the Switcher
```