| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 17:14:04 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 17:14:04 -0400 |
| commit | 736a2dd2571ac56b11ed95a7814d838d5311be04 (patch) | |
| tree | de10d107025970c6e51d5b6faeba799ed4b9caae /drivers/lguest | |
| parent | 0b2e3b6bb4a415379f16e38fc92db42379be47a1 (diff) | |
| parent | 01d779a14ef800b74684d9692add4944df052461 (diff) | |
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
"Lots of virtio work which wasn't quite ready for last merge window.
Plus I dived into lguest again, reworking the pagetable code so we can
move the switcher page: our fixmaps sometimes take more than 2MB now..."
Ugh. Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.
* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
caif_virtio: Remove bouncing email addresses
lguest: improve code readability in lg_cpu_start.
virtio-net: fill only rx queues which are being used
lguest: map Switcher below fixmap.
lguest: cache last cpu we ran on.
lguest: map Switcher text whenever we allocate a new pagetable.
lguest: don't share Switcher PTE pages between guests.
lguest: expost switcher_pages array (as lg_switcher_pages).
lguest: extract shadow PTE walking / allocating.
lguest: make check_gpte et. al return bool.
lguest: assume Switcher text is a single page.
lguest: rename switcher_page to switcher_pages.
lguest: remove RESERVE_MEM constant.
lguest: check vaddr not pgd for Switcher protection.
lguest: prepare to make SWITCHER_ADDR a variable.
virtio: console: replace EMFILE with EBUSY for already-open port
virtio-scsi: reset virtqueue affinity when doing cpu hotplug
virtio-scsi: introduce multiqueue support
virtio-scsi: push vq lock/unlock into virtscsi_vq_done
virtio-scsi: pass struct virtio_scsi to virtqueue completion function
...
Diffstat (limited to 'drivers/lguest')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/lguest/Kconfig | 5 |
| -rw-r--r-- | drivers/lguest/core.c | 67 |
| -rw-r--r-- | drivers/lguest/lg.h | 6 |
| -rw-r--r-- | drivers/lguest/lguest_user.c | 6 |
| -rw-r--r-- | drivers/lguest/page_tables.c | 567 |
| -rw-r--r-- | drivers/lguest/x86/core.c | 7 |
6 files changed, 347 insertions, 311 deletions
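On the lguest side, the headline change in this series is that the Switcher is no longer pinned at a fixed SWITCHER_ADDR: map_switcher() in core.c now computes switcher_addr at load time, directly below the fixmap, with one page of Switcher text followed by two pages per host CPU, plus the guard page that __get_vm_area() adds. A minimal userspace sketch of that address arithmetic follows; PAGE_SIZE, FIXADDR_START and the CPU count are assumed example values, not the kernel's.

```c
/* Illustration only: not lguest code. Sample values stand in for the
 * kernel's PAGE_SIZE, FIXADDR_START and possible-CPU count.
 */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define FIXADDR_START  0xffc00000UL    /* example value for a 32-bit host */
#define NR_CPUS        4               /* example host CPU count */

/* One page of Switcher text plus two pages (regs + read-only state) per CPU. */
#define TOTAL_SWITCHER_PAGES (1 + 2 * NR_CPUS)

int main(void)
{
	/* Place the Switcher directly below the fixmap; the extra page is
	 * the guard page that __get_vm_area() allocates in the real code. */
	unsigned long switcher_addr =
		FIXADDR_START - (TOTAL_SWITCHER_PAGES + 1) * PAGE_SIZE;
	unsigned int cpu;

	printf("switcher text page: %#lx\n", switcher_addr);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %u pages:       %#lx\n", cpu,
		       switcher_addr + PAGE_SIZE + cpu * 2 * PAGE_SIZE);
	return 0;
}
```

This is the same layout that allocate_switcher_mapping() and map_switcher_in_guest() walk later in the page_tables.c diff.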
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST | |||
5 | ---help--- | 5 | ---help--- |
6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
7 | multiple instances of the same Linux kernel, using the | 7 | multiple instances of the same Linux kernel, using the |
8 | "lguest" command found in the Documentation/virtual/lguest | 8 | "lguest" command found in the tools/lguest directory. |
9 | directory. | ||
10 | 9 | ||
11 | Note that "lguest" is pronounced to rhyme with "fell quest", | 10 | Note that "lguest" is pronounced to rhyme with "fell quest", |
12 | not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. | 11 | not "rustyvisor". See tools/lguest/lguest.txt. |
13 | 12 | ||
14 | If unsure, say N. If curious, say M. If masochistic, say Y. | 13 | If unsure, say N. If curious, say M. If masochistic, say Y. |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@ | |||
20 | #include <asm/asm-offsets.h> | 20 | #include <asm/asm-offsets.h> |
21 | #include "lg.h" | 21 | #include "lg.h" |
22 | 22 | ||
23 | 23 | unsigned long switcher_addr; | |
24 | struct page **lg_switcher_pages; | ||
24 | static struct vm_struct *switcher_vma; | 25 | static struct vm_struct *switcher_vma; |
25 | static struct page **switcher_page; | ||
26 | 26 | ||
27 | /* This One Big lock protects all inter-guest data structures. */ | 27 | /* This One Big lock protects all inter-guest data structures. */ |
28 | DEFINE_MUTEX(lguest_lock); | 28 | DEFINE_MUTEX(lguest_lock); |
@@ -52,13 +52,21 @@ static __init int map_switcher(void) | |||
52 | * easy. | 52 | * easy. |
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* We assume Switcher text fits into a single page. */ | ||
56 | if (end_switcher_text - start_switcher_text > PAGE_SIZE) { | ||
57 | printk(KERN_ERR "lguest: switcher text too large (%zu)\n", | ||
58 | end_switcher_text - start_switcher_text); | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | |||
55 | /* | 62 | /* |
56 | * We allocate an array of struct page pointers. map_vm_area() wants | 63 | * We allocate an array of struct page pointers. map_vm_area() wants |
57 | * this, rather than just an array of pages. | 64 | * this, rather than just an array of pages. |
58 | */ | 65 | */ |
59 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | 66 | lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) |
60 | GFP_KERNEL); | 67 | * TOTAL_SWITCHER_PAGES, |
61 | if (!switcher_page) { | 68 | GFP_KERNEL); |
69 | if (!lg_switcher_pages) { | ||
62 | err = -ENOMEM; | 70 | err = -ENOMEM; |
63 | goto out; | 71 | goto out; |
64 | } | 72 | } |
@@ -68,32 +76,29 @@ static __init int map_switcher(void) | |||
68 | * so we make sure they're zeroed. | 76 | * so we make sure they're zeroed. |
69 | */ | 77 | */ |
70 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | 78 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { |
71 | switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); | 79 | lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); |
72 | if (!switcher_page[i]) { | 80 | if (!lg_switcher_pages[i]) { |
73 | err = -ENOMEM; | 81 | err = -ENOMEM; |
74 | goto free_some_pages; | 82 | goto free_some_pages; |
75 | } | 83 | } |
76 | } | 84 | } |
77 | 85 | ||
78 | /* | 86 | /* |
79 | * First we check that the Switcher won't overlap the fixmap area at | 87 | * We place the Switcher underneath the fixmap area, which is the |
80 | * the top of memory. It's currently nowhere near, but it could have | 88 | * highest virtual address we can get. This is important, since we |
81 | * very strange effects if it ever happened. | 89 | * tell the Guest it can't access this memory, so we want its ceiling |
90 | * as high as possible. | ||
82 | */ | 91 | */ |
83 | if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ | 92 | switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; |
84 | err = -ENOMEM; | ||
85 | printk("lguest: mapping switcher would thwack fixmap\n"); | ||
86 | goto free_pages; | ||
87 | } | ||
88 | 93 | ||
89 | /* | 94 | /* |
90 | * Now we reserve the "virtual memory area" we want: 0xFFC00000 | 95 | * Now we reserve the "virtual memory area" we want. We might |
91 | * (SWITCHER_ADDR). We might not get it in theory, but in practice | 96 | * not get it in theory, but in practice it's worked so far. |
92 | * it's worked so far. The end address needs +1 because __get_vm_area | 97 | * The end address needs +1 because __get_vm_area allocates an |
93 | * allocates an extra guard page, so we need space for that. | 98 | * extra guard page, so we need space for that. |
94 | */ | 99 | */ |
95 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | 100 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, |
96 | VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR | 101 | VM_ALLOC, switcher_addr, switcher_addr |
97 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); | 102 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); |
98 | if (!switcher_vma) { | 103 | if (!switcher_vma) { |
99 | err = -ENOMEM; | 104 | err = -ENOMEM; |
@@ -103,12 +108,12 @@ static __init int map_switcher(void) | |||
103 | 108 | ||
104 | /* | 109 | /* |
105 | * This code actually sets up the pages we've allocated to appear at | 110 | * This code actually sets up the pages we've allocated to appear at |
106 | * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the | 111 | * switcher_addr. map_vm_area() takes the vma we allocated above, the |
107 | * kind of pages we're mapping (kernel pages), and a pointer to our | 112 | * kind of pages we're mapping (kernel pages), and a pointer to our |
108 | * array of struct pages. It increments that pointer, but we don't | 113 | * array of struct pages. It increments that pointer, but we don't |
109 | * care. | 114 | * care. |
110 | */ | 115 | */ |
111 | pagep = switcher_page; | 116 | pagep = lg_switcher_pages; |
112 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); | 117 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); |
113 | if (err) { | 118 | if (err) { |
114 | printk("lguest: map_vm_area failed: %i\n", err); | 119 | printk("lguest: map_vm_area failed: %i\n", err); |
@@ -133,8 +138,8 @@ free_pages: | |||
133 | i = TOTAL_SWITCHER_PAGES; | 138 | i = TOTAL_SWITCHER_PAGES; |
134 | free_some_pages: | 139 | free_some_pages: |
135 | for (--i; i >= 0; i--) | 140 | for (--i; i >= 0; i--) |
136 | __free_pages(switcher_page[i], 0); | 141 | __free_pages(lg_switcher_pages[i], 0); |
137 | kfree(switcher_page); | 142 | kfree(lg_switcher_pages); |
138 | out: | 143 | out: |
139 | return err; | 144 | return err; |
140 | } | 145 | } |
@@ -149,8 +154,8 @@ static void unmap_switcher(void) | |||
149 | vunmap(switcher_vma->addr); | 154 | vunmap(switcher_vma->addr); |
150 | /* Now we just need to free the pages we copied the switcher into */ | 155 | /* Now we just need to free the pages we copied the switcher into */ |
151 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | 156 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) |
152 | __free_pages(switcher_page[i], 0); | 157 | __free_pages(lg_switcher_pages[i], 0); |
153 | kfree(switcher_page); | 158 | kfree(lg_switcher_pages); |
154 | } | 159 | } |
155 | 160 | ||
156 | /*H:032 | 161 | /*H:032 |
@@ -323,15 +328,10 @@ static int __init init(void) | |||
323 | if (err) | 328 | if (err) |
324 | goto out; | 329 | goto out; |
325 | 330 | ||
326 | /* Now we set up the pagetable implementation for the Guests. */ | ||
327 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | ||
328 | if (err) | ||
329 | goto unmap; | ||
330 | |||
331 | /* We might need to reserve an interrupt vector. */ | 331 | /* We might need to reserve an interrupt vector. */ |
332 | err = init_interrupts(); | 332 | err = init_interrupts(); |
333 | if (err) | 333 | if (err) |
334 | goto free_pgtables; | 334 | goto unmap; |
335 | 335 | ||
336 | /* /dev/lguest needs to be registered. */ | 336 | /* /dev/lguest needs to be registered. */ |
337 | err = lguest_device_init(); | 337 | err = lguest_device_init(); |
@@ -346,8 +346,6 @@ static int __init init(void) | |||
346 | 346 | ||
347 | free_interrupts: | 347 | free_interrupts: |
348 | free_interrupts(); | 348 | free_interrupts(); |
349 | free_pgtables: | ||
350 | free_pagetables(); | ||
351 | unmap: | 349 | unmap: |
352 | unmap_switcher(); | 350 | unmap_switcher(); |
353 | out: | 351 | out: |
@@ -359,7 +357,6 @@ static void __exit fini(void) | |||
359 | { | 357 | { |
360 | lguest_device_remove(); | 358 | lguest_device_remove(); |
361 | free_interrupts(); | 359 | free_interrupts(); |
362 | free_pagetables(); | ||
363 | unmap_switcher(); | 360 | unmap_switcher(); |
364 | 361 | ||
365 | lguest_arch_host_fini(); | 362 | lguest_arch_host_fini(); |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@ | |||
14 | 14 | ||
15 | #include <asm/lguest.h> | 15 | #include <asm/lguest.h> |
16 | 16 | ||
17 | void free_pagetables(void); | ||
18 | int init_pagetables(struct page **switcher_page, unsigned int pages); | ||
19 | |||
20 | struct pgdir { | 17 | struct pgdir { |
21 | unsigned long gpgdir; | 18 | unsigned long gpgdir; |
19 | bool switcher_mapped; | ||
20 | int last_host_cpu; | ||
22 | pgd_t *pgdir; | 21 | pgd_t *pgdir; |
23 | }; | 22 | }; |
24 | 23 | ||
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, | |||
124 | unsigned long addr, unsigned long len); | 123 | unsigned long addr, unsigned long len); |
125 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); | 124 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); |
126 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); | 125 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); |
126 | extern struct page **lg_switcher_pages; | ||
127 | 127 | ||
128 | /*H:035 | 128 | /*H:035 |
129 | * Using memory-copy operations like that is usually inconvient, so we | 129 | * Using memory-copy operations like that is usually inconvient, so we |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
250 | */ | 250 | */ |
251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | 251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) |
252 | { | 252 | { |
253 | /* We have a limited number the number of CPUs in the lguest struct. */ | 253 | /* We have a limited number of CPUs in the lguest struct. */ |
254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) | 254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) |
255 | return -EINVAL; | 255 | return -EINVAL; |
256 | 256 | ||
257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ | 257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ |
258 | cpu->id = id; | 258 | cpu->id = id; |
259 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | 259 | cpu->lg = container_of(cpu, struct lguest, cpus[id]); |
260 | cpu->lg->nr_cpus++; | 260 | cpu->lg->nr_cpus++; |
261 | 261 | ||
262 | /* Each CPU has a timer it can set. */ | 262 | /* Each CPU has a timer it can set. */ |
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
270 | if (!cpu->regs_page) | 270 | if (!cpu->regs_page) |
271 | return -ENOMEM; | 271 | return -ENOMEM; |
272 | 272 | ||
273 | /* We actually put the registers at the bottom of the page. */ | 273 | /* We actually put the registers at the end of the page. */ |
274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); | 274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); |
275 | 275 | ||
276 | /* | 276 | /* |
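The lg_cpu_start() hunk above replaces hand-rolled pointer arithmetic with container_of() on the array member itself, which is what the "improve code readability" commit in the shortlog refers to. Below is a standalone sketch, with made-up struct names, showing that both expressions recover the same enclosing structure; note that a variable array index inside offsetof() is a GCC/Clang extension the kernel relies on.

```c
/* Illustration only: container_of() applied to an array member. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cpu { int id; };
struct vm  { struct cpu cpus[4]; };

int main(void)
{
	struct vm vm;
	unsigned int id = 2;
	struct cpu *cpu = &vm.cpus[id];

	/* Old style: step back to cpus[0] by hand, then use container_of(). */
	struct vm *old_way = container_of(cpu - id, struct vm, cpus[0]);
	/* New style: name the member we actually hold a pointer to. */
	struct vm *new_way = container_of(cpu, struct vm, cpus[id]);

	printf("%p %p %p\n", (void *)&vm, (void *)old_way, (void *)new_way);
	return 0;
}
```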
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@ | |||
7 | * converted Guest pages when running the Guest. | 7 | * converted Guest pages when running the Guest. |
8 | :*/ | 8 | :*/ |
9 | 9 | ||
10 | /* Copyright (C) Rusty Russell IBM Corporation 2006. | 10 | /* Copyright (C) Rusty Russell IBM Corporation 2013. |
11 | * GPL v2 and any later version */ | 11 | * GPL v2 and any later version */ |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
@@ -62,22 +62,11 @@ | |||
62 | * will need the last pmd entry of the last pmd page. | 62 | * will need the last pmd entry of the last pmd page. |
63 | */ | 63 | */ |
64 | #ifdef CONFIG_X86_PAE | 64 | #ifdef CONFIG_X86_PAE |
65 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
66 | #define RESERVE_MEM 2U | ||
67 | #define CHECK_GPGD_MASK _PAGE_PRESENT | 65 | #define CHECK_GPGD_MASK _PAGE_PRESENT |
68 | #else | 66 | #else |
69 | #define RESERVE_MEM 4U | ||
70 | #define CHECK_GPGD_MASK _PAGE_TABLE | 67 | #define CHECK_GPGD_MASK _PAGE_TABLE |
71 | #endif | 68 | #endif |
72 | 69 | ||
73 | /* | ||
74 | * We actually need a separate PTE page for each CPU. Remember that after the | ||
75 | * Switcher code itself comes two pages for each CPU, and we don't want this | ||
76 | * CPU's guest to see the pages of any other CPU. | ||
77 | */ | ||
78 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | ||
79 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | ||
80 | |||
81 | /*H:320 | 70 | /*H:320 |
82 | * The page table code is curly enough to need helper functions to keep it | 71 | * The page table code is curly enough to need helper functions to keep it |
83 | * clear and clean. The kernel itself provides many of them; one advantage | 72 | * clear and clean. The kernel itself provides many of them; one advantage |
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
95 | { | 84 | { |
96 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
97 | 86 | ||
98 | #ifndef CONFIG_X86_PAE | ||
99 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
100 | if (index >= SWITCHER_PGD_INDEX) { | ||
101 | kill_guest(cpu, "attempt to access switcher pages"); | ||
102 | index = 0; | ||
103 | } | ||
104 | #endif | ||
105 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 87 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
106 | return &cpu->lg->pgdirs[i].pgdir[index]; | 88 | return &cpu->lg->pgdirs[i].pgdir[index]; |
107 | } | 89 | } |
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
117 | unsigned int index = pmd_index(vaddr); | 99 | unsigned int index = pmd_index(vaddr); |
118 | pmd_t *page; | 100 | pmd_t *page; |
119 | 101 | ||
120 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
121 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
122 | index >= SWITCHER_PMD_INDEX) { | ||
123 | kill_guest(cpu, "attempt to access switcher pages"); | ||
124 | index = 0; | ||
125 | } | ||
126 | |||
127 | /* You should never call this if the PGD entry wasn't valid */ | 102 | /* You should never call this if the PGD entry wasn't valid */ |
128 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 103 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
129 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 104 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte) | |||
275 | } | 250 | } |
276 | /*:*/ | 251 | /*:*/ |
277 | 252 | ||
278 | static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | 253 | static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) |
279 | { | 254 | { |
280 | if ((pte_flags(gpte) & _PAGE_PSE) || | 255 | if ((pte_flags(gpte) & _PAGE_PSE) || |
281 | pte_pfn(gpte) >= cpu->lg->pfn_limit) | 256 | pte_pfn(gpte) >= cpu->lg->pfn_limit) { |
282 | kill_guest(cpu, "bad page table entry"); | 257 | kill_guest(cpu, "bad page table entry"); |
258 | return false; | ||
259 | } | ||
260 | return true; | ||
283 | } | 261 | } |
284 | 262 | ||
285 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 263 | static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
286 | { | 264 | { |
287 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || | 265 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
288 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 266 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { |
289 | kill_guest(cpu, "bad page directory entry"); | 267 | kill_guest(cpu, "bad page directory entry"); |
268 | return false; | ||
269 | } | ||
270 | return true; | ||
290 | } | 271 | } |
291 | 272 | ||
292 | #ifdef CONFIG_X86_PAE | 273 | #ifdef CONFIG_X86_PAE |
293 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | 274 | static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) |
294 | { | 275 | { |
295 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | 276 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || |
296 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | 277 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { |
297 | kill_guest(cpu, "bad page middle directory entry"); | 278 | kill_guest(cpu, "bad page middle directory entry"); |
279 | return false; | ||
280 | } | ||
281 | return true; | ||
298 | } | 282 | } |
299 | #endif | 283 | #endif |
300 | 284 | ||
301 | /*H:330 | 285 | /*H:331 |
302 | * (i) Looking up a page table entry when the Guest faults. | 286 | * This is the core routine to walk the shadow page tables and find the page |
303 | * | 287 | * table entry for a specific address. |
304 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
305 | * come here. That's because we only set up the shadow page tables lazily as | ||
306 | * they're needed, so we get page faults all the time and quietly fix them up | ||
307 | * and return to the Guest without it knowing. | ||
308 | * | 288 | * |
309 | * If we fixed up the fault (ie. we mapped the address), this routine returns | 289 | * If allocate is set, then we allocate any missing levels, setting the flags |
310 | * true. Otherwise, it was a real fault and we need to tell the Guest. | 290 | * on the new page directory and mid-level directories using the arguments |
291 | * (which are copied from the Guest's page table entries). | ||
311 | */ | 292 | */ |
312 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | 293 | static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, |
294 | int pgd_flags, int pmd_flags) | ||
313 | { | 295 | { |
314 | pgd_t gpgd; | ||
315 | pgd_t *spgd; | 296 | pgd_t *spgd; |
316 | unsigned long gpte_ptr; | ||
317 | pte_t gpte; | ||
318 | pte_t *spte; | ||
319 | |||
320 | /* Mid level for PAE. */ | 297 | /* Mid level for PAE. */ |
321 | #ifdef CONFIG_X86_PAE | 298 | #ifdef CONFIG_X86_PAE |
322 | pmd_t *spmd; | 299 | pmd_t *spmd; |
323 | pmd_t gpmd; | ||
324 | #endif | 300 | #endif |
325 | 301 | ||
326 | /* First step: get the top-level Guest page table entry. */ | 302 | /* Get top level entry. */ |
327 | if (unlikely(cpu->linear_pages)) { | ||
328 | /* Faking up a linear mapping. */ | ||
329 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
330 | } else { | ||
331 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
332 | /* Toplevel not present? We can't map it in. */ | ||
333 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
334 | return false; | ||
335 | } | ||
336 | |||
337 | /* Now look at the matching shadow entry. */ | ||
338 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 303 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
339 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { | 304 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
340 | /* No shadow entry: allocate a new shadow PTE page. */ | 305 | /* No shadow entry: allocate a new shadow PTE page. */ |
341 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 306 | unsigned long ptepage; |
307 | |||
308 | /* If they didn't want us to allocate anything, stop. */ | ||
309 | if (!allocate) | ||
310 | return NULL; | ||
311 | |||
312 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
342 | /* | 313 | /* |
343 | * This is not really the Guest's fault, but killing it is | 314 | * This is not really the Guest's fault, but killing it is |
344 | * simple for this corner case. | 315 | * simple for this corner case. |
345 | */ | 316 | */ |
346 | if (!ptepage) { | 317 | if (!ptepage) { |
347 | kill_guest(cpu, "out of memory allocating pte page"); | 318 | kill_guest(cpu, "out of memory allocating pte page"); |
348 | return false; | 319 | return NULL; |
349 | } | 320 | } |
350 | /* We check that the Guest pgd is OK. */ | ||
351 | check_gpgd(cpu, gpgd); | ||
352 | /* | 321 | /* |
353 | * And we copy the flags to the shadow PGD entry. The page | 322 | * And we copy the flags to the shadow PGD entry. The page |
354 | * number in the shadow PGD is the page we just allocated. | 323 | * number in the shadow PGD is the page we just allocated. |
355 | */ | 324 | */ |
356 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); | 325 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); |
357 | } | 326 | } |
358 | 327 | ||
328 | /* | ||
329 | * Intel's Physical Address Extension actually uses three levels of | ||
330 | * page tables, so we need to look in the mid-level. | ||
331 | */ | ||
359 | #ifdef CONFIG_X86_PAE | 332 | #ifdef CONFIG_X86_PAE |
360 | if (unlikely(cpu->linear_pages)) { | 333 | /* Now look at the mid-level shadow entry. */ |
361 | /* Faking up a linear mapping. */ | ||
362 | gpmd = __pmd(_PAGE_TABLE); | ||
363 | } else { | ||
364 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
365 | /* Middle level not present? We can't map it in. */ | ||
366 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
367 | return false; | ||
368 | } | ||
369 | |||
370 | /* Now look at the matching shadow entry. */ | ||
371 | spmd = spmd_addr(cpu, *spgd, vaddr); | 334 | spmd = spmd_addr(cpu, *spgd, vaddr); |
372 | 335 | ||
373 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | 336 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { |
374 | /* No shadow entry: allocate a new shadow PTE page. */ | 337 | /* No shadow entry: allocate a new shadow PTE page. */ |
375 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 338 | unsigned long ptepage; |
339 | |||
340 | /* If they didn't want us to allocate anything, stop. */ | ||
341 | if (!allocate) | ||
342 | return NULL; | ||
343 | |||
344 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
376 | 345 | ||
377 | /* | 346 | /* |
378 | * This is not really the Guest's fault, but killing it is | 347 | * This is not really the Guest's fault, but killing it is |
379 | * simple for this corner case. | 348 | * simple for this corner case. |
380 | */ | 349 | */ |
381 | if (!ptepage) { | 350 | if (!ptepage) { |
382 | kill_guest(cpu, "out of memory allocating pte page"); | 351 | kill_guest(cpu, "out of memory allocating pmd page"); |
383 | return false; | 352 | return NULL; |
384 | } | 353 | } |
385 | 354 | ||
386 | /* We check that the Guest pmd is OK. */ | ||
387 | check_gpmd(cpu, gpmd); | ||
388 | |||
389 | /* | 355 | /* |
390 | * And we copy the flags to the shadow PMD entry. The page | 356 | * And we copy the flags to the shadow PMD entry. The page |
391 | * number in the shadow PMD is the page we just allocated. | 357 | * number in the shadow PMD is the page we just allocated. |
392 | */ | 358 | */ |
393 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | 359 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); |
360 | } | ||
361 | #endif | ||
362 | |||
363 | /* Get the pointer to the shadow PTE entry we're going to set. */ | ||
364 | return spte_addr(cpu, *spgd, vaddr); | ||
365 | } | ||
366 | |||
367 | /*H:330 | ||
368 | * (i) Looking up a page table entry when the Guest faults. | ||
369 | * | ||
370 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
371 | * come here. That's because we only set up the shadow page tables lazily as | ||
372 | * they're needed, so we get page faults all the time and quietly fix them up | ||
373 | * and return to the Guest without it knowing. | ||
374 | * | ||
375 | * If we fixed up the fault (ie. we mapped the address), this routine returns | ||
376 | * true. Otherwise, it was a real fault and we need to tell the Guest. | ||
377 | */ | ||
378 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | ||
379 | { | ||
380 | unsigned long gpte_ptr; | ||
381 | pte_t gpte; | ||
382 | pte_t *spte; | ||
383 | pmd_t gpmd; | ||
384 | pgd_t gpgd; | ||
385 | |||
386 | /* We never demand page the Switcher, so trying is a mistake. */ | ||
387 | if (vaddr >= switcher_addr) | ||
388 | return false; | ||
389 | |||
390 | /* First step: get the top-level Guest page table entry. */ | ||
391 | if (unlikely(cpu->linear_pages)) { | ||
392 | /* Faking up a linear mapping. */ | ||
393 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
394 | } else { | ||
395 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
396 | /* Toplevel not present? We can't map it in. */ | ||
397 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
398 | return false; | ||
399 | |||
400 | /* | ||
401 | * This kills the Guest if it has weird flags or tries to | ||
402 | * refer to a "physical" address outside the bounds. | ||
403 | */ | ||
404 | if (!check_gpgd(cpu, gpgd)) | ||
405 | return false; | ||
406 | } | ||
407 | |||
408 | /* This "mid-level" entry is only used for non-linear, PAE mode. */ | ||
409 | gpmd = __pmd(_PAGE_TABLE); | ||
410 | |||
411 | #ifdef CONFIG_X86_PAE | ||
412 | if (likely(!cpu->linear_pages)) { | ||
413 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
414 | /* Middle level not present? We can't map it in. */ | ||
415 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
416 | return false; | ||
417 | |||
418 | /* | ||
419 | * This kills the Guest if it has weird flags or tries to | ||
420 | * refer to a "physical" address outside the bounds. | ||
421 | */ | ||
422 | if (!check_gpmd(cpu, gpmd)) | ||
423 | return false; | ||
394 | } | 424 | } |
395 | 425 | ||
396 | /* | 426 | /* |
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
433 | * Check that the Guest PTE flags are OK, and the page number is below | 463 | * Check that the Guest PTE flags are OK, and the page number is below |
434 | * the pfn_limit (ie. not mapping the Launcher binary). | 464 | * the pfn_limit (ie. not mapping the Launcher binary). |
435 | */ | 465 | */ |
436 | check_gpte(cpu, gpte); | 466 | if (!check_gpte(cpu, gpte)) |
467 | return false; | ||
437 | 468 | ||
438 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 469 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
439 | gpte = pte_mkyoung(gpte); | 470 | gpte = pte_mkyoung(gpte); |
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
441 | gpte = pte_mkdirty(gpte); | 472 | gpte = pte_mkdirty(gpte); |
442 | 473 | ||
443 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 474 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
444 | spte = spte_addr(cpu, *spgd, vaddr); | 475 | spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); |
476 | if (!spte) | ||
477 | return false; | ||
445 | 478 | ||
446 | /* | 479 | /* |
447 | * If there was a valid shadow PTE entry here before, we release it. | 480 | * If there was a valid shadow PTE entry here before, we release it. |
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
493 | */ | 526 | */ |
494 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | 527 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) |
495 | { | 528 | { |
496 | pgd_t *spgd; | 529 | pte_t *spte; |
497 | unsigned long flags; | 530 | unsigned long flags; |
498 | 531 | ||
499 | #ifdef CONFIG_X86_PAE | 532 | /* You can't put your stack in the Switcher! */ |
500 | pmd_t *spmd; | 533 | if (vaddr >= switcher_addr) |
501 | #endif | ||
502 | /* Look at the current top level entry: is it present? */ | ||
503 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | ||
504 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | ||
505 | return false; | 534 | return false; |
506 | 535 | ||
507 | #ifdef CONFIG_X86_PAE | 536 | /* If there's no shadow PTE, it's not writable. */ |
508 | spmd = spmd_addr(cpu, *spgd, vaddr); | 537 | spte = find_spte(cpu, vaddr, false, 0, 0); |
509 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | 538 | if (!spte) |
510 | return false; | 539 | return false; |
511 | #endif | ||
512 | 540 | ||
513 | /* | 541 | /* |
514 | * Check the flags on the pte entry itself: it must be present and | 542 | * Check the flags on the pte entry itself: it must be present and |
515 | * writable. | 543 | * writable. |
516 | */ | 544 | */ |
517 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); | 545 | flags = pte_flags(*spte); |
518 | |||
519 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 546 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
520 | } | 547 | } |
521 | 548 | ||
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
678 | int *blank_pgdir) | 705 | int *blank_pgdir) |
679 | { | 706 | { |
680 | unsigned int next; | 707 | unsigned int next; |
681 | #ifdef CONFIG_X86_PAE | ||
682 | pmd_t *pmd_table; | ||
683 | #endif | ||
684 | 708 | ||
685 | /* | 709 | /* |
686 | * We pick one entry at random to throw out. Choosing the Least | 710 | * We pick one entry at random to throw out. Choosing the Least |
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
695 | if (!cpu->lg->pgdirs[next].pgdir) | 719 | if (!cpu->lg->pgdirs[next].pgdir) |
696 | next = cpu->cpu_pgd; | 720 | next = cpu->cpu_pgd; |
697 | else { | 721 | else { |
698 | #ifdef CONFIG_X86_PAE | ||
699 | /* | 722 | /* |
700 | * In PAE mode, allocate a pmd page and populate the | 723 | * This is a blank page, so there are no kernel |
701 | * last pgd entry. | 724 | * mappings: caller must map the stack! |
702 | */ | 725 | */ |
703 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
704 | if (!pmd_table) { | ||
705 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
706 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
707 | next = cpu->cpu_pgd; | ||
708 | } else { | ||
709 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
710 | SWITCHER_PGD_INDEX, | ||
711 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
712 | /* | ||
713 | * This is a blank page, so there are no kernel | ||
714 | * mappings: caller must map the stack! | ||
715 | */ | ||
716 | *blank_pgdir = 1; | ||
717 | } | ||
718 | #else | ||
719 | *blank_pgdir = 1; | 726 | *blank_pgdir = 1; |
720 | #endif | ||
721 | } | 727 | } |
722 | } | 728 | } |
723 | /* Record which Guest toplevel this shadows. */ | 729 | /* Record which Guest toplevel this shadows. */ |
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
725 | /* Release all the non-kernel mappings. */ | 731 | /* Release all the non-kernel mappings. */ |
726 | flush_user_mappings(cpu->lg, next); | 732 | flush_user_mappings(cpu->lg, next); |
727 | 733 | ||
734 | /* This hasn't run on any CPU at all. */ | ||
735 | cpu->lg->pgdirs[next].last_host_cpu = -1; | ||
736 | |||
728 | return next; | 737 | return next; |
729 | } | 738 | } |
730 | 739 | ||
740 | /*H:501 | ||
741 | * We do need the Switcher code mapped at all times, so we allocate that | ||
742 | * part of the Guest page table here. We map the Switcher code immediately, | ||
743 | * but defer mapping of the guest register page and IDT/LDT etc page until | ||
744 | * just before we run the guest in map_switcher_in_guest(). | ||
745 | * | ||
746 | * We *could* do this setup in map_switcher_in_guest(), but at that point | ||
747 | * we've interrupts disabled, and allocating pages like that is fraught: we | ||
748 | * can't sleep if we need to free up some memory. | ||
749 | */ | ||
750 | static bool allocate_switcher_mapping(struct lg_cpu *cpu) | ||
751 | { | ||
752 | int i; | ||
753 | |||
754 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | ||
755 | pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, | ||
756 | CHECK_GPGD_MASK, _PAGE_TABLE); | ||
757 | if (!pte) | ||
758 | return false; | ||
759 | |||
760 | /* | ||
761 | * Map the switcher page if not already there. It might | ||
762 | * already be there because we call allocate_switcher_mapping() | ||
763 | * in guest_set_pgd() just in case it did discard our Switcher | ||
764 | * mapping, but it probably didn't. | ||
765 | */ | ||
766 | if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { | ||
767 | /* Get a reference to the Switcher page. */ | ||
768 | get_page(lg_switcher_pages[0]); | ||
769 | /* Create a read-only, exectuable, kernel-style PTE */ | ||
770 | set_pte(pte, | ||
771 | mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); | ||
772 | } | ||
773 | } | ||
774 | cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; | ||
775 | return true; | ||
776 | } | ||
777 | |||
731 | /*H:470 | 778 | /*H:470 |
732 | * Finally, a routine which throws away everything: all PGD entries in all | 779 | * Finally, a routine which throws away everything: all PGD entries in all |
733 | * the shadow page tables, including the Guest's kernel mappings. This is used | 780 | * the shadow page tables, including the Guest's kernel mappings. This is used |
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) | |||
738 | unsigned int i, j; | 785 | unsigned int i, j; |
739 | 786 | ||
740 | /* Every shadow pagetable this Guest has */ | 787 | /* Every shadow pagetable this Guest has */ |
741 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 788 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { |
742 | if (lg->pgdirs[i].pgdir) { | 789 | if (!lg->pgdirs[i].pgdir) |
743 | #ifdef CONFIG_X86_PAE | 790 | continue; |
744 | pgd_t *spgd; | 791 | |
745 | pmd_t *pmdpage; | 792 | /* Every PGD entry. */ |
746 | unsigned int k; | 793 | for (j = 0; j < PTRS_PER_PGD; j++) |
747 | 794 | release_pgd(lg->pgdirs[i].pgdir + j); | |
748 | /* Get the last pmd page. */ | 795 | lg->pgdirs[i].switcher_mapped = false; |
749 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | 796 | lg->pgdirs[i].last_host_cpu = -1; |
750 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | 797 | } |
751 | |||
752 | /* | ||
753 | * And release the pmd entries of that pmd page, | ||
754 | * except for the switcher pmd. | ||
755 | */ | ||
756 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
757 | release_pmd(&pmdpage[k]); | ||
758 | #endif | ||
759 | /* Every PGD entry except the Switcher at the top */ | ||
760 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | ||
761 | release_pgd(lg->pgdirs[i].pgdir + j); | ||
762 | } | ||
763 | } | 798 | } |
764 | 799 | ||
765 | /* | 800 | /* |
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) | |||
773 | release_all_pagetables(cpu->lg); | 808 | release_all_pagetables(cpu->lg); |
774 | /* We need the Guest kernel stack mapped again. */ | 809 | /* We need the Guest kernel stack mapped again. */ |
775 | pin_stack_pages(cpu); | 810 | pin_stack_pages(cpu); |
811 | /* And we need Switcher allocated. */ | ||
812 | if (!allocate_switcher_mapping(cpu)) | ||
813 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
776 | } | 814 | } |
777 | 815 | ||
778 | /*H:430 | 816 | /*H:430 |
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | |||
808 | newpgdir = new_pgdir(cpu, pgtable, &repin); | 846 | newpgdir = new_pgdir(cpu, pgtable, &repin); |
809 | /* Change the current pgd index to the new one. */ | 847 | /* Change the current pgd index to the new one. */ |
810 | cpu->cpu_pgd = newpgdir; | 848 | cpu->cpu_pgd = newpgdir; |
811 | /* If it was completely blank, we map in the Guest kernel stack */ | 849 | /* |
850 | * If it was completely blank, we map in the Guest kernel stack and | ||
851 | * the Switcher. | ||
852 | */ | ||
812 | if (repin) | 853 | if (repin) |
813 | pin_stack_pages(cpu); | 854 | pin_stack_pages(cpu); |
855 | |||
856 | if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { | ||
857 | if (!allocate_switcher_mapping(cpu)) | ||
858 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
859 | } | ||
814 | } | 860 | } |
815 | /*:*/ | 861 | /*:*/ |
816 | 862 | ||
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
865 | * micro-benchmark. | 911 | * micro-benchmark. |
866 | */ | 912 | */ |
867 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 913 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
868 | check_gpte(cpu, gpte); | 914 | if (!check_gpte(cpu, gpte)) |
915 | return; | ||
869 | set_pte(spte, | 916 | set_pte(spte, |
870 | gpte_to_spte(cpu, gpte, | 917 | gpte_to_spte(cpu, gpte, |
871 | pte_flags(gpte) & _PAGE_DIRTY)); | 918 | pte_flags(gpte) & _PAGE_DIRTY)); |
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
897 | void guest_set_pte(struct lg_cpu *cpu, | 944 | void guest_set_pte(struct lg_cpu *cpu, |
898 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) | 945 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
899 | { | 946 | { |
947 | /* We don't let you remap the Switcher; we need it to get back! */ | ||
948 | if (vaddr >= switcher_addr) { | ||
949 | kill_guest(cpu, "attempt to set pte into Switcher pages"); | ||
950 | return; | ||
951 | } | ||
952 | |||
900 | /* | 953 | /* |
901 | * Kernel mappings must be changed on all top levels. Slow, but doesn't | 954 | * Kernel mappings must be changed on all top levels. Slow, but doesn't |
902 | * happen often. | 955 | * happen often. |
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
933 | { | 986 | { |
934 | int pgdir; | 987 | int pgdir; |
935 | 988 | ||
936 | if (idx >= SWITCHER_PGD_INDEX) | 989 | if (idx > PTRS_PER_PGD) { |
990 | kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", | ||
991 | idx, PTRS_PER_PGD); | ||
937 | return; | 992 | return; |
993 | } | ||
938 | 994 | ||
939 | /* If they're talking about a page table we have a shadow for... */ | 995 | /* If they're talking about a page table we have a shadow for... */ |
940 | pgdir = find_pgdir(lg, gpgdir); | 996 | pgdir = find_pgdir(lg, gpgdir); |
941 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 997 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) { |
942 | /* ... throw it away. */ | 998 | /* ... throw it away. */ |
943 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 999 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
1000 | /* That might have been the Switcher mapping, remap it. */ | ||
1001 | if (!allocate_switcher_mapping(&lg->cpus[0])) { | ||
1002 | kill_guest(&lg->cpus[0], | ||
1003 | "Cannot populate switcher mapping"); | ||
1004 | } | ||
1005 | } | ||
944 | } | 1006 | } |
945 | 1007 | ||
946 | #ifdef CONFIG_X86_PAE | 1008 | #ifdef CONFIG_X86_PAE |
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | |||
958 | * we will populate on future faults. The Guest doesn't have any actual | 1020 | * we will populate on future faults. The Guest doesn't have any actual |
959 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it | 1021 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it |
960 | * for the moment. | 1022 | * for the moment. |
1023 | * | ||
1024 | * We do need the Switcher to be mapped at all times, so we allocate that | ||
1025 | * part of the Guest page table here. | ||
961 | */ | 1026 | */ |
962 | int init_guest_pagetable(struct lguest *lg) | 1027 | int init_guest_pagetable(struct lguest *lg) |
963 | { | 1028 | { |
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) | |||
971 | 1036 | ||
972 | /* We start with a linear mapping until the initialize. */ | 1037 | /* We start with a linear mapping until the initialize. */ |
973 | cpu->linear_pages = true; | 1038 | cpu->linear_pages = true; |
1039 | |||
1040 | /* Allocate the page tables for the Switcher. */ | ||
1041 | if (!allocate_switcher_mapping(cpu)) { | ||
1042 | release_all_pagetables(lg); | ||
1043 | return -ENOMEM; | ||
1044 | } | ||
1045 | |||
974 | return 0; | 1046 | return 0; |
975 | } | 1047 | } |
976 | 1048 | ||
977 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1049 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
978 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1050 | void page_table_guest_data_init(struct lg_cpu *cpu) |
979 | { | 1051 | { |
1052 | /* | ||
1053 | * We tell the Guest that it can't use the virtual addresses | ||
1054 | * used by the Switcher. This trick is equivalent to 4GB - | ||
1055 | * switcher_addr. | ||
1056 | */ | ||
1057 | u32 top = ~switcher_addr + 1; | ||
1058 | |||
980 | /* We get the kernel address: above this is all kernel memory. */ | 1059 | /* We get the kernel address: above this is all kernel memory. */ |
981 | if (get_user(cpu->lg->kernel_address, | 1060 | if (get_user(cpu->lg->kernel_address, |
982 | &cpu->lg->lguest_data->kernel_address) | 1061 | &cpu->lg->lguest_data->kernel_address) |
983 | /* | 1062 | /* |
984 | * We tell the Guest that it can't use the top 2 or 4 MB | 1063 | * We tell the Guest that it can't use the top virtual |
985 | * of virtual addresses used by the Switcher. | 1064 | * addresses (used by the Switcher). |
986 | */ | 1065 | */ |
987 | || put_user(RESERVE_MEM * 1024 * 1024, | 1066 | || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { |
988 | &cpu->lg->lguest_data->reserve_mem)) { | ||
989 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 1067 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
990 | return; | 1068 | return; |
991 | } | 1069 | } |
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
995 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 1073 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
996 | * Switcher mappings, so check that now. | 1074 | * Switcher mappings, so check that now. |
997 | */ | 1075 | */ |
998 | #ifdef CONFIG_X86_PAE | 1076 | if (cpu->lg->kernel_address >= switcher_addr) |
999 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
1000 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
1001 | #else | ||
1002 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | ||
1003 | #endif | ||
1004 | kill_guest(cpu, "bad kernel address %#lx", | 1077 | kill_guest(cpu, "bad kernel address %#lx", |
1005 | cpu->lg->kernel_address); | 1078 | cpu->lg->kernel_address); |
1006 | } | 1079 | } |
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) | |||
1017 | free_page((long)lg->pgdirs[i].pgdir); | 1090 | free_page((long)lg->pgdirs[i].pgdir); |
1018 | } | 1091 | } |
1019 | 1092 | ||
1020 | /*H:480 | 1093 | /*H:481 |
1021 | * (vi) Mapping the Switcher when the Guest is about to run. | 1094 | * This clears the Switcher mappings for cpu #i. |
1022 | * | ||
1023 | * The Switcher and the two pages for this CPU need to be visible in the | ||
1024 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | ||
1025 | * for each CPU already set up, we just need to hook them in now we know which | ||
1026 | * Guest is about to run on this CPU. | ||
1027 | */ | 1095 | */ |
1028 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 1096 | static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) |
1029 | { | 1097 | { |
1030 | pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); | 1098 | unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; |
1031 | pte_t regs_pte; | 1099 | pte_t *pte; |
1032 | 1100 | ||
1033 | #ifdef CONFIG_X86_PAE | 1101 | /* Clear the mappings for both pages. */ |
1034 | pmd_t switcher_pmd; | 1102 | pte = find_spte(cpu, base, false, 0, 0); |
1035 | pmd_t *pmd_table; | 1103 | release_pte(*pte); |
1036 | 1104 | set_pte(pte, __pte(0)); | |
1037 | switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, | ||
1038 | PAGE_KERNEL_EXEC); | ||
1039 | |||
1040 | /* Figure out where the pmd page is, by reading the PGD, and converting | ||
1041 | * it to a virtual address. */ | ||
1042 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
1043 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
1044 | << PAGE_SHIFT); | ||
1045 | /* Now write it into the shadow page table. */ | ||
1046 | set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
1047 | #else | ||
1048 | pgd_t switcher_pgd; | ||
1049 | 1105 | ||
1050 | /* | 1106 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1051 | * Make the last PGD entry for this Guest point to the Switcher's PTE | 1107 | release_pte(*pte); |
1052 | * page for this CPU (with appropriate flags). | 1108 | set_pte(pte, __pte(0)); |
1053 | */ | ||
1054 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); | ||
1055 | |||
1056 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | ||
1057 | |||
1058 | #endif | ||
1059 | /* | ||
1060 | * We also change the Switcher PTE page. When we're running the Guest, | ||
1061 | * we want the Guest's "regs" page to appear where the first Switcher | ||
1062 | * page for this CPU is. This is an optimization: when the Switcher | ||
1063 | * saves the Guest registers, it saves them into the first page of this | ||
1064 | * CPU's "struct lguest_pages": if we make sure the Guest's register | ||
1065 | * page is already mapped there, we don't have to copy them out | ||
1066 | * again. | ||
1067 | */ | ||
1068 | regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); | ||
1069 | set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); | ||
1070 | } | 1109 | } |
1071 | /*:*/ | ||
1072 | 1110 | ||
1073 | static void free_switcher_pte_pages(void) | 1111 | /*H:480 |
1074 | { | 1112 | * (vi) Mapping the Switcher when the Guest is about to run. |
1075 | unsigned int i; | 1113 | * |
1076 | 1114 | * The Switcher and the two pages for this CPU need to be visible in the Guest | |
1077 | for_each_possible_cpu(i) | 1115 | * (and not the pages for other CPUs). |
1078 | free_page((long)switcher_pte_page(i)); | ||
1079 | } | ||
1080 | |||
1081 | /*H:520 | ||
1082 | * Setting up the Switcher PTE page for given CPU is fairly easy, given | ||
1083 | * the CPU number and the "struct page"s for the Switcher code itself. | ||
1084 | * | 1116 | * |
1085 | * Currently the Switcher is less than a page long, so "pages" is always 1. | 1117 | * The pages for the pagetables have all been allocated before: we just need |
1118 | * to make sure the actual PTEs are up-to-date for the CPU we're about to run | ||
1119 | * on. | ||
1086 | */ | 1120 | */ |
1087 | static __init void populate_switcher_pte_page(unsigned int cpu, | 1121 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
1088 | struct page *switcher_page[], | ||
1089 | unsigned int pages) | ||
1090 | { | 1122 | { |
1091 | unsigned int i; | 1123 | unsigned long base; |
1092 | pte_t *pte = switcher_pte_page(cpu); | 1124 | struct page *percpu_switcher_page, *regs_page; |
1125 | pte_t *pte; | ||
1126 | struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; | ||
1127 | |||
1128 | /* Switcher page should always be mapped by now! */ | ||
1129 | BUG_ON(!pgdir->switcher_mapped); | ||
1130 | |||
1131 | /* | ||
1132 | * Remember that we have two pages for each Host CPU, so we can run a | ||
1133 | * Guest on each CPU without them interfering. We need to make sure | ||
1134 | * those pages are mapped correctly in the Guest, but since we usually | ||
1135 | * run on the same CPU, we cache that, and only update the mappings | ||
1136 | * when we move. | ||
1137 | */ | ||
1138 | if (pgdir->last_host_cpu == raw_smp_processor_id()) | ||
1139 | return; | ||
1093 | 1140 | ||
1094 | /* The first entries are easy: they map the Switcher code. */ | 1141 | /* -1 means unknown so we remove everything. */ |
1095 | for (i = 0; i < pages; i++) { | 1142 | if (pgdir->last_host_cpu == -1) { |
1096 | set_pte(&pte[i], mk_pte(switcher_page[i], | 1143 | unsigned int i; |
1097 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1144 | for_each_possible_cpu(i) |
1145 | remove_switcher_percpu_map(cpu, i); | ||
1146 | } else { | ||
1147 | /* We know exactly what CPU mapping to remove. */ | ||
1148 | remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); | ||
1098 | } | 1149 | } |
1099 | 1150 | ||
1100 | /* The only other thing we map is this CPU's pair of pages. */ | 1151 | /* |
1101 | i = pages + cpu*2; | 1152 | * When we're running the Guest, we want the Guest's "regs" page to |
1102 | 1153 | * appear where the first Switcher page for this CPU is. This is an | |
1103 | /* First page (Guest registers) is writable from the Guest */ | 1154 | * optimization: when the Switcher saves the Guest registers, it saves |
1104 | set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), | 1155 | * them into the first page of this CPU's "struct lguest_pages": if we |
1105 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); | 1156 | * make sure the Guest's register page is already mapped there, we |
1157 | * don't have to copy them out again. | ||
1158 | */ | ||
1159 | /* Find the shadow PTE for this regs page. */ | ||
1160 | base = switcher_addr + PAGE_SIZE | ||
1161 | + raw_smp_processor_id() * sizeof(struct lguest_pages); | ||
1162 | pte = find_spte(cpu, base, false, 0, 0); | ||
1163 | regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); | ||
1164 | get_page(regs_page); | ||
1165 | set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); | ||
1106 | 1166 | ||
1107 | /* | 1167 | /* |
1108 | * The second page contains the "struct lguest_ro_state", and is | 1168 | * We map the second page of the struct lguest_pages read-only in |
1109 | * read-only. | 1169 | * the Guest: the IDT, GDT and other things it's not supposed to |
1170 | * change. | ||
1110 | */ | 1171 | */ |
1111 | set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), | 1172 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1112 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1173 | percpu_switcher_page |
1174 | = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; | ||
1175 | get_page(percpu_switcher_page); | ||
1176 | set_pte(pte, mk_pte(percpu_switcher_page, | ||
1177 | __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); | ||
1178 | |||
1179 | pgdir->last_host_cpu = raw_smp_processor_id(); | ||
1113 | } | 1180 | } |
1114 | 1181 | ||
1115 | /* | 1182 | /*H:490 |
1116 | * We've made it through the page table code. Perhaps our tired brains are | 1183 | * We've made it through the page table code. Perhaps our tired brains are |
1117 | * still processing the details, or perhaps we're simply glad it's over. | 1184 | * still processing the details, or perhaps we're simply glad it's over. |
1118 | * | 1185 | * |
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
1124 | * | 1191 | * |
1125 | * There is just one file remaining in the Host. | 1192 | * There is just one file remaining in the Host. |
1126 | */ | 1193 | */ |
1127 | |||
1128 | /*H:510 | ||
1129 | * At boot or module load time, init_pagetables() allocates and populates | ||
1130 | * the Switcher PTE page for each CPU. | ||
1131 | */ | ||
1132 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | ||
1133 | { | ||
1134 | unsigned int i; | ||
1135 | |||
1136 | for_each_possible_cpu(i) { | ||
1137 | switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); | ||
1138 | if (!switcher_pte_page(i)) { | ||
1139 | free_switcher_pte_pages(); | ||
1140 | return -ENOMEM; | ||
1141 | } | ||
1142 | populate_switcher_pte_page(i, switcher_page, pages); | ||
1143 | } | ||
1144 | return 0; | ||
1145 | } | ||
1146 | /*:*/ | ||
1147 | |||
1148 | /* Cleaning up simply involves freeing the PTE page for each CPU. */ | ||
1149 | void free_pagetables(void) | ||
1150 | { | ||
1151 | free_switcher_pte_pages(); | ||
1152 | } | ||
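Most of the page_tables.c churn above funnels through one new helper, find_spte(), which walks the shadow page tables for a virtual address and, when asked, allocates any missing levels: demand_page() and allocate_switcher_mapping() call it with allocate set, while page_writable() and map_switcher_in_guest() only look up. A much-simplified, userspace-only sketch of that shape is below, using toy types and a two-level walk with no PAE handling.

```c
/* Illustration only: a toy two-level "shadow" walk in the spirit of
 * find_spte(). The real code manipulates hardware pagetable entries,
 * handles CONFIG_X86_PAE, and kills the Guest on allocation failure.
 */
#include <stdint.h>
#include <stdlib.h>

#define ENTRIES 1024

typedef uint32_t pte_t;
typedef struct { pte_t *ptes; } spgd_t;      /* toy top-level entry */

static spgd_t shadow[ENTRIES];               /* toy shadow top level */

static pte_t *find_spte(uint32_t vaddr, int allocate)
{
	spgd_t *spgd = &shadow[(vaddr >> 22) & (ENTRIES - 1)];

	if (!spgd->ptes) {
		if (!allocate)               /* lookup-only callers stop here */
			return NULL;
		spgd->ptes = calloc(ENTRIES, sizeof(pte_t));
		if (!spgd->ptes)
			return NULL;
	}
	return &spgd->ptes[(vaddr >> 12) & (ENTRIES - 1)];
}

int main(void)
{
	pte_t *w = find_spte(0x12345678, 1); /* like demand_page(): may allocate */
	pte_t *r = find_spte(0x12345678, 0); /* like page_writable(): lookup only */

	return (w && r && w == r) ? 0 : 1;
}
```

page_writable() in the diff is exactly the allocate=false case: a NULL return simply means the address has no shadow mapping yet.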
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct { | |||
59 | /* Offset from where switcher.S was compiled to where we've copied it */ | 59 | /* Offset from where switcher.S was compiled to where we've copied it */ |
60 | static unsigned long switcher_offset(void) | 60 | static unsigned long switcher_offset(void) |
61 | { | 61 | { |
62 | return SWITCHER_ADDR - (unsigned long)start_switcher_text; | 62 | return switcher_addr - (unsigned long)start_switcher_text; |
63 | } | 63 | } |
64 | 64 | ||
65 | /* This cpu's struct lguest_pages. */ | 65 | /* This cpu's struct lguest_pages (after the Switcher text page) */ |
66 | static struct lguest_pages *lguest_pages(unsigned int cpu) | 66 | static struct lguest_pages *lguest_pages(unsigned int cpu) |
67 | { | 67 | { |
68 | return &(((struct lguest_pages *) | 68 | return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); |
69 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | ||
70 | } | 69 | } |
71 | 70 | ||
72 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); | 71 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); |