summaryrefslogtreecommitdiffstats
path: root/drivers/lguest
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 17:14:04 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 17:14:04 -0400
commit 736a2dd2571ac56b11ed95a7814d838d5311be04 (patch)
tree de10d107025970c6e51d5b6faeba799ed4b9caae /drivers/lguest
parent 0b2e3b6bb4a415379f16e38fc92db42379be47a1 (diff)
parent 01d779a14ef800b74684d9692add4944df052461 (diff)
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
 "Lots of virtio work which wasn't quite ready for last merge window.

  Plus I dived into lguest again, reworking the pagetable code so we can
  move the switcher page: our fixmaps sometimes take more than 2MB now..."

Ugh. Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
  caif_virtio: Remove bouncing email addresses
  lguest: improve code readability in lg_cpu_start.
  virtio-net: fill only rx queues which are being used
  lguest: map Switcher below fixmap.
  lguest: cache last cpu we ran on.
  lguest: map Switcher text whenever we allocate a new pagetable.
  lguest: don't share Switcher PTE pages between guests.
  lguest: expost switcher_pages array (as lg_switcher_pages).
  lguest: extract shadow PTE walking / allocating.
  lguest: make check_gpte et. al return bool.
  lguest: assume Switcher text is a single page.
  lguest: rename switcher_page to switcher_pages.
  lguest: remove RESERVE_MEM constant.
  lguest: check vaddr not pgd for Switcher protection.
  lguest: prepare to make SWITCHER_ADDR a variable.
  virtio: console: replace EMFILE with EBUSY for already-open port
  virtio-scsi: reset virtqueue affinity when doing cpu hotplug
  virtio-scsi: introduce multiqueue support
  virtio-scsi: push vq lock/unlock into virtscsi_vq_done
  virtio-scsi: pass struct virtio_scsi to virtqueue completion function
  ...
Diffstat (limited to 'drivers/lguest')
-rw-r--r--  drivers/lguest/Kconfig          5
-rw-r--r--  drivers/lguest/core.c          67
-rw-r--r--  drivers/lguest/lg.h             6
-rw-r--r--  drivers/lguest/lguest_user.c    6
-rw-r--r--  drivers/lguest/page_tables.c  567
-rw-r--r--  drivers/lguest/x86/core.c       7
6 files changed, 347 insertions, 311 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
5 ---help--- 5 ---help---
6 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
7 multiple instances of the same Linux kernel, using the 7 multiple instances of the same Linux kernel, using the
8 "lguest" command found in the Documentation/virtual/lguest 8 "lguest" command found in the tools/lguest directory.
9 directory.
10 9
11 Note that "lguest" is pronounced to rhyme with "fell quest", 10 Note that "lguest" is pronounced to rhyme with "fell quest",
12 not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. 11 not "rustyvisor". See tools/lguest/lguest.txt.
13 12
14 If unsure, say N. If curious, say M. If masochistic, say Y. 13 If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
20#include <asm/asm-offsets.h> 20#include <asm/asm-offsets.h>
21#include "lg.h" 21#include "lg.h"
22 22
23 23unsigned long switcher_addr;
24struct page **lg_switcher_pages;
24static struct vm_struct *switcher_vma; 25static struct vm_struct *switcher_vma;
25static struct page **switcher_page;
26 26
27/* This One Big lock protects all inter-guest data structures. */ 27/* This One Big lock protects all inter-guest data structures. */
28DEFINE_MUTEX(lguest_lock); 28DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
52 * easy. 52 * easy.
53 */ 53 */
54 54
55 /* We assume Switcher text fits into a single page. */
56 if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
57 printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
58 end_switcher_text - start_switcher_text);
59 return -EINVAL;
60 }
61
55 /* 62 /*
56 * We allocate an array of struct page pointers. map_vm_area() wants 63 * We allocate an array of struct page pointers. map_vm_area() wants
57 * this, rather than just an array of pages. 64 * this, rather than just an array of pages.
58 */ 65 */
59 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 66 lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
60 GFP_KERNEL); 67 * TOTAL_SWITCHER_PAGES,
61 if (!switcher_page) { 68 GFP_KERNEL);
69 if (!lg_switcher_pages) {
62 err = -ENOMEM; 70 err = -ENOMEM;
63 goto out; 71 goto out;
64 } 72 }
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
68 * so we make sure they're zeroed. 76 * so we make sure they're zeroed.
69 */ 77 */
70 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 78 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
71 switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); 79 lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
72 if (!switcher_page[i]) { 80 if (!lg_switcher_pages[i]) {
73 err = -ENOMEM; 81 err = -ENOMEM;
74 goto free_some_pages; 82 goto free_some_pages;
75 } 83 }
76 } 84 }
77 85
78 /* 86 /*
79 * First we check that the Switcher won't overlap the fixmap area at 87 * We place the Switcher underneath the fixmap area, which is the
80 * the top of memory. It's currently nowhere near, but it could have 88 * highest virtual address we can get. This is important, since we
81 * very strange effects if it ever happened. 89 * tell the Guest it can't access this memory, so we want its ceiling
90 * as high as possible.
82 */ 91 */
83 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 92 switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
84 err = -ENOMEM;
85 printk("lguest: mapping switcher would thwack fixmap\n");
86 goto free_pages;
87 }
88 93
89 /* 94 /*
90 * Now we reserve the "virtual memory area" we want: 0xFFC00000 95 * Now we reserve the "virtual memory area" we want. We might
91 * (SWITCHER_ADDR). We might not get it in theory, but in practice 96 * not get it in theory, but in practice it's worked so far.
92 * it's worked so far. The end address needs +1 because __get_vm_area 97 * The end address needs +1 because __get_vm_area allocates an
93 * allocates an extra guard page, so we need space for that. 98 * extra guard page, so we need space for that.
94 */ 99 */
95 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 100 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
96 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 101 VM_ALLOC, switcher_addr, switcher_addr
97 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); 102 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
98 if (!switcher_vma) { 103 if (!switcher_vma) {
99 err = -ENOMEM; 104 err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
103 108
104 /* 109 /*
105 * This code actually sets up the pages we've allocated to appear at 110 * This code actually sets up the pages we've allocated to appear at
106 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 111 * switcher_addr. map_vm_area() takes the vma we allocated above, the
107 * kind of pages we're mapping (kernel pages), and a pointer to our 112 * kind of pages we're mapping (kernel pages), and a pointer to our
108 * array of struct pages. It increments that pointer, but we don't 113 * array of struct pages. It increments that pointer, but we don't
109 * care. 114 * care.
110 */ 115 */
111 pagep = switcher_page; 116 pagep = lg_switcher_pages;
112 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 117 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
113 if (err) { 118 if (err) {
114 printk("lguest: map_vm_area failed: %i\n", err); 119 printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
133 i = TOTAL_SWITCHER_PAGES; 138 i = TOTAL_SWITCHER_PAGES;
134free_some_pages: 139free_some_pages:
135 for (--i; i >= 0; i--) 140 for (--i; i >= 0; i--)
136 __free_pages(switcher_page[i], 0); 141 __free_pages(lg_switcher_pages[i], 0);
137 kfree(switcher_page); 142 kfree(lg_switcher_pages);
138out: 143out:
139 return err; 144 return err;
140} 145}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
149 vunmap(switcher_vma->addr); 154 vunmap(switcher_vma->addr);
150 /* Now we just need to free the pages we copied the switcher into */ 155 /* Now we just need to free the pages we copied the switcher into */
151 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 156 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
152 __free_pages(switcher_page[i], 0); 157 __free_pages(lg_switcher_pages[i], 0);
153 kfree(switcher_page); 158 kfree(lg_switcher_pages);
154} 159}
155 160
156/*H:032 161/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
323 if (err) 328 if (err)
324 goto out; 329 goto out;
325 330
326 /* Now we set up the pagetable implementation for the Guests. */
327 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
328 if (err)
329 goto unmap;
330
331 /* We might need to reserve an interrupt vector. */ 331 /* We might need to reserve an interrupt vector. */
332 err = init_interrupts(); 332 err = init_interrupts();
333 if (err) 333 if (err)
334 goto free_pgtables; 334 goto unmap;
335 335
336 /* /dev/lguest needs to be registered. */ 336 /* /dev/lguest needs to be registered. */
337 err = lguest_device_init(); 337 err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
346 346
347free_interrupts: 347free_interrupts:
348 free_interrupts(); 348 free_interrupts();
349free_pgtables:
350 free_pagetables();
351unmap: 349unmap:
352 unmap_switcher(); 350 unmap_switcher();
353out: 351out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
359{ 357{
360 lguest_device_remove(); 358 lguest_device_remove();
361 free_interrupts(); 359 free_interrupts();
362 free_pagetables();
363 unmap_switcher(); 360 unmap_switcher();
364 361
365 lguest_arch_host_fini(); 362 lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
14 14
15#include <asm/lguest.h> 15#include <asm/lguest.h>
16 16
17void free_pagetables(void);
18int init_pagetables(struct page **switcher_page, unsigned int pages);
19
20struct pgdir { 17struct pgdir {
21 unsigned long gpgdir; 18 unsigned long gpgdir;
19 bool switcher_mapped;
20 int last_host_cpu;
22 pgd_t *pgdir; 21 pgd_t *pgdir;
23}; 22};
24 23
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
124 unsigned long addr, unsigned long len); 123 unsigned long addr, unsigned long len);
125void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 124void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
126void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 125void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
126extern struct page **lg_switcher_pages;
127 127
128/*H:035 128/*H:035
129 * Using memory-copy operations like that is usually inconvient, so we 129 * Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
250 */ 250 */
251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
252{ 252{
253 /* We have a limited number the number of CPUs in the lguest struct. */ 253 /* We have a limited number of CPUs in the lguest struct. */
254 if (id >= ARRAY_SIZE(cpu->lg->cpus)) 254 if (id >= ARRAY_SIZE(cpu->lg->cpus))
255 return -EINVAL; 255 return -EINVAL;
256 256
257 /* Set up this CPU's id, and pointer back to the lguest struct. */ 257 /* Set up this CPU's id, and pointer back to the lguest struct. */
258 cpu->id = id; 258 cpu->id = id;
259 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); 259 cpu->lg = container_of(cpu, struct lguest, cpus[id]);
260 cpu->lg->nr_cpus++; 260 cpu->lg->nr_cpus++;
261 261
262 /* Each CPU has a timer it can set. */ 262 /* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
270 if (!cpu->regs_page) 270 if (!cpu->regs_page)
271 return -ENOMEM; 271 return -ENOMEM;
272 272
273 /* We actually put the registers at the bottom of the page. */ 273 /* We actually put the registers at the end of the page. */
274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
275 275
276 /* 276 /*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
7 * converted Guest pages when running the Guest. 7 * converted Guest pages when running the Guest.
8:*/ 8:*/
9 9
10/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2013.
11 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/gfp.h> 13#include <linux/gfp.h>
@@ -62,22 +62,11 @@
62 * will need the last pmd entry of the last pmd page. 62 * will need the last pmd entry of the last pmd page.
63 */ 63 */
64#ifdef CONFIG_X86_PAE 64#ifdef CONFIG_X86_PAE
65#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
66#define RESERVE_MEM 2U
67#define CHECK_GPGD_MASK _PAGE_PRESENT 65#define CHECK_GPGD_MASK _PAGE_PRESENT
68#else 66#else
69#define RESERVE_MEM 4U
70#define CHECK_GPGD_MASK _PAGE_TABLE 67#define CHECK_GPGD_MASK _PAGE_TABLE
71#endif 68#endif
72 69
73/*
74 * We actually need a separate PTE page for each CPU. Remember that after the
75 * Switcher code itself comes two pages for each CPU, and we don't want this
76 * CPU's guest to see the pages of any other CPU.
77 */
78static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
80
81/*H:320 70/*H:320
82 * The page table code is curly enough to need helper functions to keep it 71 * The page table code is curly enough to need helper functions to keep it
83 * clear and clean. The kernel itself provides many of them; one advantage 72 * clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
95{ 84{
96 unsigned int index = pgd_index(vaddr); 85 unsigned int index = pgd_index(vaddr);
97 86
98#ifndef CONFIG_X86_PAE
99 /* We kill any Guest trying to touch the Switcher addresses. */
100 if (index >= SWITCHER_PGD_INDEX) {
101 kill_guest(cpu, "attempt to access switcher pages");
102 index = 0;
103 }
104#endif
105 /* Return a pointer index'th pgd entry for the i'th page table. */ 87 /* Return a pointer index'th pgd entry for the i'th page table. */
106 return &cpu->lg->pgdirs[i].pgdir[index]; 88 return &cpu->lg->pgdirs[i].pgdir[index];
107} 89}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
117 unsigned int index = pmd_index(vaddr); 99 unsigned int index = pmd_index(vaddr);
118 pmd_t *page; 100 pmd_t *page;
119 101
120 /* We kill any Guest trying to touch the Switcher addresses. */
121 if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
122 index >= SWITCHER_PMD_INDEX) {
123 kill_guest(cpu, "attempt to access switcher pages");
124 index = 0;
125 }
126
127 /* You should never call this if the PGD entry wasn't valid */ 102 /* You should never call this if the PGD entry wasn't valid */
128 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 103 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
129 page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 104 page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
275} 250}
276/*:*/ 251/*:*/
277 252
278static void check_gpte(struct lg_cpu *cpu, pte_t gpte) 253static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
279{ 254{
280 if ((pte_flags(gpte) & _PAGE_PSE) || 255 if ((pte_flags(gpte) & _PAGE_PSE) ||
281 pte_pfn(gpte) >= cpu->lg->pfn_limit) 256 pte_pfn(gpte) >= cpu->lg->pfn_limit) {
282 kill_guest(cpu, "bad page table entry"); 257 kill_guest(cpu, "bad page table entry");
258 return false;
259 }
260 return true;
283} 261}
284 262
285static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 263static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
286{ 264{
287 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || 265 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
288 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) 266 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
289 kill_guest(cpu, "bad page directory entry"); 267 kill_guest(cpu, "bad page directory entry");
268 return false;
269 }
270 return true;
290} 271}
291 272
292#ifdef CONFIG_X86_PAE 273#ifdef CONFIG_X86_PAE
293static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) 274static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
294{ 275{
295 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || 276 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
296 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) 277 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
297 kill_guest(cpu, "bad page middle directory entry"); 278 kill_guest(cpu, "bad page middle directory entry");
279 return false;
280 }
281 return true;
298} 282}
299#endif 283#endif
300 284
301/*H:330 285/*H:331
302 * (i) Looking up a page table entry when the Guest faults. 286 * This is the core routine to walk the shadow page tables and find the page
303 * 287 * table entry for a specific address.
304 * We saw this call in run_guest(): when we see a page fault in the Guest, we
305 * come here. That's because we only set up the shadow page tables lazily as
306 * they're needed, so we get page faults all the time and quietly fix them up
307 * and return to the Guest without it knowing.
308 * 288 *
309 * If we fixed up the fault (ie. we mapped the address), this routine returns 289 * If allocate is set, then we allocate any missing levels, setting the flags
310 * true. Otherwise, it was a real fault and we need to tell the Guest. 290 * on the new page directory and mid-level directories using the arguments
291 * (which are copied from the Guest's page table entries).
311 */ 292 */
312bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 293static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
294 int pgd_flags, int pmd_flags)
313{ 295{
314 pgd_t gpgd;
315 pgd_t *spgd; 296 pgd_t *spgd;
316 unsigned long gpte_ptr;
317 pte_t gpte;
318 pte_t *spte;
319
320 /* Mid level for PAE. */ 297 /* Mid level for PAE. */
321#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
322 pmd_t *spmd; 299 pmd_t *spmd;
323 pmd_t gpmd;
324#endif 300#endif
325 301
326 /* First step: get the top-level Guest page table entry. */ 302 /* Get top level entry. */
327 if (unlikely(cpu->linear_pages)) {
328 /* Faking up a linear mapping. */
329 gpgd = __pgd(CHECK_GPGD_MASK);
330 } else {
331 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
332 /* Toplevel not present? We can't map it in. */
333 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
334 return false;
335 }
336
337 /* Now look at the matching shadow entry. */
338 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 303 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
339 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 304 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
340 /* No shadow entry: allocate a new shadow PTE page. */ 305 /* No shadow entry: allocate a new shadow PTE page. */
341 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 306 unsigned long ptepage;
307
308 /* If they didn't want us to allocate anything, stop. */
309 if (!allocate)
310 return NULL;
311
312 ptepage = get_zeroed_page(GFP_KERNEL);
342 /* 313 /*
343 * This is not really the Guest's fault, but killing it is 314 * This is not really the Guest's fault, but killing it is
344 * simple for this corner case. 315 * simple for this corner case.
345 */ 316 */
346 if (!ptepage) { 317 if (!ptepage) {
347 kill_guest(cpu, "out of memory allocating pte page"); 318 kill_guest(cpu, "out of memory allocating pte page");
348 return false; 319 return NULL;
349 } 320 }
350 /* We check that the Guest pgd is OK. */
351 check_gpgd(cpu, gpgd);
352 /* 321 /*
353 * And we copy the flags to the shadow PGD entry. The page 322 * And we copy the flags to the shadow PGD entry. The page
354 * number in the shadow PGD is the page we just allocated. 323 * number in the shadow PGD is the page we just allocated.
355 */ 324 */
356 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 325 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
357 } 326 }
358 327
328 /*
329 * Intel's Physical Address Extension actually uses three levels of
330 * page tables, so we need to look in the mid-level.
331 */
359#ifdef CONFIG_X86_PAE 332#ifdef CONFIG_X86_PAE
360 if (unlikely(cpu->linear_pages)) { 333 /* Now look at the mid-level shadow entry. */
361 /* Faking up a linear mapping. */
362 gpmd = __pmd(_PAGE_TABLE);
363 } else {
364 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
365 /* Middle level not present? We can't map it in. */
366 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
367 return false;
368 }
369
370 /* Now look at the matching shadow entry. */
371 spmd = spmd_addr(cpu, *spgd, vaddr); 334 spmd = spmd_addr(cpu, *spgd, vaddr);
372 335
373 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { 336 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
374 /* No shadow entry: allocate a new shadow PTE page. */ 337 /* No shadow entry: allocate a new shadow PTE page. */
375 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 338 unsigned long ptepage;
339
340 /* If they didn't want us to allocate anything, stop. */
341 if (!allocate)
342 return NULL;
343
344 ptepage = get_zeroed_page(GFP_KERNEL);
376 345
377 /* 346 /*
378 * This is not really the Guest's fault, but killing it is 347 * This is not really the Guest's fault, but killing it is
379 * simple for this corner case. 348 * simple for this corner case.
380 */ 349 */
381 if (!ptepage) { 350 if (!ptepage) {
382 kill_guest(cpu, "out of memory allocating pte page"); 351 kill_guest(cpu, "out of memory allocating pmd page");
383 return false; 352 return NULL;
384 } 353 }
385 354
386 /* We check that the Guest pmd is OK. */
387 check_gpmd(cpu, gpmd);
388
389 /* 355 /*
390 * And we copy the flags to the shadow PMD entry. The page 356 * And we copy the flags to the shadow PMD entry. The page
391 * number in the shadow PMD is the page we just allocated. 357 * number in the shadow PMD is the page we just allocated.
392 */ 358 */
393 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 359 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
360 }
361#endif
362
363 /* Get the pointer to the shadow PTE entry we're going to set. */
364 return spte_addr(cpu, *spgd, vaddr);
365}
366
367/*H:330
368 * (i) Looking up a page table entry when the Guest faults.
369 *
370 * We saw this call in run_guest(): when we see a page fault in the Guest, we
371 * come here. That's because we only set up the shadow page tables lazily as
372 * they're needed, so we get page faults all the time and quietly fix them up
373 * and return to the Guest without it knowing.
374 *
375 * If we fixed up the fault (ie. we mapped the address), this routine returns
376 * true. Otherwise, it was a real fault and we need to tell the Guest.
377 */
378bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
379{
380 unsigned long gpte_ptr;
381 pte_t gpte;
382 pte_t *spte;
383 pmd_t gpmd;
384 pgd_t gpgd;
385
386 /* We never demand page the Switcher, so trying is a mistake. */
387 if (vaddr >= switcher_addr)
388 return false;
389
390 /* First step: get the top-level Guest page table entry. */
391 if (unlikely(cpu->linear_pages)) {
392 /* Faking up a linear mapping. */
393 gpgd = __pgd(CHECK_GPGD_MASK);
394 } else {
395 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
396 /* Toplevel not present? We can't map it in. */
397 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
398 return false;
399
400 /*
401 * This kills the Guest if it has weird flags or tries to
402 * refer to a "physical" address outside the bounds.
403 */
404 if (!check_gpgd(cpu, gpgd))
405 return false;
406 }
407
408 /* This "mid-level" entry is only used for non-linear, PAE mode. */
409 gpmd = __pmd(_PAGE_TABLE);
410
411#ifdef CONFIG_X86_PAE
412 if (likely(!cpu->linear_pages)) {
413 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
414 /* Middle level not present? We can't map it in. */
415 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
416 return false;
417
418 /*
419 * This kills the Guest if it has weird flags or tries to
420 * refer to a "physical" address outside the bounds.
421 */
422 if (!check_gpmd(cpu, gpmd))
423 return false;
394 } 424 }
395 425
396 /* 426 /*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
433 * Check that the Guest PTE flags are OK, and the page number is below 463 * Check that the Guest PTE flags are OK, and the page number is below
434 * the pfn_limit (ie. not mapping the Launcher binary). 464 * the pfn_limit (ie. not mapping the Launcher binary).
435 */ 465 */
436 check_gpte(cpu, gpte); 466 if (!check_gpte(cpu, gpte))
467 return false;
437 468
438 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 469 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
439 gpte = pte_mkyoung(gpte); 470 gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
441 gpte = pte_mkdirty(gpte); 472 gpte = pte_mkdirty(gpte);
442 473
443 /* Get the pointer to the shadow PTE entry we're going to set. */ 474 /* Get the pointer to the shadow PTE entry we're going to set. */
444 spte = spte_addr(cpu, *spgd, vaddr); 475 spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
476 if (!spte)
477 return false;
445 478
446 /* 479 /*
447 * If there was a valid shadow PTE entry here before, we release it. 480 * If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
493 */ 526 */
494static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 527static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
495{ 528{
496 pgd_t *spgd; 529 pte_t *spte;
497 unsigned long flags; 530 unsigned long flags;
498 531
499#ifdef CONFIG_X86_PAE 532 /* You can't put your stack in the Switcher! */
500 pmd_t *spmd; 533 if (vaddr >= switcher_addr)
501#endif
502 /* Look at the current top level entry: is it present? */
503 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
504 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
505 return false; 534 return false;
506 535
507#ifdef CONFIG_X86_PAE 536 /* If there's no shadow PTE, it's not writable. */
508 spmd = spmd_addr(cpu, *spgd, vaddr); 537 spte = find_spte(cpu, vaddr, false, 0, 0);
509 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) 538 if (!spte)
510 return false; 539 return false;
511#endif
512 540
513 /* 541 /*
514 * Check the flags on the pte entry itself: it must be present and 542 * Check the flags on the pte entry itself: it must be present and
515 * writable. 543 * writable.
516 */ 544 */
517 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 545 flags = pte_flags(*spte);
518
519 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 546 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
520} 547}
521 548
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
678 int *blank_pgdir) 705 int *blank_pgdir)
679{ 706{
680 unsigned int next; 707 unsigned int next;
681#ifdef CONFIG_X86_PAE
682 pmd_t *pmd_table;
683#endif
684 708
685 /* 709 /*
686 * We pick one entry at random to throw out. Choosing the Least 710 * We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
695 if (!cpu->lg->pgdirs[next].pgdir) 719 if (!cpu->lg->pgdirs[next].pgdir)
696 next = cpu->cpu_pgd; 720 next = cpu->cpu_pgd;
697 else { 721 else {
698#ifdef CONFIG_X86_PAE
699 /* 722 /*
700 * In PAE mode, allocate a pmd page and populate the 723 * This is a blank page, so there are no kernel
701 * last pgd entry. 724 * mappings: caller must map the stack!
702 */ 725 */
703 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
704 if (!pmd_table) {
705 free_page((long)cpu->lg->pgdirs[next].pgdir);
706 set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
707 next = cpu->cpu_pgd;
708 } else {
709 set_pgd(cpu->lg->pgdirs[next].pgdir +
710 SWITCHER_PGD_INDEX,
711 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
712 /*
713 * This is a blank page, so there are no kernel
714 * mappings: caller must map the stack!
715 */
716 *blank_pgdir = 1;
717 }
718#else
719 *blank_pgdir = 1; 726 *blank_pgdir = 1;
720#endif
721 } 727 }
722 } 728 }
723 /* Record which Guest toplevel this shadows. */ 729 /* Record which Guest toplevel this shadows. */
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
725 /* Release all the non-kernel mappings. */ 731 /* Release all the non-kernel mappings. */
726 flush_user_mappings(cpu->lg, next); 732 flush_user_mappings(cpu->lg, next);
727 733
734 /* This hasn't run on any CPU at all. */
735 cpu->lg->pgdirs[next].last_host_cpu = -1;
736
728 return next; 737 return next;
729} 738}
730 739
740/*H:501
741 * We do need the Switcher code mapped at all times, so we allocate that
742 * part of the Guest page table here. We map the Switcher code immediately,
743 * but defer mapping of the guest register page and IDT/LDT etc page until
744 * just before we run the guest in map_switcher_in_guest().
745 *
746 * We *could* do this setup in map_switcher_in_guest(), but at that point
747 * we've interrupts disabled, and allocating pages like that is fraught: we
748 * can't sleep if we need to free up some memory.
749 */
750static bool allocate_switcher_mapping(struct lg_cpu *cpu)
751{
752 int i;
753
754 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
755 pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
756 CHECK_GPGD_MASK, _PAGE_TABLE);
757 if (!pte)
758 return false;
759
760 /*
761 * Map the switcher page if not already there. It might
762 * already be there because we call allocate_switcher_mapping()
763 * in guest_set_pgd() just in case it did discard our Switcher
764 * mapping, but it probably didn't.
765 */
766 if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
767 /* Get a reference to the Switcher page. */
768 get_page(lg_switcher_pages[0]);
769 /* Create a read-only, exectuable, kernel-style PTE */
770 set_pte(pte,
771 mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
772 }
773 }
774 cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
775 return true;
776}
777
731/*H:470 778/*H:470
732 * Finally, a routine which throws away everything: all PGD entries in all 779 * Finally, a routine which throws away everything: all PGD entries in all
733 * the shadow page tables, including the Guest's kernel mappings. This is used 780 * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
738 unsigned int i, j; 785 unsigned int i, j;
739 786
740 /* Every shadow pagetable this Guest has */ 787 /* Every shadow pagetable this Guest has */
741 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 788 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
742 if (lg->pgdirs[i].pgdir) { 789 if (!lg->pgdirs[i].pgdir)
743#ifdef CONFIG_X86_PAE 790 continue;
744 pgd_t *spgd; 791
745 pmd_t *pmdpage; 792 /* Every PGD entry. */
746 unsigned int k; 793 for (j = 0; j < PTRS_PER_PGD; j++)
747 794 release_pgd(lg->pgdirs[i].pgdir + j);
748 /* Get the last pmd page. */ 795 lg->pgdirs[i].switcher_mapped = false;
749 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 796 lg->pgdirs[i].last_host_cpu = -1;
750 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 797 }
751
752 /*
753 * And release the pmd entries of that pmd page,
754 * except for the switcher pmd.
755 */
756 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
757 release_pmd(&pmdpage[k]);
758#endif
759 /* Every PGD entry except the Switcher at the top */
760 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
761 release_pgd(lg->pgdirs[i].pgdir + j);
762 }
763} 798}
764 799
765/* 800/*
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
773 release_all_pagetables(cpu->lg); 808 release_all_pagetables(cpu->lg);
774 /* We need the Guest kernel stack mapped again. */ 809 /* We need the Guest kernel stack mapped again. */
775 pin_stack_pages(cpu); 810 pin_stack_pages(cpu);
811 /* And we need Switcher allocated. */
812 if (!allocate_switcher_mapping(cpu))
813 kill_guest(cpu, "Cannot populate switcher mapping");
776} 814}
777 815
778/*H:430 816/*H:430
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
808 newpgdir = new_pgdir(cpu, pgtable, &repin); 846 newpgdir = new_pgdir(cpu, pgtable, &repin);
809 /* Change the current pgd index to the new one. */ 847 /* Change the current pgd index to the new one. */
810 cpu->cpu_pgd = newpgdir; 848 cpu->cpu_pgd = newpgdir;
811 /* If it was completely blank, we map in the Guest kernel stack */ 849 /*
850 * If it was completely blank, we map in the Guest kernel stack and
851 * the Switcher.
852 */
812 if (repin) 853 if (repin)
813 pin_stack_pages(cpu); 854 pin_stack_pages(cpu);
855
856 if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
857 if (!allocate_switcher_mapping(cpu))
858 kill_guest(cpu, "Cannot populate switcher mapping");
859 }
814} 860}
815/*:*/ 861/*:*/
816 862
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
865 * micro-benchmark. 911 * micro-benchmark.
866 */ 912 */
867 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 913 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
868 check_gpte(cpu, gpte); 914 if (!check_gpte(cpu, gpte))
915 return;
869 set_pte(spte, 916 set_pte(spte,
870 gpte_to_spte(cpu, gpte, 917 gpte_to_spte(cpu, gpte,
871 pte_flags(gpte) & _PAGE_DIRTY)); 918 pte_flags(gpte) & _PAGE_DIRTY));
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
897void guest_set_pte(struct lg_cpu *cpu, 944void guest_set_pte(struct lg_cpu *cpu,
898 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 945 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
899{ 946{
947 /* We don't let you remap the Switcher; we need it to get back! */
948 if (vaddr >= switcher_addr) {
949 kill_guest(cpu, "attempt to set pte into Switcher pages");
950 return;
951 }
952
900 /* 953 /*
901 * Kernel mappings must be changed on all top levels. Slow, but doesn't 954 * Kernel mappings must be changed on all top levels. Slow, but doesn't
902 * happen often. 955 * happen often.
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
933{ 986{
934 int pgdir; 987 int pgdir;
935 988
936 if (idx >= SWITCHER_PGD_INDEX) 989 if (idx > PTRS_PER_PGD) {
990 kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
991 idx, PTRS_PER_PGD);
937 return; 992 return;
993 }
938 994
939 /* If they're talking about a page table we have a shadow for... */ 995 /* If they're talking about a page table we have a shadow for... */
940 pgdir = find_pgdir(lg, gpgdir); 996 pgdir = find_pgdir(lg, gpgdir);
941 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 997 if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
942 /* ... throw it away. */ 998 /* ... throw it away. */
943 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 999 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
1000 /* That might have been the Switcher mapping, remap it. */
1001 if (!allocate_switcher_mapping(&lg->cpus[0])) {
1002 kill_guest(&lg->cpus[0],
1003 "Cannot populate switcher mapping");
1004 }
1005 }
944} 1006}
945 1007
946#ifdef CONFIG_X86_PAE 1008#ifdef CONFIG_X86_PAE
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
958 * we will populate on future faults. The Guest doesn't have any actual 1020 * we will populate on future faults. The Guest doesn't have any actual
959 * pagetables yet, so we set linear_pages to tell demand_page() to fake it 1021 * pagetables yet, so we set linear_pages to tell demand_page() to fake it
960 * for the moment. 1022 * for the moment.
1023 *
1024 * We do need the Switcher to be mapped at all times, so we allocate that
1025 * part of the Guest page table here.
961 */ 1026 */
962int init_guest_pagetable(struct lguest *lg) 1027int init_guest_pagetable(struct lguest *lg)
963{ 1028{
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
971 1036
972 /* We start with a linear mapping until the Guest initializes. */ 1037 /* We start with a linear mapping until the Guest initializes. */
973 cpu->linear_pages = true; 1038 cpu->linear_pages = true;
1039
1040 /* Allocate the page tables for the Switcher. */
1041 if (!allocate_switcher_mapping(cpu)) {
1042 release_all_pagetables(lg);
1043 return -ENOMEM;
1044 }
1045
974 return 0; 1046 return 0;
975} 1047}
976 1048
977/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1049/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
978void page_table_guest_data_init(struct lg_cpu *cpu) 1050void page_table_guest_data_init(struct lg_cpu *cpu)
979{ 1051{
1052 /*
1053 * We tell the Guest that it can't use the virtual addresses
1054 * used by the Switcher. This trick is equivalent to 4GB -
1055 * switcher_addr.
1056 */
1057 u32 top = ~switcher_addr + 1;
1058
980 /* We get the kernel address: above this is all kernel memory. */ 1059 /* We get the kernel address: above this is all kernel memory. */
981 if (get_user(cpu->lg->kernel_address, 1060 if (get_user(cpu->lg->kernel_address,
982 &cpu->lg->lguest_data->kernel_address) 1061 &cpu->lg->lguest_data->kernel_address)
983 /* 1062 /*
984 * We tell the Guest that it can't use the top 2 or 4 MB 1063 * We tell the Guest that it can't use the top virtual
985 * of virtual addresses used by the Switcher. 1064 * addresses (used by the Switcher).
986 */ 1065 */
987 || put_user(RESERVE_MEM * 1024 * 1024, 1066 || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
988 &cpu->lg->lguest_data->reserve_mem)) {
989 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1067 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
990 return; 1068 return;
991 } 1069 }
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
995 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1073 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
996 * Switcher mappings, so check that now. 1074 * Switcher mappings, so check that now.
997 */ 1075 */
998#ifdef CONFIG_X86_PAE 1076 if (cpu->lg->kernel_address >= switcher_addr)
999 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
1000 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
1001#else
1002 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
1003#endif
1004 kill_guest(cpu, "bad kernel address %#lx", 1077 kill_guest(cpu, "bad kernel address %#lx",
1005 cpu->lg->kernel_address); 1078 cpu->lg->kernel_address);
1006} 1079}
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
1017 free_page((long)lg->pgdirs[i].pgdir); 1090 free_page((long)lg->pgdirs[i].pgdir);
1018} 1091}
1019 1092
1020/*H:480 1093/*H:481
1021 * (vi) Mapping the Switcher when the Guest is about to run. 1094 * This clears the Switcher mappings for cpu #i.
1022 *
1023 * The Switcher and the two pages for this CPU need to be visible in the
1024 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
1025 * for each CPU already set up, we just need to hook them in now we know which
1026 * Guest is about to run on this CPU.
1027 */ 1095 */
1028void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1096static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
1029{ 1097{
1030 pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); 1098 unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
1031 pte_t regs_pte; 1099 pte_t *pte;
1032 1100
1033#ifdef CONFIG_X86_PAE 1101 /* Clear the mappings for both pages. */
1034 pmd_t switcher_pmd; 1102 pte = find_spte(cpu, base, false, 0, 0);
1035 pmd_t *pmd_table; 1103 release_pte(*pte);
1036 1104 set_pte(pte, __pte(0));
1037 switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
1038 PAGE_KERNEL_EXEC);
1039
1040 /* Figure out where the pmd page is, by reading the PGD, and converting
1041 * it to a virtual address. */
1042 pmd_table = __va(pgd_pfn(cpu->lg->
1043 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
1044 << PAGE_SHIFT);
1045 /* Now write it into the shadow page table. */
1046 set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
1047#else
1048 pgd_t switcher_pgd;
1049 1105
1050 /* 1106 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1051 * Make the last PGD entry for this Guest point to the Switcher's PTE 1107 release_pte(*pte);
1052 * page for this CPU (with appropriate flags). 1108 set_pte(pte, __pte(0));
1053 */
1054 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
1055
1056 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
1057
1058#endif
1059 /*
1060 * We also change the Switcher PTE page. When we're running the Guest,
1061 * we want the Guest's "regs" page to appear where the first Switcher
1062 * page for this CPU is. This is an optimization: when the Switcher
1063 * saves the Guest registers, it saves them into the first page of this
1064 * CPU's "struct lguest_pages": if we make sure the Guest's register
1065 * page is already mapped there, we don't have to copy them out
1066 * again.
1067 */
1068 regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
1069 set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
1070} 1109}
1071/*:*/
1072 1110
1073static void free_switcher_pte_pages(void) 1111/*H:480
1074{ 1112 * (vi) Mapping the Switcher when the Guest is about to run.
1075 unsigned int i; 1113 *
1076 1114 * The Switcher and the two pages for this CPU need to be visible in the Guest
1077 for_each_possible_cpu(i) 1115 * (and not the pages for other CPUs).
1078 free_page((long)switcher_pte_page(i));
1079}
1080
1081/*H:520
1082 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1083 * the CPU number and the "struct page"s for the Switcher code itself.
1084 * 1116 *
1085 * Currently the Switcher is less than a page long, so "pages" is always 1. 1117 * The pages for the pagetables have all been allocated before: we just need
1118 * to make sure the actual PTEs are up-to-date for the CPU we're about to run
1119 * on.
1086 */ 1120 */
1087static __init void populate_switcher_pte_page(unsigned int cpu, 1121void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
1088 struct page *switcher_page[],
1089 unsigned int pages)
1090{ 1122{
1091 unsigned int i; 1123 unsigned long base;
1092 pte_t *pte = switcher_pte_page(cpu); 1124 struct page *percpu_switcher_page, *regs_page;
1125 pte_t *pte;
1126 struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
1127
1128 /* Switcher page should always be mapped by now! */
1129 BUG_ON(!pgdir->switcher_mapped);
1130
1131 /*
1132 * Remember that we have two pages for each Host CPU, so we can run a
1133 * Guest on each CPU without them interfering. We need to make sure
1134 * those pages are mapped correctly in the Guest, but since we usually
1135 * run on the same CPU, we cache that, and only update the mappings
1136 * when we move.
1137 */
1138 if (pgdir->last_host_cpu == raw_smp_processor_id())
1139 return;
1093 1140
1094 /* The first entries are easy: they map the Switcher code. */ 1141 /* -1 means unknown so we remove everything. */
1095 for (i = 0; i < pages; i++) { 1142 if (pgdir->last_host_cpu == -1) {
1096 set_pte(&pte[i], mk_pte(switcher_page[i], 1143 unsigned int i;
1097 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1144 for_each_possible_cpu(i)
1145 remove_switcher_percpu_map(cpu, i);
1146 } else {
1147 /* We know exactly what CPU mapping to remove. */
1148 remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
1098 } 1149 }
1099 1150
1100 /* The only other thing we map is this CPU's pair of pages. */ 1151 /*
1101 i = pages + cpu*2; 1152 * When we're running the Guest, we want the Guest's "regs" page to
1102 1153 * appear where the first Switcher page for this CPU is. This is an
1103 /* First page (Guest registers) is writable from the Guest */ 1154 * optimization: when the Switcher saves the Guest registers, it saves
1104 set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1155 * them into the first page of this CPU's "struct lguest_pages": if we
1105 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1156 * make sure the Guest's register page is already mapped there, we
1157 * don't have to copy them out again.
1158 */
1159 /* Find the shadow PTE for this regs page. */
1160 base = switcher_addr + PAGE_SIZE
1161 + raw_smp_processor_id() * sizeof(struct lguest_pages);
1162 pte = find_spte(cpu, base, false, 0, 0);
1163 regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
1164 get_page(regs_page);
1165 set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
1106 1166
1107 /* 1167 /*
1108 * The second page contains the "struct lguest_ro_state", and is 1168 * We map the second page of the struct lguest_pages read-only in
1109 * read-only. 1169 * the Guest: the IDT, GDT and other things it's not supposed to
1170 * change.
1110 */ 1171 */
1111 set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1172 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1112 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1173 percpu_switcher_page
1174 = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
1175 get_page(percpu_switcher_page);
1176 set_pte(pte, mk_pte(percpu_switcher_page,
1177 __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
1178
1179 pgdir->last_host_cpu = raw_smp_processor_id();
1113} 1180}
1114 1181
1115/* 1182/*H:490
1116 * We've made it through the page table code. Perhaps our tired brains are 1183 * We've made it through the page table code. Perhaps our tired brains are
1117 * still processing the details, or perhaps we're simply glad it's over. 1184 * still processing the details, or perhaps we're simply glad it's over.
1118 * 1185 *
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1124 * 1191 *
1125 * There is just one file remaining in the Host. 1192 * There is just one file remaining in the Host.
1126 */ 1193 */
1127
1128/*H:510
1129 * At boot or module load time, init_pagetables() allocates and populates
1130 * the Switcher PTE page for each CPU.
1131 */
1132__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1133{
1134 unsigned int i;
1135
1136 for_each_possible_cpu(i) {
1137 switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
1138 if (!switcher_pte_page(i)) {
1139 free_switcher_pte_pages();
1140 return -ENOMEM;
1141 }
1142 populate_switcher_pte_page(i, switcher_page, pages);
1143 }
1144 return 0;
1145}
1146/*:*/
1147
1148/* Cleaning up simply involves freeing the PTE page for each CPU. */
1149void free_pagetables(void)
1150{
1151 free_switcher_pte_pages();
1152}
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
59/* Offset from where switcher.S was compiled to where we've copied it */ 59/* Offset from where switcher.S was compiled to where we've copied it */
60static unsigned long switcher_offset(void) 60static unsigned long switcher_offset(void)
61{ 61{
62 return SWITCHER_ADDR - (unsigned long)start_switcher_text; 62 return switcher_addr - (unsigned long)start_switcher_text;
63} 63}
64 64
65/* This cpu's struct lguest_pages. */ 65/* This cpu's struct lguest_pages (after the Switcher text page) */
66static struct lguest_pages *lguest_pages(unsigned int cpu) 66static struct lguest_pages *lguest_pages(unsigned int cpu)
67{ 67{
68 return &(((struct lguest_pages *) 68 return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
69 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
70} 69}
71 70
72static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); 71static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);