| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 17:14:04 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 17:14:04 -0400 |
| commit | 736a2dd2571ac56b11ed95a7814d838d5311be04 (patch) | |
| tree | de10d107025970c6e51d5b6faeba799ed4b9caae /drivers/lguest | |
| parent | 0b2e3b6bb4a415379f16e38fc92db42379be47a1 (diff) | |
| parent | 01d779a14ef800b74684d9692add4944df052461 (diff) | |
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
"Lots of virtio work which wasn't quite ready for last merge window.
Plus I dived into lguest again, reworking the pagetable code so we can
move the switcher page: our fixmaps sometimes take more than 2MB now..."
Ugh. Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.
* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
caif_virtio: Remove bouncing email addresses
lguest: improve code readability in lg_cpu_start.
virtio-net: fill only rx queues which are being used
lguest: map Switcher below fixmap.
lguest: cache last cpu we ran on.
lguest: map Switcher text whenever we allocate a new pagetable.
lguest: don't share Switcher PTE pages between guests.
lguest: expost switcher_pages array (as lg_switcher_pages).
lguest: extract shadow PTE walking / allocating.
lguest: make check_gpte et. al return bool.
lguest: assume Switcher text is a single page.
lguest: rename switcher_page to switcher_pages.
lguest: remove RESERVE_MEM constant.
lguest: check vaddr not pgd for Switcher protection.
lguest: prepare to make SWITCHER_ADDR a variable.
virtio: console: replace EMFILE with EBUSY for already-open port
virtio-scsi: reset virtqueue affinity when doing cpu hotplug
virtio-scsi: introduce multiqueue support
virtio-scsi: push vq lock/unlock into virtscsi_vq_done
virtio-scsi: pass struct virtio_scsi to virtqueue completion function
...
Diffstat (limited to 'drivers/lguest')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/lguest/Kconfig | 5 |
| -rw-r--r-- | drivers/lguest/core.c | 67 |
| -rw-r--r-- | drivers/lguest/lg.h | 6 |
| -rw-r--r-- | drivers/lguest/lguest_user.c | 6 |
| -rw-r--r-- | drivers/lguest/page_tables.c | 567 |
| -rw-r--r-- | drivers/lguest/x86/core.c | 7 |
6 files changed, 347 insertions, 311 deletions
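On the lguest side, the headline change in this series is that the Switcher is no longer pinned at a fixed SWITCHER_ADDR: map_switcher() in core.c now computes switcher_addr at load time, directly below the fixmap, with one page of Switcher text followed by two pages per host CPU, plus the guard page that __get_vm_area() adds. A minimal userspace sketch of that address arithmetic follows; PAGE_SIZE, FIXADDR_START and the CPU count are assumed example values, not the kernel's.

```c
/* Illustration only: not lguest code. Sample values stand in for the
 * kernel's PAGE_SIZE, FIXADDR_START and possible-CPU count.
 */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define FIXADDR_START  0xffc00000UL    /* example value for a 32-bit host */
#define NR_CPUS        4               /* example host CPU count */

/* One page of Switcher text plus two pages (regs + read-only state) per CPU. */
#define TOTAL_SWITCHER_PAGES (1 + 2 * NR_CPUS)

int main(void)
{
	/* Place the Switcher directly below the fixmap; the extra page is
	 * the guard page that __get_vm_area() allocates in the real code. */
	unsigned long switcher_addr =
		FIXADDR_START - (TOTAL_SWITCHER_PAGES + 1) * PAGE_SIZE;
	unsigned int cpu;

	printf("switcher text page: %#lx\n", switcher_addr);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %u pages:       %#lx\n", cpu,
		       switcher_addr + PAGE_SIZE + cpu * 2 * PAGE_SIZE);
	return 0;
}
```

This is the same layout that allocate_switcher_mapping() and map_switcher_in_guest() walk later in the page_tables.c diff.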
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST | |||
5 | ---help--- | 5 | ---help--- |
6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
7 | multiple instances of the same Linux kernel, using the | 7 | multiple instances of the same Linux kernel, using the |
8 | "lguest" command found in the Documentation/virtual/lguest | 8 | "lguest" command found in the tools/lguest directory. |
9 | directory. | ||
10 | 9 | ||
11 | Note that "lguest" is pronounced to rhyme with "fell quest", | 10 | Note that "lguest" is pronounced to rhyme with "fell quest", |
12 | not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. | 11 | not "rustyvisor". See tools/lguest/lguest.txt. |
13 | 12 | ||
14 | If unsure, say N. If curious, say M. If masochistic, say Y. | 13 | If unsure, say N. If curious, say M. If masochistic, say Y. |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@ | |||
20 | #include <asm/asm-offsets.h> | 20 | #include <asm/asm-offsets.h> |
21 | #include "lg.h" | 21 | #include "lg.h" |
22 | 22 | ||
23 | 23 | unsigned long switcher_addr; | |
24 | struct page **lg_switcher_pages; | ||
24 | static struct vm_struct *switcher_vma; | 25 | static struct vm_struct *switcher_vma; |
25 | static struct page **switcher_page; | ||
26 | 26 | ||
27 | /* This One Big lock protects all inter-guest data structures. */ | 27 | /* This One Big lock protects all inter-guest data structures. */ |
28 | DEFINE_MUTEX(lguest_lock); | 28 | DEFINE_MUTEX(lguest_lock); |
@@ -52,13 +52,21 @@ static __init int map_switcher(void) | |||
52 | * easy. | 52 | * easy. |
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* We assume Switcher text fits into a single page. */ | ||
56 | if (end_switcher_text - start_switcher_text > PAGE_SIZE) { | ||
57 | printk(KERN_ERR "lguest: switcher text too large (%zu)\n", | ||
58 | end_switcher_text - start_switcher_text); | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | |||
55 | /* | 62 | /* |
56 | * We allocate an array of struct page pointers. map_vm_area() wants | 63 | * We allocate an array of struct page pointers. map_vm_area() wants |
57 | * this, rather than just an array of pages. | 64 | * this, rather than just an array of pages. |
58 | */ | 65 | */ |
59 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | 66 | lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) |
60 | GFP_KERNEL); | 67 | * TOTAL_SWITCHER_PAGES, |
61 | if (!switcher_page) { | 68 | GFP_KERNEL); |
69 | if (!lg_switcher_pages) { | ||
62 | err = -ENOMEM; | 70 | err = -ENOMEM; |
63 | goto out; | 71 | goto out; |
64 | } | 72 | } |
@@ -68,32 +76,29 @@ static __init int map_switcher(void) | |||
68 | * so we make sure they're zeroed. | 76 | * so we make sure they're zeroed. |
69 | */ | 77 | */ |
70 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | 78 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { |
71 | switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); | 79 | lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); |
72 | if (!switcher_page[i]) { | 80 | if (!lg_switcher_pages[i]) { |
73 | err = -ENOMEM; | 81 | err = -ENOMEM; |
74 | goto free_some_pages; | 82 | goto free_some_pages; |
75 | } | 83 | } |
76 | } | 84 | } |
77 | 85 | ||
78 | /* | 86 | /* |
79 | * First we check that the Switcher won't overlap the fixmap area at | 87 | * We place the Switcher underneath the fixmap area, which is the |
80 | * the top of memory. It's currently nowhere near, but it could have | 88 | * highest virtual address we can get. This is important, since we |
81 | * very strange effects if it ever happened. | 89 | * tell the Guest it can't access this memory, so we want its ceiling |
90 | * as high as possible. | ||
82 | */ | 91 | */ |
83 | if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ | 92 | switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; |
84 | err = -ENOMEM; | ||
85 | printk("lguest: mapping switcher would thwack fixmap\n"); | ||
86 | goto free_pages; | ||
87 | } | ||
88 | 93 | ||
89 | /* | 94 | /* |
90 | * Now we reserve the "virtual memory area" we want: 0xFFC00000 | 95 | * Now we reserve the "virtual memory area" we want. We might |
91 | * (SWITCHER_ADDR). We might not get it in theory, but in practice | 96 | * not get it in theory, but in practice it's worked so far. |
92 | * it's worked so far. The end address needs +1 because __get_vm_area | 97 | * The end address needs +1 because __get_vm_area allocates an |
93 | * allocates an extra guard page, so we need space for that. | 98 | * extra guard page, so we need space for that. |
94 | */ | 99 | */ |
95 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | 100 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, |
96 | VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR | 101 | VM_ALLOC, switcher_addr, switcher_addr |
97 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); | 102 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); |
98 | if (!switcher_vma) { | 103 | if (!switcher_vma) { |
99 | err = -ENOMEM; | 104 | err = -ENOMEM; |
@@ -103,12 +108,12 @@ static __init int map_switcher(void) | |||
103 | 108 | ||
104 | /* | 109 | /* |
105 | * This code actually sets up the pages we've allocated to appear at | 110 | * This code actually sets up the pages we've allocated to appear at |
106 | * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the | 111 | * switcher_addr. map_vm_area() takes the vma we allocated above, the |
107 | * kind of pages we're mapping (kernel pages), and a pointer to our | 112 | * kind of pages we're mapping (kernel pages), and a pointer to our |
108 | * array of struct pages. It increments that pointer, but we don't | 113 | * array of struct pages. It increments that pointer, but we don't |
109 | * care. | 114 | * care. |
110 | */ | 115 | */ |
111 | pagep = switcher_page; | 116 | pagep = lg_switcher_pages; |
112 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); | 117 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); |
113 | if (err) { | 118 | if (err) { |
114 | printk("lguest: map_vm_area failed: %i\n", err); | 119 | printk("lguest: map_vm_area failed: %i\n", err); |
@@ -133,8 +138,8 @@ free_pages: | |||
133 | i = TOTAL_SWITCHER_PAGES; | 138 | i = TOTAL_SWITCHER_PAGES; |
134 | free_some_pages: | 139 | free_some_pages: |
135 | for (--i; i >= 0; i--) | 140 | for (--i; i >= 0; i--) |
136 | __free_pages(switcher_page[i], 0); | 141 | __free_pages(lg_switcher_pages[i], 0); |
137 | kfree(switcher_page); | 142 | kfree(lg_switcher_pages); |
138 | out: | 143 | out: |
139 | return err; | 144 | return err; |
140 | } | 145 | } |
@@ -149,8 +154,8 @@ static void unmap_switcher(void) | |||
149 | vunmap(switcher_vma->addr); | 154 | vunmap(switcher_vma->addr); |
150 | /* Now we just need to free the pages we copied the switcher into */ | 155 | /* Now we just need to free the pages we copied the switcher into */ |
151 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | 156 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) |
152 | __free_pages(switcher_page[i], 0); | 157 | __free_pages(lg_switcher_pages[i], 0); |
153 | kfree(switcher_page); | 158 | kfree(lg_switcher_pages); |
154 | } | 159 | } |
155 | 160 | ||
156 | /*H:032 | 161 | /*H:032 |
@@ -323,15 +328,10 @@ static int __init init(void) | |||
323 | if (err) | 328 | if (err) |
324 | goto out; | 329 | goto out; |
325 | 330 | ||
326 | /* Now we set up the pagetable implementation for the Guests. */ | ||
327 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | ||
328 | if (err) | ||
329 | goto unmap; | ||
330 | |||
331 | /* We might need to reserve an interrupt vector. */ | 331 | /* We might need to reserve an interrupt vector. */ |
332 | err = init_interrupts(); | 332 | err = init_interrupts(); |
333 | if (err) | 333 | if (err) |
334 | goto free_pgtables; | 334 | goto unmap; |
335 | 335 | ||
336 | /* /dev/lguest needs to be registered. */ | 336 | /* /dev/lguest needs to be registered. */ |
337 | err = lguest_device_init(); | 337 | err = lguest_device_init(); |
@@ -346,8 +346,6 @@ static int __init init(void) | |||
346 | 346 | ||
347 | free_interrupts: | 347 | free_interrupts: |
348 | free_interrupts(); | 348 | free_interrupts(); |
349 | free_pgtables: | ||
350 | free_pagetables(); | ||
351 | unmap: | 349 | unmap: |
352 | unmap_switcher(); | 350 | unmap_switcher(); |
353 | out: | 351 | out: |
@@ -359,7 +357,6 @@ static void __exit fini(void) | |||
359 | { | 357 | { |
360 | lguest_device_remove(); | 358 | lguest_device_remove(); |
361 | free_interrupts(); | 359 | free_interrupts(); |
362 | free_pagetables(); | ||
363 | unmap_switcher(); | 360 | unmap_switcher(); |
364 | 361 | ||
365 | lguest_arch_host_fini(); | 362 | lguest_arch_host_fini(); |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@ | |||
14 | 14 | ||
15 | #include <asm/lguest.h> | 15 | #include <asm/lguest.h> |
16 | 16 | ||
17 | void free_pagetables(void); | ||
18 | int init_pagetables(struct page **switcher_page, unsigned int pages); | ||
19 | |||
20 | struct pgdir { | 17 | struct pgdir { |
21 | unsigned long gpgdir; | 18 | unsigned long gpgdir; |
19 | bool switcher_mapped; | ||
20 | int last_host_cpu; | ||
22 | pgd_t *pgdir; | 21 | pgd_t *pgdir; |
23 | }; | 22 | }; |
24 | 23 | ||
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, | |||
124 | unsigned long addr, unsigned long len); | 123 | unsigned long addr, unsigned long len); |
125 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); | 124 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); |
126 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); | 125 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); |
126 | extern struct page **lg_switcher_pages; | ||
127 | 127 | ||
128 | /*H:035 | 128 | /*H:035 |
129 | * Using memory-copy operations like that is usually inconvient, so we | 129 | * Using memory-copy operations like that is usually inconvient, so we |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
250 | */ | 250 | */ |
251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | 251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) |
252 | { | 252 | { |
253 | /* We have a limited number the number of CPUs in the lguest struct. */ | 253 | /* We have a limited number of CPUs in the lguest struct. */ |
254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) | 254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) |
255 | return -EINVAL; | 255 | return -EINVAL; |
256 | 256 | ||
257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ | 257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ |
258 | cpu->id = id; | 258 | cpu->id = id; |
259 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | 259 | cpu->lg = container_of(cpu, struct lguest, cpus[id]); |
260 | cpu->lg->nr_cpus++; | 260 | cpu->lg->nr_cpus++; |
261 | 261 | ||
262 | /* Each CPU has a timer it can set. */ | 262 | /* Each CPU has a timer it can set. */ |
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
270 | if (!cpu->regs_page) | 270 | if (!cpu->regs_page) |
271 | return -ENOMEM; | 271 | return -ENOMEM; |
272 | 272 | ||
273 | /* We actually put the registers at the bottom of the page. */ | 273 | /* We actually put the registers at the end of the page. */ |
274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); | 274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); |
275 | 275 | ||
276 | /* | 276 | /* |
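The lg_cpu_start() hunk above replaces hand-rolled pointer arithmetic with container_of() on the array member itself, which is what the "improve code readability" commit in the shortlog refers to. Below is a standalone sketch, with made-up struct names, showing that both expressions recover the same enclosing structure; note that a variable array index inside offsetof() is a GCC/Clang extension the kernel relies on.

```c
/* Illustration only: container_of() applied to an array member. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cpu { int id; };
struct vm  { struct cpu cpus[4]; };

int main(void)
{
	struct vm vm;
	unsigned int id = 2;
	struct cpu *cpu = &vm.cpus[id];

	/* Old style: step back to cpus[0] by hand, then use container_of(). */
	struct vm *old_way = container_of(cpu - id, struct vm, cpus[0]);
	/* New style: name the member we actually hold a pointer to. */
	struct vm *new_way = container_of(cpu, struct vm, cpus[id]);

	printf("%p %p %p\n", (void *)&vm, (void *)old_way, (void *)new_way);
	return 0;
}
```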
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@ | |||
7 | * converted Guest pages when running the Guest. | 7 | * converted Guest pages when running the Guest. |
8 | :*/ | 8 | :*/ |
9 | 9 | ||
10 | /* Copyright (C) Rusty Russell IBM Corporation 2006. | 10 | /* Copyright (C) Rusty Russell IBM Corporation 2013. |
11 | * GPL v2 and any later version */ | 11 | * GPL v2 and any later version */ |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
@@ -62,22 +62,11 @@ | |||
62 | * will need the last pmd entry of the last pmd page. | 62 | * will need the last pmd entry of the last pmd page. |
63 | */ | 63 | */ |
64 | #ifdef CONFIG_X86_PAE | 64 | #ifdef CONFIG_X86_PAE |
65 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
66 | #define RESERVE_MEM 2U | ||
67 | #define CHECK_GPGD_MASK _PAGE_PRESENT | 65 | #define CHECK_GPGD_MASK _PAGE_PRESENT |
68 | #else | 66 | #else |
69 | #define RESERVE_MEM 4U | ||
70 | #define CHECK_GPGD_MASK _PAGE_TABLE | 67 | #define CHECK_GPGD_MASK _PAGE_TABLE |
71 | #endif | 68 | #endif |
72 | 69 | ||
73 | /* | ||
74 | * We actually need a separate PTE page for each CPU. Remember that after the | ||
75 | * Switcher code itself comes two pages for each CPU, and we don't want this | ||
76 | * CPU's guest to see the pages of any other CPU. | ||
77 | */ | ||
78 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | ||
79 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | ||
80 | |||
81 | /*H:320 | 70 | /*H:320 |
82 | * The page table code is curly enough to need helper functions to keep it | 71 | * The page table code is curly enough to need helper functions to keep it |
83 | * clear and clean. The kernel itself provides many of them; one advantage | 72 | * clear and clean. The kernel itself provides many of them; one advantage |
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
95 | { | 84 | { |
96 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
97 | 86 | ||
98 | #ifndef CONFIG_X86_PAE | ||
99 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
100 | if (index >= SWITCHER_PGD_INDEX) { | ||
101 | kill_guest(cpu, "attempt to access switcher pages"); | ||
102 | index = 0; | ||
103 | } | ||
104 | #endif | ||
105 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 87 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
106 | return &cpu->lg->pgdirs[i].pgdir[index]; | 88 | return &cpu->lg->pgdirs[i].pgdir[index]; |
107 | } | 89 | } |
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
117 | unsigned int index = pmd_index(vaddr); | 99 | unsigned int index = pmd_index(vaddr); |
118 | pmd_t *page; | 100 | pmd_t *page; |
119 | 101 | ||
120 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
121 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
122 | index >= SWITCHER_PMD_INDEX) { | ||
123 | kill_guest(cpu, "attempt to access switcher pages"); | ||
124 | index = 0; | ||
125 | } | ||
126 | |||
127 | /* You should never call this if the PGD entry wasn't valid */ | 102 | /* You should never call this if the PGD entry wasn't valid */ |
128 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 103 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
129 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 104 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte) | |||
275 | } | 250 | } |
276 | /*:*/ | 251 | /*:*/ |
277 | 252 | ||
278 | static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | 253 | static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) |
279 | { | 254 | { |
280 | if ((pte_flags(gpte) & _PAGE_PSE) || | 255 | if ((pte_flags(gpte) & _PAGE_PSE) || |
281 | pte_pfn(gpte) >= cpu->lg->pfn_limit) | 256 | pte_pfn(gpte) >= cpu->lg->pfn_limit) { |
282 | kill_guest(cpu, "bad page table entry"); | 257 | kill_guest(cpu, "bad page table entry"); |
258 | return false; | ||
259 | } | ||
260 | return true; | ||
283 | } | 261 | } |
284 | 262 | ||
285 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 263 | static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
286 | { | 264 | { |
287 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || | 265 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
288 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 266 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { |
289 | kill_guest(cpu, "bad page directory entry"); | 267 | kill_guest(cpu, "bad page directory entry"); |
268 | return false; | ||
269 | } | ||
270 | return true; | ||
290 | } | 271 | } |
291 | 272 | ||
292 | #ifdef CONFIG_X86_PAE | 273 | #ifdef CONFIG_X86_PAE |
293 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | 274 | static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) |
294 | { | 275 | { |
295 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | 276 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || |
296 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | 277 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { |
297 | kill_guest(cpu, "bad page middle directory entry"); | 278 | kill_guest(cpu, "bad page middle directory entry"); |
279 | return false; | ||
280 | } | ||
281 | return true; | ||
298 | } | 282 | } |
299 | #endif | 283 | #endif |
300 | 284 | ||
301 | /*H:330 | 285 | /*H:331 |
302 | * (i) Looking up a page table entry when the Guest faults. | 286 | * This is the core routine to walk the shadow page tables and find the page |
303 | * | 287 | * table entry for a specific address. |
304 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
305 | * come here. That's because we only set up the shadow page tables lazily as | ||
306 | * they're needed, so we get page faults all the time and quietly fix them up | ||
307 | * and return to the Guest without it knowing. | ||
308 | * | 288 | * |
309 | * If we fixed up the fault (ie. we mapped the address), this routine returns | 289 | * If allocate is set, then we allocate any missing levels, setting the flags |
310 | * true. Otherwise, it was a real fault and we need to tell the Guest. | 290 | * on the new page directory and mid-level directories using the arguments |
291 | * (which are copied from the Guest's page table entries). | ||
311 | */ | 292 | */ |
312 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | 293 | static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, |
294 | int pgd_flags, int pmd_flags) | ||
313 | { | 295 | { |
314 | pgd_t gpgd; | ||
315 | pgd_t *spgd; | 296 | pgd_t *spgd; |
316 | unsigned long gpte_ptr; | ||
317 | pte_t gpte; | ||
318 | pte_t *spte; | ||
319 | |||
320 | /* Mid level for PAE. */ | 297 | /* Mid level for PAE. */ |
321 | #ifdef CONFIG_X86_PAE | 298 | #ifdef CONFIG_X86_PAE |
322 | pmd_t *spmd; | 299 | pmd_t *spmd; |
323 | pmd_t gpmd; | ||
324 | #endif | 300 | #endif |
325 | 301 | ||
326 | /* First step: get the top-level Guest page table entry. */ | 302 | /* Get top level entry. */ |
327 | if (unlikely(cpu->linear_pages)) { | ||
328 | /* Faking up a linear mapping. */ | ||
329 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
330 | } else { | ||
331 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
332 | /* Toplevel not present? We can't map it in. */ | ||
333 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
334 | return false; | ||
335 | } | ||
336 | |||
337 | /* Now look at the matching shadow entry. */ | ||
338 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 303 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
339 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { | 304 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
340 | /* No shadow entry: allocate a new shadow PTE page. */ | 305 | /* No shadow entry: allocate a new shadow PTE page. */ |
341 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 306 | unsigned long ptepage; |
307 | |||
308 | /* If they didn't want us to allocate anything, stop. */ | ||
309 | if (!allocate) | ||
310 | return NULL; | ||
311 | |||
312 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
342 | /* | 313 | /* |
343 | * This is not really the Guest's fault, but killing it is | 314 | * This is not really the Guest's fault, but killing it is |
344 | * simple for this corner case. | 315 | * simple for this corner case. |
345 | */ | 316 | */ |
346 | if (!ptepage) { | 317 | if (!ptepage) { |
347 | kill_guest(cpu, "out of memory allocating pte page"); | 318 | kill_guest(cpu, "out of memory allocating pte page"); |
348 | return false; | 319 | return NULL; |
349 | } | 320 | } |
350 | /* We check that the Guest pgd is OK. */ | ||
351 | check_gpgd(cpu, gpgd); | ||
352 | /* | 321 | /* |
353 | * And we copy the flags to the shadow PGD entry. The page | 322 | * And we copy the flags to the shadow PGD entry. The page |
354 | * number in the shadow PGD is the page we just allocated. | 323 | * number in the shadow PGD is the page we just allocated. |
355 | */ | 324 | */ |
356 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); | 325 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); |
357 | } | 326 | } |
358 | 327 | ||
328 | /* | ||
329 | * Intel's Physical Address Extension actually uses three levels of | ||
330 | * page tables, so we need to look in the mid-level. | ||
331 | */ | ||
359 | #ifdef CONFIG_X86_PAE | 332 | #ifdef CONFIG_X86_PAE |
360 | if (unlikely(cpu->linear_pages)) { | 333 | /* Now look at the mid-level shadow entry. */ |
361 | /* Faking up a linear mapping. */ | ||
362 | gpmd = __pmd(_PAGE_TABLE); | ||
363 | } else { | ||
364 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
365 | /* Middle level not present? We can't map it in. */ | ||
366 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
367 | return false; | ||
368 | } | ||
369 | |||
370 | /* Now look at the matching shadow entry. */ | ||
371 | spmd = spmd_addr(cpu, *spgd, vaddr); | 334 | spmd = spmd_addr(cpu, *spgd, vaddr); |
372 | 335 | ||
373 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | 336 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { |
374 | /* No shadow entry: allocate a new shadow PTE page. */ | 337 | /* No shadow entry: allocate a new shadow PTE page. */ |
375 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 338 | unsigned long ptepage; |
339 | |||
340 | /* If they didn't want us to allocate anything, stop. */ | ||
341 | if (!allocate) | ||
342 | return NULL; | ||
343 | |||
344 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
376 | 345 | ||
377 | /* | 346 | /* |
378 | * This is not really the Guest's fault, but killing it is | 347 | * This is not really the Guest's fault, but killing it is |
379 | * simple for this corner case. | 348 | * simple for this corner case. |
380 | */ | 349 | */ |
381 | if (!ptepage) { | 350 | if (!ptepage) { |
382 | kill_guest(cpu, "out of memory allocating pte page"); | 351 | kill_guest(cpu, "out of memory allocating pmd page"); |
383 | return false; | 352 | return NULL; |
384 | } | 353 | } |
385 | 354 | ||
386 | /* We check that the Guest pmd is OK. */ | ||
387 | check_gpmd(cpu, gpmd); | ||
388 | |||
389 | /* | 355 | /* |
390 | * And we copy the flags to the shadow PMD entry. The page | 356 | * And we copy the flags to the shadow PMD entry. The page |
391 | * number in the shadow PMD is the page we just allocated. | 357 | * number in the shadow PMD is the page we just allocated. |
392 | */ | 358 | */ |
393 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | 359 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); |
360 | } | ||
361 | #endif | ||
362 | |||
363 | /* Get the pointer to the shadow PTE entry we're going to set. */ | ||
364 | return spte_addr(cpu, *spgd, vaddr); | ||
365 | } | ||
366 | |||
367 | /*H:330 | ||
368 | * (i) Looking up a page table entry when the Guest faults. | ||
369 | * | ||
370 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
371 | * come here. That's because we only set up the shadow page tables lazily as | ||
372 | * they're needed, so we get page faults all the time and quietly fix them up | ||
373 | * and return to the Guest without it knowing. | ||
374 | * | ||
375 | * If we fixed up the fault (ie. we mapped the address), this routine returns | ||
376 | * true. Otherwise, it was a real fault and we need to tell the Guest. | ||
377 | */ | ||
378 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | ||
379 | { | ||
380 | unsigned long gpte_ptr; | ||
381 | pte_t gpte; | ||
382 | pte_t *spte; | ||
383 | pmd_t gpmd; | ||
384 | pgd_t gpgd; | ||
385 | |||
386 | /* We never demand page the Switcher, so trying is a mistake. */ | ||
387 | if (vaddr >= switcher_addr) | ||
388 | return false; | ||
389 | |||
390 | /* First step: get the top-level Guest page table entry. */ | ||
391 | if (unlikely(cpu->linear_pages)) { | ||
392 | /* Faking up a linear mapping. */ | ||
393 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
394 | } else { | ||
395 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
396 | /* Toplevel not present? We can't map it in. */ | ||
397 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
398 | return false; | ||
399 | |||
400 | /* | ||
401 | * This kills the Guest if it has weird flags or tries to | ||
402 | * refer to a "physical" address outside the bounds. | ||
403 | */ | ||
404 | if (!check_gpgd(cpu, gpgd)) | ||
405 | return false; | ||
406 | } | ||
407 | |||
408 | /* This "mid-level" entry is only used for non-linear, PAE mode. */ | ||
409 | gpmd = __pmd(_PAGE_TABLE); | ||
410 | |||
411 | #ifdef CONFIG_X86_PAE | ||
412 | if (likely(!cpu->linear_pages)) { | ||
413 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
414 | /* Middle level not present? We can't map it in. */ | ||
415 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
416 | return false; | ||
417 | |||
418 | /* | ||
419 | * This kills the Guest if it has weird flags or tries to | ||
420 | * refer to a "physical" address outside the bounds. | ||
421 | */ | ||
422 | if (!check_gpmd(cpu, gpmd)) | ||
423 | return false; | ||
394 | } | 424 | } |
395 | 425 | ||
396 | /* | 426 | /* |
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
433 | * Check that the Guest PTE flags are OK, and the page number is below | 463 | * Check that the Guest PTE flags are OK, and the page number is below |
434 | * the pfn_limit (ie. not mapping the Launcher binary). | 464 | * the pfn_limit (ie. not mapping the Launcher binary). |
435 | */ | 465 | */ |
436 | check_gpte(cpu, gpte); | 466 | if (!check_gpte(cpu, gpte)) |
467 | return false; | ||
437 | 468 | ||
438 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 469 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
439 | gpte = pte_mkyoung(gpte); | 470 | gpte = pte_mkyoung(gpte); |
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
441 | gpte = pte_mkdirty(gpte); | 472 | gpte = pte_mkdirty(gpte); |
442 | 473 | ||
443 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 474 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
444 | spte = spte_addr(cpu, *spgd, vaddr); | 475 | spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); |
476 | if (!spte) | ||
477 | return false; | ||
445 | 478 | ||
446 | /* | 479 | /* |
447 | * If there was a valid shadow PTE entry here before, we release it. | 480 | * If there was a valid shadow PTE entry here before, we release it. |
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
493 | */ | 526 | */ |
494 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | 527 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) |
495 | { | 528 | { |
496 | pgd_t *spgd; | 529 | pte_t *spte; |
497 | unsigned long flags; | 530 | unsigned long flags; |
498 | 531 | ||
499 | #ifdef CONFIG_X86_PAE | 532 | /* You can't put your stack in the Switcher! */ |
500 | pmd_t *spmd; | 533 | if (vaddr >= switcher_addr) |
501 | #endif | ||
502 | /* Look at the current top level entry: is it present? */ | ||
503 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | ||
504 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | ||
505 | return false; | 534 | return false; |
506 | 535 | ||
507 | #ifdef CONFIG_X86_PAE | 536 | /* If there's no shadow PTE, it's not writable. */ |
508 | spmd = spmd_addr(cpu, *spgd, vaddr); | 537 | spte = find_spte(cpu, vaddr, false, 0, 0); |
509 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | 538 | if (!spte) |
510 | return false; | 539 | return false; |
511 | #endif | ||
512 | 540 | ||
513 | /* | 541 | /* |
514 | * Check the flags on the pte entry itself: it must be present and | 542 | * Check the flags on the pte entry itself: it must be present and |
515 | * writable. | 543 | * writable. |
516 | */ | 544 | */ |
517 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); | 545 | flags = pte_flags(*spte); |
518 | |||
519 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 546 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
520 | } | 547 | } |
521 | 548 | ||
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
678 | int *blank_pgdir) | 705 | int *blank_pgdir) |
679 | { | 706 | { |
680 | unsigned int next; | 707 | unsigned int next; |
681 | #ifdef CONFIG_X86_PAE | ||
682 | pmd_t *pmd_table; | ||
683 | #endif | ||
684 | 708 | ||
685 | /* | 709 | /* |
686 | * We pick one entry at random to throw out. Choosing the Least | 710 | * We pick one entry at random to throw out. Choosing the Least |
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
695 | if (!cpu->lg->pgdirs[next].pgdir) | 719 | if (!cpu->lg->pgdirs[next].pgdir) |
696 | next = cpu->cpu_pgd; | 720 | next = cpu->cpu_pgd; |
697 | else { | 721 | else { |
698 | #ifdef CONFIG_X86_PAE | ||
699 | /* | 722 | /* |
700 | * In PAE mode, allocate a pmd page and populate the | 723 | * This is a blank page, so there are no kernel |
701 | * last pgd entry. | 724 | * mappings: caller must map the stack! |
702 | */ | 725 | */ |
703 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
704 | if (!pmd_table) { | ||
705 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
706 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
707 | next = cpu->cpu_pgd; | ||
708 | } else { | ||
709 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
710 | SWITCHER_PGD_INDEX, | ||
711 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
712 | /* | ||
713 | * This is a blank page, so there are no kernel | ||
714 | * mappings: caller must map the stack! | ||
715 | */ | ||
716 | *blank_pgdir = 1; | ||
717 | } | ||
718 | #else | ||
719 | *blank_pgdir = 1; | 726 | *blank_pgdir = 1; |
720 | #endif | ||
721 | } | 727 | } |
722 | } | 728 | } |
723 | /* Record which Guest toplevel this shadows. */ | 729 | /* Record which Guest toplevel this shadows. */ |
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
725 | /* Release all the non-kernel mappings. */ | 731 | /* Release all the non-kernel mappings. */ |
726 | flush_user_mappings(cpu->lg, next); | 732 | flush_user_mappings(cpu->lg, next); |
727 | 733 | ||
734 | /* This hasn't run on any CPU at all. */ | ||
735 | cpu->lg->pgdirs[next].last_host_cpu = -1; | ||
736 | |||
728 | return next; | 737 | return next; |
729 | } | 738 | } |
730 | 739 | ||
740 | /*H:501 | ||
741 | * We do need the Switcher code mapped at all times, so we allocate that | ||
742 | * part of the Guest page table here. We map the Switcher code immediately, | ||
743 | * but defer mapping of the guest register page and IDT/LDT etc page until | ||
744 | * just before we run the guest in map_switcher_in_guest(). | ||
745 | * | ||
746 | * We *could* do this setup in map_switcher_in_guest(), but at that point | ||
747 | * we've interrupts disabled, and allocating pages like that is fraught: we | ||
748 | * can't sleep if we need to free up some memory. | ||
749 | */ | ||
750 | static bool allocate_switcher_mapping(struct lg_cpu *cpu) | ||
751 | { | ||
752 | int i; | ||
753 | |||
754 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | ||
755 | pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, | ||
756 | CHECK_GPGD_MASK, _PAGE_TABLE); | ||
757 | if (!pte) | ||
758 | return false; | ||
759 | |||
760 | /* | ||
761 | * Map the switcher page if not already there. It might | ||
762 | * already be there because we call allocate_switcher_mapping() | ||
763 | * in guest_set_pgd() just in case it did discard our Switcher | ||
764 | * mapping, but it probably didn't. | ||
765 | */ | ||
766 | if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { | ||
767 | /* Get a reference to the Switcher page. */ | ||
768 | get_page(lg_switcher_pages[0]); | ||
769 | /* Create a read-only, exectuable, kernel-style PTE */ | ||
770 | set_pte(pte, | ||
771 | mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); | ||
772 | } | ||
773 | } | ||
774 | cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; | ||
775 | return true; | ||
776 | } | ||
777 | |||
731 | /*H:470 | 778 | /*H:470 |
732 | * Finally, a routine which throws away everything: all PGD entries in all | 779 | * Finally, a routine which throws away everything: all PGD entries in all |
733 | * the shadow page tables, including the Guest's kernel mappings. This is used | 780 | * the shadow page tables, including the Guest's kernel mappings. This is used |
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) | |||
738 | unsigned int i, j; | 785 | unsigned int i, j; |
739 | 786 | ||
740 | /* Every shadow pagetable this Guest has */ | 787 | /* Every shadow pagetable this Guest has */ |
741 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 788 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { |
742 | if (lg->pgdirs[i].pgdir) { | 789 | if (!lg->pgdirs[i].pgdir) |
743 | #ifdef CONFIG_X86_PAE | 790 | continue; |
744 | pgd_t *spgd; | 791 | |
745 | pmd_t *pmdpage; | 792 | /* Every PGD entry. */ |
746 | unsigned int k; | 793 | for (j = 0; j < PTRS_PER_PGD; j++) |
747 | 794 | release_pgd(lg->pgdirs[i].pgdir + j); | |
748 | /* Get the last pmd page. */ | 795 | lg->pgdirs[i].switcher_mapped = false; |
749 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | 796 | lg->pgdirs[i].last_host_cpu = -1; |
750 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | 797 | } |
751 | |||
752 | /* | ||
753 | * And release the pmd entries of that pmd page, | ||
754 | * except for the switcher pmd. | ||
755 | */ | ||
756 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
757 | release_pmd(&pmdpage[k]); | ||
758 | #endif | ||
759 | /* Every PGD entry except the Switcher at the top */ | ||
760 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | ||
761 | release_pgd(lg->pgdirs[i].pgdir + j); | ||
762 | } | ||
763 | } | 798 | } |
764 | 799 | ||
765 | /* | 800 | /* |
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) | |||
773 | release_all_pagetables(cpu->lg); | 808 | release_all_pagetables(cpu->lg); |
774 | /* We need the Guest kernel stack mapped again. */ | 809 | /* We need the Guest kernel stack mapped again. */ |
775 | pin_stack_pages(cpu); | 810 | pin_stack_pages(cpu); |
811 | /* And we need Switcher allocated. */ | ||
812 | if (!allocate_switcher_mapping(cpu)) | ||
813 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
776 | } | 814 | } |
777 | 815 | ||
778 | /*H:430 | 816 | /*H:430 |
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | |||
808 | newpgdir = new_pgdir(cpu, pgtable, &repin); | 846 | newpgdir = new_pgdir(cpu, pgtable, &repin); |
809 | /* Change the current pgd index to the new one. */ | 847 | /* Change the current pgd index to the new one. */ |
810 | cpu->cpu_pgd = newpgdir; | 848 | cpu->cpu_pgd = newpgdir; |
811 | /* If it was completely blank, we map in the Guest kernel stack */ | 849 | /* |
850 | * If it was completely blank, we map in the Guest kernel stack and | ||
851 | * the Switcher. | ||
852 | */ | ||
812 | if (repin) | 853 | if (repin) |
813 | pin_stack_pages(cpu); | 854 | pin_stack_pages(cpu); |
855 | |||
856 | if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { | ||
857 | if (!allocate_switcher_mapping(cpu)) | ||
858 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
859 | } | ||
814 | } | 860 | } |
815 | /*:*/ | 861 | /*:*/ |
816 | 862 | ||
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
865 | * micro-benchmark. | 911 | * micro-benchmark. |
866 | */ | 912 | */ |
867 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 913 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
868 | check_gpte(cpu, gpte); | 914 | if (!check_gpte(cpu, gpte)) |
915 | return; | ||
869 | set_pte(spte, | 916 | set_pte(spte, |
870 | gpte_to_spte(cpu, gpte, | 917 | gpte_to_spte(cpu, gpte, |
871 | pte_flags(gpte) & _PAGE_DIRTY)); | 918 | pte_flags(gpte) & _PAGE_DIRTY)); |
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
897 | void guest_set_pte(struct lg_cpu *cpu, | 944 | void guest_set_pte(struct lg_cpu *cpu, |
898 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) | 945 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
899 | { | 946 | { |
947 | /* We don't let you remap the Switcher; we need it to get back! */ | ||
948 | if (vaddr >= switcher_addr) { | ||
949 | kill_guest(cpu, "attempt to set pte into Switcher pages"); | ||
950 | return; | ||
951 | } | ||
952 | |||
900 | /* | 953 | /* |
901 | * Kernel mappings must be changed on all top levels. Slow, but doesn't | 954 | * Kernel mappings must be changed on all top levels. Slow, but doesn't |
902 | * happen often. | 955 | * happen often. |
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
933 | { | 986 | { |
934 | int pgdir; | 987 | int pgdir; |
935 | 988 | ||
936 | if (idx >= SWITCHER_PGD_INDEX) | 989 | if (idx > PTRS_PER_PGD) { |
990 | kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", | ||
991 | idx, PTRS_PER_PGD); | ||
937 | return; | 992 | return; |
993 | } | ||
938 | 994 | ||
939 | /* If they're talking about a page table we have a shadow for... */ | 995 | /* If they're talking about a page table we have a shadow for... */ |
940 | pgdir = find_pgdir(lg, gpgdir); | 996 | pgdir = find_pgdir(lg, gpgdir); |
941 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 997 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) { |
942 | /* ... throw it away. */ | 998 | /* ... throw it away. */ |
943 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 999 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
1000 | /* That might have been the Switcher mapping, remap it. */ | ||
1001 | if (!allocate_switcher_mapping(&lg->cpus[0])) { | ||
1002 | kill_guest(&lg->cpus[0], | ||
1003 | "Cannot populate switcher mapping"); | ||
1004 | } | ||
1005 | } | ||
944 | } | 1006 | } |
945 | 1007 | ||
946 | #ifdef CONFIG_X86_PAE | 1008 | #ifdef CONFIG_X86_PAE |
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | |||
958 | * we will populate on future faults. The Guest doesn't have any actual | 1020 | * we will populate on future faults. The Guest doesn't have any actual |
959 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it | 1021 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it |
960 | * for the moment. | 1022 | * for the moment. |
1023 | * | ||
1024 | * We do need the Switcher to be mapped at all times, so we allocate that | ||
1025 | * part of the Guest page table here. | ||
961 | */ | 1026 | */ |
962 | int init_guest_pagetable(struct lguest *lg) | 1027 | int init_guest_pagetable(struct lguest *lg) |
963 | { | 1028 | { |
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) | |||
971 | 1036 | ||
972 | /* We start with a linear mapping until the initialize. */ | 1037 | /* We start with a linear mapping until the initialize. */ |
973 | cpu->linear_pages = true; | 1038 | cpu->linear_pages = true; |
1039 | |||
1040 | /* Allocate the page tables for the Switcher. */ | ||
1041 | if (!allocate_switcher_mapping(cpu)) { | ||
1042 | release_all_pagetables(lg); | ||
1043 | return -ENOMEM; | ||
1044 | } | ||
1045 | |||
974 | return 0; | 1046 | return 0; |
975 | } | 1047 | } |
976 | 1048 | ||
977 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1049 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
978 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1050 | void page_table_guest_data_init(struct lg_cpu *cpu) |
979 | { | 1051 | { |
1052 | /* | ||
1053 | * We tell the Guest that it can't use the virtual addresses | ||
1054 | * used by the Switcher. This trick is equivalent to 4GB - | ||
1055 | * switcher_addr. | ||
1056 | */ | ||
1057 | u32 top = ~switcher_addr + 1; | ||
1058 | |||
980 | /* We get the kernel address: above this is all kernel memory. */ | 1059 | /* We get the kernel address: above this is all kernel memory. */ |
981 | if (get_user(cpu->lg->kernel_address, | 1060 | if (get_user(cpu->lg->kernel_address, |
982 | &cpu->lg->lguest_data->kernel_address) | 1061 | &cpu->lg->lguest_data->kernel_address) |
983 | /* | 1062 | /* |
984 | * We tell the Guest that it can't use the top 2 or 4 MB | 1063 | * We tell the Guest that it can't use the top virtual |
985 | * of virtual addresses used by the Switcher. | 1064 | * addresses (used by the Switcher). |
986 | */ | 1065 | */ |
987 | || put_user(RESERVE_MEM * 1024 * 1024, | 1066 | || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { |
988 | &cpu->lg->lguest_data->reserve_mem)) { | ||
989 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 1067 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
990 | return; | 1068 | return; |
991 | } | 1069 | } |
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
995 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 1073 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
996 | * Switcher mappings, so check that now. | 1074 | * Switcher mappings, so check that now. |
997 | */ | 1075 | */ |
998 | #ifdef CONFIG_X86_PAE | 1076 | if (cpu->lg->kernel_address >= switcher_addr) |
999 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
1000 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
1001 | #else | ||
1002 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | ||
1003 | #endif | ||
1004 | kill_guest(cpu, "bad kernel address %#lx", | 1077 | kill_guest(cpu, "bad kernel address %#lx", |
1005 | cpu->lg->kernel_address); | 1078 | cpu->lg->kernel_address); |
1006 | } | 1079 | } |
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) | |||
1017 | free_page((long)lg->pgdirs[i].pgdir); | 1090 | free_page((long)lg->pgdirs[i].pgdir); |
1018 | } | 1091 | } |
1019 | 1092 | ||
1020 | /*H:480 | 1093 | /*H:481 |
1021 | * (vi) Mapping the Switcher when the Guest is about to run. | 1094 | * This clears the Switcher mappings for cpu #i. |
1022 | * | ||
1023 | * The Switcher and the two pages for this CPU need to be visible in the | ||
1024 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | ||
1025 | * for each CPU already set up, we just need to hook them in now we know which | ||
1026 | * Guest is about to run on this CPU. | ||
1027 | */ | 1095 | */ |
1028 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 1096 | static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) |
1029 | { | 1097 | { |
1030 | pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); | 1098 | unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; |
1031 | pte_t regs_pte; | 1099 | pte_t *pte; |
1032 | 1100 | ||
1033 | #ifdef CONFIG_X86_PAE | 1101 | /* Clear the mappings for both pages. */ |
1034 | pmd_t switcher_pmd; | 1102 | pte = find_spte(cpu, base, false, 0, 0); |
1035 | pmd_t *pmd_table; | 1103 | release_pte(*pte); |
1036 | 1104 | set_pte(pte, __pte(0)); | |
1037 | switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, | ||
1038 | PAGE_KERNEL_EXEC); | ||
1039 | |||
1040 | /* Figure out where the pmd page is, by reading the PGD, and converting | ||
1041 | * it to a virtual address. */ | ||
1042 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
1043 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
1044 | << PAGE_SHIFT); | ||
1045 | /* Now write it into the shadow page table. */ | ||
1046 | set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
1047 | #else | ||
1048 | pgd_t switcher_pgd; | ||
1049 | 1105 | ||
1050 | /* | 1106 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1051 | * Make the last PGD entry for this Guest point to the Switcher's PTE | 1107 | release_pte(*pte); |
1052 | * page for this CPU (with appropriate flags). | 1108 | set_pte(pte, __pte(0)); |
1053 | */ | ||
1054 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); | ||
1055 | |||
1056 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | ||
1057 | |||
1058 | #endif | ||
1059 | /* | ||
1060 | * We also change the Switcher PTE page. When we're running the Guest, | ||
1061 | * we want the Guest's "regs" page to appear where the first Switcher | ||
1062 | * page for this CPU is. This is an optimization: when the Switcher | ||
1063 | * saves the Guest registers, it saves them into the first page of this | ||
1064 | * CPU's "struct lguest_pages": if we make sure the Guest's register | ||
1065 | * page is already mapped there, we don't have to copy them out | ||
1066 | * again. | ||
1067 | */ | ||
1068 | regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); | ||
1069 | set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); | ||
1070 | } | 1109 | } |
1071 | /*:*/ | ||
1072 | 1110 | ||
1073 | static void free_switcher_pte_pages(void) | 1111 | /*H:480 |
1074 | { | 1112 | * (vi) Mapping the Switcher when the Guest is about to run. |
1075 | unsigned int i; | 1113 | * |
1076 | 1114 | * The Switcher and the two pages for this CPU need to be visible in the Guest | |
1077 | for_each_possible_cpu(i) | 1115 | * (and not the pages for other CPUs). |
1078 | free_page((long)switcher_pte_page(i)); | ||
1079 | } | ||
1080 | |||
1081 | /*H:520 | ||
1082 | * Setting up the Switcher PTE page for given CPU is fairly easy, given | ||
1083 | * the CPU number and the "struct page"s for the Switcher code itself. | ||
1084 | * | 1116 | * |
1085 | * Currently the Switcher is less than a page long, so "pages" is always 1. | 1117 | * The pages for the pagetables have all been allocated before: we just need |
1118 | * to make sure the actual PTEs are up-to-date for the CPU we're about to run | ||
1119 | * on. | ||
1086 | */ | 1120 | */ |
1087 | static __init void populate_switcher_pte_page(unsigned int cpu, | 1121 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
1088 | struct page *switcher_page[], | ||
1089 | unsigned int pages) | ||
1090 | { | 1122 | { |
1091 | unsigned int i; | 1123 | unsigned long base; |
1092 | pte_t *pte = switcher_pte_page(cpu); | 1124 | struct page *percpu_switcher_page, *regs_page; |
1125 | pte_t *pte; | ||
1126 | struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; | ||
1127 | |||
1128 | /* Switcher page should always be mapped by now! */ | ||
1129 | BUG_ON(!pgdir->switcher_mapped); | ||
1130 | |||
1131 | /* | ||
1132 | * Remember that we have two pages for each Host CPU, so we can run a | ||
1133 | * Guest on each CPU without them interfering. We need to make sure | ||
1134 | * those pages are mapped correctly in the Guest, but since we usually | ||
1135 | * run on the same CPU, we cache that, and only update the mappings | ||
1136 | * when we move. | ||
1137 | */ | ||
1138 | if (pgdir->last_host_cpu == raw_smp_processor_id()) | ||
1139 | return; | ||
1093 | 1140 | ||
1094 | /* The first entries are easy: they map the Switcher code. */ | 1141 | /* -1 means unknown so we remove everything. */ |
1095 | for (i = 0; i < pages; i++) { | 1142 | if (pgdir->last_host_cpu == -1) { |
1096 | set_pte(&pte[i], mk_pte(switcher_page[i], | 1143 | unsigned int i; |
1097 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1144 | for_each_possible_cpu(i) |
1145 | remove_switcher_percpu_map(cpu, i); | ||
1146 | } else { | ||
1147 | /* We know exactly what CPU mapping to remove. */ | ||
1148 | remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); | ||
1098 | } | 1149 | } |
1099 | 1150 | ||
1100 | /* The only other thing we map is this CPU's pair of pages. */ | 1151 | /* |
1101 | i = pages + cpu*2; | 1152 | * When we're running the Guest, we want the Guest's "regs" page to |
1102 | 1153 | * appear where the first Switcher page for this CPU is. This is an | |
1103 | /* First page (Guest registers) is writable from the Guest */ | 1154 | * optimization: when the Switcher saves the Guest registers, it saves |
1104 | set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), | 1155 | * them into the first page of this CPU's "struct lguest_pages": if we |
1105 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); | 1156 | * make sure the Guest's register page is already mapped there, we |
1157 | * don't have to copy them out again. | ||
1158 | */ | ||
1159 | /* Find the shadow PTE for this regs page. */ | ||
1160 | base = switcher_addr + PAGE_SIZE | ||
1161 | + raw_smp_processor_id() * sizeof(struct lguest_pages); | ||
1162 | pte = find_spte(cpu, base, false, 0, 0); | ||
1163 | regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); | ||
1164 | get_page(regs_page); | ||
1165 | set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); | ||
1106 | 1166 | ||
1107 | /* | 1167 | /* |
1108 | * The second page contains the "struct lguest_ro_state", and is | 1168 | * We map the second page of the struct lguest_pages read-only in |
1109 | * read-only. | 1169 | * the Guest: the IDT, GDT and other things it's not supposed to |
1170 | * change. | ||
1110 | */ | 1171 | */ |
1111 | set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), | 1172 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1112 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1173 | percpu_switcher_page |
1174 | = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; | ||
1175 | get_page(percpu_switcher_page); | ||
1176 | set_pte(pte, mk_pte(percpu_switcher_page, | ||
1177 | __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); | ||
1178 | |||
1179 | pgdir->last_host_cpu = raw_smp_processor_id(); | ||
1113 | } | 1180 | } |
1114 | 1181 | ||
1115 | /* | 1182 | /*H:490 |
1116 | * We've made it through the page table code. Perhaps our tired brains are | 1183 | * We've made it through the page table code. Perhaps our tired brains are |
1117 | * still processing the details, or perhaps we're simply glad it's over. | 1184 | * still processing the details, or perhaps we're simply glad it's over. |
1118 | * | 1185 | * |
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
1124 | * | 1191 | * |
1125 | * There is just one file remaining in the Host. | 1192 | * There is just one file remaining in the Host. |
1126 | */ | 1193 | */ |
1127 | |||
1128 | /*H:510 | ||
1129 | * At boot or module load time, init_pagetables() allocates and populates | ||
1130 | * the Switcher PTE page for each CPU. | ||
1131 | */ | ||
1132 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | ||
1133 | { | ||
1134 | unsigned int i; | ||
1135 | |||
1136 | for_each_possible_cpu(i) { | ||
1137 | switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); | ||
1138 | if (!switcher_pte_page(i)) { | ||
1139 | free_switcher_pte_pages(); | ||
1140 | return -ENOMEM; | ||
1141 | } | ||
1142 | populate_switcher_pte_page(i, switcher_page, pages); | ||
1143 | } | ||
1144 | return 0; | ||
1145 | } | ||
1146 | /*:*/ | ||
1147 | |||
1148 | /* Cleaning up simply involves freeing the PTE page for each CPU. */ | ||
1149 | void free_pagetables(void) | ||
1150 | { | ||
1151 | free_switcher_pte_pages(); | ||
1152 | } | ||
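Most of the page_tables.c churn above funnels through one new helper, find_spte(), which walks the shadow page tables for a virtual address and, when asked, allocates any missing levels: demand_page() and allocate_switcher_mapping() call it with allocate set, while page_writable() and map_switcher_in_guest() only look up. A much-simplified, userspace-only sketch of that shape is below, using toy types and a two-level walk with no PAE handling.

```c
/* Illustration only: a toy two-level "shadow" walk in the spirit of
 * find_spte(). The real code manipulates hardware pagetable entries,
 * handles CONFIG_X86_PAE, and kills the Guest on allocation failure.
 */
#include <stdint.h>
#include <stdlib.h>

#define ENTRIES 1024

typedef uint32_t pte_t;
typedef struct { pte_t *ptes; } spgd_t;      /* toy top-level entry */

static spgd_t shadow[ENTRIES];               /* toy shadow top level */

static pte_t *find_spte(uint32_t vaddr, int allocate)
{
	spgd_t *spgd = &shadow[(vaddr >> 22) & (ENTRIES - 1)];

	if (!spgd->ptes) {
		if (!allocate)               /* lookup-only callers stop here */
			return NULL;
		spgd->ptes = calloc(ENTRIES, sizeof(pte_t));
		if (!spgd->ptes)
			return NULL;
	}
	return &spgd->ptes[(vaddr >> 12) & (ENTRIES - 1)];
}

int main(void)
{
	pte_t *w = find_spte(0x12345678, 1); /* like demand_page(): may allocate */
	pte_t *r = find_spte(0x12345678, 0); /* like page_writable(): lookup only */

	return (w && r && w == r) ? 0 : 1;
}
```

page_writable() in the diff is exactly the allocate=false case: a NULL return simply means the address has no shadow mapping yet.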
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct { | |||
59 | /* Offset from where switcher.S was compiled to where we've copied it */ | 59 | /* Offset from where switcher.S was compiled to where we've copied it */ |
60 | static unsigned long switcher_offset(void) | 60 | static unsigned long switcher_offset(void) |
61 | { | 61 | { |
62 | return SWITCHER_ADDR - (unsigned long)start_switcher_text; | 62 | return switcher_addr - (unsigned long)start_switcher_text; |
63 | } | 63 | } |
64 | 64 | ||
65 | /* This cpu's struct lguest_pages. */ | 65 | /* This cpu's struct lguest_pages (after the Switcher text page) */ |
66 | static struct lguest_pages *lguest_pages(unsigned int cpu) | 66 | static struct lguest_pages *lguest_pages(unsigned int cpu) |
67 | { | 67 | { |
68 | return &(((struct lguest_pages *) | 68 | return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); |
69 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | ||
70 | } | 69 | } |
71 | 70 | ||
72 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); | 71 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); |