[PATCH] kvm: userspace interface

web site: http://kvm.sourceforge.net mailing list: kvm-devel@lists.sourceforge.net (http://lists.sourceforge.net/lists/listinfo/kvm-devel) The following patchset adds a driver for Intel's hardware virtualization extensions to the x86 architecture. The driver adds a character device (/dev/kvm) that exposes the virtualization capabilities to userspace. Using this driver, a process can run a virtual machine (a "guest") in a fully virtualized PC containing its own virtual hard disks, network adapters, and display. Using this driver, one can start multiple virtual machines on a host. Each virtual machine is a process on the host; a virtual cpu is a thread in that process. kill(1), nice(1), top(1) work as expected. In effect, the driver adds a third execution mode to the existing two: we now have kernel mode, user mode, and guest mode. Guest mode has its own address space mapping guest physical memory (which is accessible to user mode by mmap()ing /dev/kvm). Guest mode has no access to any I/O devices; any such access is intercepted and directed to user mode for emulation. The driver supports i386 and x86_64 hosts and guests. All combinations are allowed except x86_64 guest on i386 host. For i386 guests and hosts, both pae and non-pae paging modes are supported. SMP hosts and UP guests are supported. At the moment only Intel hardware is supported, but AMD virtualization support is being worked on. Performance currently is non-stellar due to the naive implementation of the mmu virtualization, which throws away most of the shadow page table entries every context switch. We plan to address this in two ways: - cache shadow page tables across tlb flushes - wait until AMD and Intel release processors with nested page tables Currently a virtual desktop is responsive but consumes a lot of CPU. Under Windows I tried playing pinball and watching a few flash movies; with a recent CPU one can hardly feel the virtualization. 
Linux/X is slower, probably due to X being in a separate process. In addition to the driver, you need a slightly modified qemu to provide I/O device emulation and the BIOS. Caveats (akpm: might no longer be true): - The Windows install currently bluescreens due to a problem with the virtual APIC. We are working on a fix. A temporary workaround is to use an existing image or install through qemu - Windows 64-bit does not work. That's also true for qemu, so it's probably a problem with the device model. [bero@arklinux.org: build fix] [simon.kagstrom@bth.se: build fix, other fixes] [uril@qumranet.com: KVM: Expose interrupt bitmap] [akpm@osdl.org: i386 build fix] [mingo@elte.hu: i386 fixes] [rdreier@cisco.com: add log levels to all printks] [randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings] [anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support] Signed-off-by: Yaniv Kamay <yaniv@qumranet.com> Signed-off-by: Avi Kivity <avi@qumranet.com> Cc: Simon Kagstrom <simon.kagstrom@bth.se> Cc: Bernhard Rosenkraenzer <bero@arklinux.org> Signed-off-by: Uri Lublin <uril@qumranet.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Roland Dreier <rolandd@cisco.com> Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Signed-off-by: Anthony Liguori <anthony@codemonkey.ws> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Avi Kivity <avi@qumranet.com> 2006-12-10 05:21:36 -0500
committer: Linus Torvalds <torvalds@woody.osdl.org> 2006-12-10 12:57:22 -0500
commit: 6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7 (patch)
tree: 23fcbe6f4918cacdae26d513a2bd13e91d8b4c38 /drivers/kvm/paging_tmpl.h
parent: f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 (diff)
1 files changed, 397 insertions, 0 deletions
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..765c2e1a048e
--- /dev/null
+++ b/drivers/kvm/paging_tmpl.h
@@ -0,0 +1,397 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ *
+ * PTTYPE is expected to be defined as 32 or 64 before this file is
+ * included; any other value is a build error.  It selects the guest
+ * entry type (u32 or u64), the paging32_/paging64_ prefix that FNAME()
+ * puts on every function below, and the matching PT32_/PT64_ masks.
+ * SHADOW_PT_INDEX() maps to PT64_INDEX() for both values of PTTYPE.
+ */
+#if PTTYPE == 64
+        #define pt_element_t u64
+        #define guest_walker guest_walker64
+        #define FNAME(name) paging##64_##name
+        #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+        #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+        #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+        #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
+        #define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
+#elif PTTYPE == 32
+        #define pt_element_t u32
+        #define guest_walker guest_walker32
+        #define FNAME(name) paging##32_##name
+        #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+        #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+        #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
+        #define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
+#else
+        #error Invalid PTTYPE value
+#endif
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+        int level;                  /* paging level the walk is currently at */
+        pt_element_t *table;        /* kmap_atomic()ed guest table for that level */
+        pt_element_t inherited_ar;  /* bits ANDed in from entries descended through */
+};
+/*
+ * Prepare a walker for a fresh walk of the guest page tables.
+ *
+ * The walk starts at vcpu->mmu.root_level.  The guest's top-level table,
+ * located through vcpu->cr3, is mapped with kmap_atomic(KM_USER0) and the
+ * inherited access bits start out as user + writable.  Each call must be
+ * paired with release_walker(), which drops the mapping.
+ */
+static void FNAME(init_walker)(struct guest_walker *walker,
+                               struct kvm_vcpu *vcpu)
+{
+        hpa_t hpa;
+        walker->level = vcpu->mmu.root_level;
+        hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
+        walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
+        /*
+         * Only a PAE guest outside long mode may have cr3 bits set beyond
+         * the page frame and the cr3 flag bits.
+         */
+        ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) ||
+               (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
+        /* Carry those sub-page cr3 bits over into the mapped pointer. */
+        walker->table = (pt_element_t *)( (unsigned long)walker->table |
+                (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
+        walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
+}
+static void FNAME(release_walker)(struct guest_walker *walker)
+{       /* Drop the KM_USER0 atomic mapping the walker holds in ->table. */
+        kunmap_atomic(walker->table, KM_USER0);
+}
+/*
+ * Build the shadow pte for a guest pte.
+ *
+ * The shadow entry must still be zero.  It is seeded with the guest bits
+ * selected by PT_PTE_COPY_MASK and then handed to set_pte_common() along
+ * with the guest frame address, the guest dirty bit, and the access bits
+ * permitted by both @access_bits and the guest pte itself.
+ */
+static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
+                           u64 *shadow_pte, u64 access_bits)
+{
+        u64 guest_frame = guest_pte & PT_BASE_ADDR_MASK;
+        u64 guest_dirty = guest_pte & PT_DIRTY_MASK;
+        u64 allowed = access_bits & guest_pte;
+        ASSERT(*shadow_pte == 0);
+        *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
+        set_pte_common(vcpu, shadow_pte, guest_frame, guest_dirty, allowed);
+}
+/*
+ * Build the shadow pte for one page of a guest large-page directory entry.
+ *
+ * @index is the page's position inside the large page; the guest address
+ * is the pde's directory base plus @index pages, extended with the PSE36
+ * bits for 32-bit guests when is_cpuid_PSE36() holds.  The shadow entry
+ * must still be zero.  It receives the guest bits selected by
+ * PT_NON_PTE_COPY_MASK and PT_GLOBAL_MASK, with the directory PAT bit
+ * moved to the pte PAT position, before set_pte_common() is called.
+ */
+static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
+                           u64 *shadow_pte, u64 access_bits,
+                           int index)
+{
+        u64 copied_bits = guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK);
+        u64 pat_bit = (guest_pde & PT_DIR_PAT_MASK) >>
+                        (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT);
+        gpa_t gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
+        ASSERT(*shadow_pte == 0);
+        if (PTTYPE == 32 && is_cpuid_PSE36())
+                gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
+                        (32 - PT32_DIR_PSE36_SHIFT);
+        *shadow_pte = copied_bits | pat_bit;
+        set_pte_common(vcpu, shadow_pte, gaddr,
+                       guest_pde & PT_DIRTY_MASK, access_bits & guest_pde);
+}
+/*
+ * Fetch a guest pte from a specific level in the paging hierarchy.
+ *
+ * The walk continues from wherever @walker currently is and descends one
+ * level at a time towards @level.  It returns a pointer into the mapped
+ * table page (walker->table) to the entry at which the walk stopped,
+ * which is the first of:
+ *   - the entry at the requested level
+ *   - an entry that is not present
+ *   - a directory-level entry with PT_PAGE_SIZE_MASK set, when PTTYPE is
+ *     64 or is_pse(vcpu) holds
+ *
+ * On return walker->level is the level the walk stopped at, so callers
+ * must check both it and the present bit of the returned entry.  Each
+ * entry descended through is ANDed into walker->inherited_ar, except at
+ * level 3 when the vcpu is not in long mode.  Descending unmaps the old
+ * table page and maps the next one, so a previously returned pointer is
+ * stale once the walker advances or is released.
+ */
+static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
+                                        struct guest_walker *walker,
+                                        int level,
+                                        gva_t addr)
+{
+        ASSERT(level > 0  && level <= walker->level);
+        for (;;) {
+                int index = PT_INDEX(addr, walker->level);
+                hpa_t paddr;
+                ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
+                       ((unsigned long)&walker->table[index] & PAGE_MASK));
+                if (level == walker->level ||
+                    !is_present_pte(walker->table[index]) ||
+                    (walker->level == PT_DIRECTORY_LEVEL &&
+                     (walker->table[index] & PT_PAGE_SIZE_MASK) &&
+                     (PTTYPE == 64 || is_pse(vcpu))))
+                        return &walker->table[index];
+                if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu))
+                        walker->inherited_ar &= walker->table[index];
+                paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
+                kunmap_atomic(walker->table, KM_USER0);
+                walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
+                                            KM_USER0);
+                --walker->level;
+        }
+}
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ *
+ * Walks the shadow page tables for @addr from vcpu->mmu.root_hpa and
+ * vcpu->mmu.shadow_root_level down to PT_PAGE_TABLE_LEVEL.  Shadow entries
+ * that are already present (or are io ptes) are followed as they are;
+ * missing ones are built from the guest entry found through @walker.
+ *
+ * Returns:
+ *   - a pointer to the page-table-level shadow pte for @addr
+ *   - NULL if the guest entry needed to go on is not present
+ *   - ERR_PTR(-ENOMEM) if kvm_mmu_alloc_page() returned no valid page
+ *
+ * Side effects on the guest tables: PT_ACCESSED_MASK is set in every guest
+ * entry consulted, except when both the guest root level and the walker
+ * level are 3.  A guest large page is shadowed one page at a time through
+ * set_pde(), and the parent shadow entry (if any was visited) is tagged
+ * with PT_SHADOW_PS_MARK.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+                              struct guest_walker *walker)
+{
+        hpa_t shadow_addr;
+        int level;
+        u64 *prev_shadow_ent = NULL;
+        shadow_addr = vcpu->mmu.root_hpa;
+        level = vcpu->mmu.shadow_root_level;
+        for (; ; level--) {
+                u32 index = SHADOW_PT_INDEX(addr, level);
+                u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+                pt_element_t *guest_ent;
+                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
+                        if (level == PT_PAGE_TABLE_LEVEL)
+                                return shadow_ent;
+                        shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+                        prev_shadow_ent = shadow_ent;
+                        continue;
+                }
+                if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
+                        ASSERT(level == PT32E_ROOT_LEVEL);
+                        guest_ent = FNAME(fetch_guest)(vcpu, walker,
+                                                       PT32_ROOT_LEVEL, addr);
+                } else
+                        guest_ent = FNAME(fetch_guest)(vcpu, walker,
+                                                       level, addr);
+                if (!is_present_pte(*guest_ent))
+                        return NULL;
+                /* Don't set accessed bit on PAE PDPTRs */
+                if (vcpu->mmu.root_level != 3 || walker->level != 3)
+                        *guest_ent |= PT_ACCESSED_MASK;
+                if (level == PT_PAGE_TABLE_LEVEL) {
+                        if (walker->level == PT_DIRECTORY_LEVEL) {
+                                if (prev_shadow_ent)
+                                        *prev_shadow_ent |= PT_SHADOW_PS_MARK;
+                                FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
+                                               walker->inherited_ar,
+                                          PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
+                        } else {
+                                ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
+                                FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
+                        }
+                        return shadow_ent;
+                }
+                shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
+                if (!VALID_PAGE(shadow_addr))
+                        return ERR_PTR(-ENOMEM);
+                if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3)
+                        *shadow_ent = shadow_addr |
+                                (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
+                else {
+                        *shadow_ent = shadow_addr |
+                                (*guest_ent & PT_NON_PTE_COPY_MASK);
+                        *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
+                }
+                prev_shadow_ent = shadow_ent;
+        }
+}
+/*
+ * The guest faulted for write.  We need to
+ *
+ * - check write permissions
+ * - update the guest pte dirty bit
+ * - update our own dirty page tracking structures
+ *
+ * @shadow_ent is the shadow pte covering @addr, @walker is the guest
+ * walker used for this fault and @user is non-zero for a user-mode access.
+ *
+ * Returns 1 if the shadow pte was made writable.  Returns 0 if nothing was
+ * fixed: the shadow pte was already writable, the access is not permitted,
+ * or the guest entry is no longer present (the shadow pte is then cleared).
+ */
+static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
+                               u64 *shadow_ent,
+                               struct guest_walker *walker,
+                               gva_t addr,
+                               int user)
+{
+        pt_element_t *guest_ent;
+        int writable_shadow;
+        gfn_t gfn;
+        if (is_writeble_pte(*shadow_ent))
+                return 0;
+        /* Reduce to 0/1 explicitly so no mask bit is lost narrowing to int. */
+        writable_shadow = (*shadow_ent & PT_SHADOW_WRITABLE_MASK) != 0;
+        if (user) {
+                /*
+                 * User mode access.  Fail if it's a kernel page or a read-only
+                 * page.
+                 */
+                if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
+                        return 0;
+                ASSERT(*shadow_ent & PT_USER_MASK);
+        } else if (!writable_shadow) {
+                /*
+                 * Kernel mode access.  Fail if it's a read-only page and
+                 * supervisor write protection is enabled.
+                 */
+                if (is_write_protection(vcpu))
+                        return 0;
+                /*
+                 * The entry is about to become writable only because the
+                 * supervisor may ignore read-only pages; remove user access.
+                 */
+                *shadow_ent &= ~PT_USER_MASK;
+        }
+        guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);
+        if (!is_present_pte(*guest_ent)) {
+                *shadow_ent = 0;
+                return 0;
+        }
+        if (walker->level == PT_DIRECTORY_LEVEL) {
+                /*
+                 * The walk stopped on a present directory entry, which
+                 * fetch_guest() only does for a guest large page.  The page
+                 * written is the one @addr selects inside it, so build its
+                 * address the same way set_pde() does.
+                 */
+                u64 guest_pde = *guest_ent;
+                gpa_t gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) +
+                        PAGE_SIZE * PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+                if (PTTYPE == 32 && is_cpuid_PSE36())
+                        gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
+                                (32 - PT32_DIR_PSE36_SHIFT);
+                gfn = gaddr >> PAGE_SHIFT;
+        } else
+                gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+        mark_page_dirty(vcpu->kvm, gfn);
+        *shadow_ent |= PT_WRITABLE_MASK;
+        *guest_ent |= PT_DIRTY_MASK;
+        return 1;
+}
+/*
+ * Page fault handler.  There are several causes for a page fault:
+ *   - there is no shadow pte for the guest pte
+ *   - write access through a shadow pte marked read only so that we can set
+ *     the dirty bit
+ *   - write access to a shadow pte marked read only so we can update the page
+ *     dirty bitmap, when userspace requests it
+ *   - mmio access; in this case we will never install a present shadow pte
+ *   - normal guest page fault due to the guest pte marked not present, not
+ *     writable, or not executable
+ *
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+                               u32 error_code)
+{
+        int write_fault = error_code & PFERR_WRITE_MASK;
+        int pte_present = error_code & PFERR_PRESENT_MASK;
+        int user_fault = error_code & PFERR_USER_MASK;
+        struct guest_walker walker;
+        u64 *shadow_pte;
+        int fixed;
+        /*
+         * Look up the shadow pte for the faulting address.  When fetch()
+         * reports an error, call nonpaging_flush(), release the walker and
+         * start over with a freshly initialized one.
+         */
+        for (;;) {
+                FNAME(init_walker)(&walker, vcpu);
+                shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
+                if (IS_ERR(shadow_pte)) {  /* must be -ENOMEM */
+                        nonpaging_flush(vcpu);
+                        FNAME(release_walker)(&walker);
+                        continue;
+                }
+                break;
+        }
+        /*
+         * The page is not mapped by the guest.  Let the guest handle it.
+         */
+        if (!shadow_pte) {
+                inject_page_fault(vcpu, addr, error_code);
+                FNAME(release_walker)(&walker);
+                return 0;
+        }
+        /*
+         * Update the shadow pte.
+         */
+        if (write_fault)
+                fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
+                                            user_fault);
+        else
+                fixed = fix_read_pf(shadow_pte);
+        FNAME(release_walker)(&walker);
+        /*
+         * mmio: emulate if accessible, otherwise it's a guest fault.
+         */
+        if (is_io_pte(*shadow_pte)) {
+                if (may_access(*shadow_pte, write_fault, user_fault))
+                        return 1;
+                pgprintk("%s: io work, no access\n", __FUNCTION__);
+                inject_page_fault(vcpu, addr,
+                                  error_code | PFERR_PRESENT_MASK);
+                return 0;
+        }
+        /*
+         * The error code says the pte was present and we could not fix the
+         * fault above, so it is a genuine guest page fault.
+         */
+        if (pte_present && !fixed) {
+                inject_page_fault(vcpu, addr, error_code);
+                return 0;
+        }
+        ++kvm_stat.pf_fixed;
+        return 0;
+}
+/*
+ * Translate a guest virtual address into a guest physical address by
+ * walking the guest's own page tables.
+ *
+ * Returns UNMAPPED_GVA when the walk ends on an entry that is not present.
+ * A walk that ends at the directory level is treated as a large page: the
+ * low bits of @vaddr below the directory level supply the offset, and the
+ * PSE36 bits are added for 32-bit guests when is_cpuid_PSE36() holds.
+ */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+        struct guest_walker walker;
+        pt_element_t guest_pte;
+        gpa_t result;
+        FNAME(init_walker)(&walker, vcpu);
+        /* Copy the entry out by value; its mapping is dropped right below. */
+        guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
+                                        vaddr);
+        FNAME(release_walker)(&walker);
+        if (!is_present_pte(guest_pte))
+                return UNMAPPED_GVA;
+        if (walker.level != PT_DIRECTORY_LEVEL) {
+                /* Frame from the pte, offset from the virtual address. */
+                result = (guest_pte & PT_BASE_ADDR_MASK);
+                result |= (vaddr & ~PAGE_MASK);
+                return result;
+        }
+        ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
+        ASSERT(PTTYPE == 64 || is_pse(vcpu));
+        result = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr &
+                (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));
+        if (PTTYPE == 32 && is_cpuid_PSE36())
+                result |= (guest_pte & PT32_DIR_PSE36_MASK) <<
+                                (32 - PT32_DIR_PSE36_SHIFT);
+        return result;
+}
+#undef pt_element_t            /* Remove every name defined by the PTTYPE  */
+#undef guest_walker            /* block at the top of this file, so it can */
+#undef FNAME                   /* be compiled again for the other pte size. */
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_PTE_COPY_MASK
+#undef PT_NON_PTE_COPY_MASK
+#undef PT_DIR_BASE_ADDR_MASK
author	Avi Kivity <avi@qumranet.com>	2006-12-10 05:21:36 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-10 12:57:22 -0500
commit	6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7 (patch)
tree	23fcbe6f4918cacdae26d513a2bd13e91d8b4c38 /drivers/kvm/paging_tmpl.h
parent	f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 (diff)

diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h new file mode 100644 index 000000000000..765c2e1a048e --- /dev/null +++ b/drivers/kvm/paging_tmpl.h
@@ -0,0 +1,397 @@
	1	/*
	2	* Kernel-based Virtual Machine driver for Linux
	3	*
	4	* This module enables machines with Intel VT-x extensions to run virtual
	5	* machines without emulation or binary translation.
	6	*
	7	* MMU support
	8	*
	9	* Copyright (C) 2006 Qumranet, Inc.
	10	*
	11	* Authors:
	12	* Yaniv Kamay <yaniv@qumranet.com>
	13	* Avi Kivity <avi@qumranet.com>
	14	*
	15	* This work is licensed under the terms of the GNU GPL, version 2. See
	16	* the COPYING file in the top-level directory.
	17	*
	18	*/
	19
	20	/*
	21	* We need the mmu code to access both 32-bit and 64-bit guest ptes,
	22	* so the code in this file is compiled twice, once per pte size.
	23	*/
	24
	25	#if PTTYPE == 64
	26	#define pt_element_t u64
	27	#define guest_walker guest_walker64
	28	#define FNAME(name) paging##64_##name
	29	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	30	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
	31	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	32	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	33	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
	34	#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
	35	#define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
	36	#elif PTTYPE == 32
	37	#define pt_element_t u32
	38	#define guest_walker guest_walker32
	39	#define FNAME(name) paging##32_##name
	40	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	41	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
	42	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	43	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	44	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
	45	#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
	46	#define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
	47	#else
	48	#error Invalid PTTYPE value
	49	#endif
	50
	51	/*
	52	* The guest_walker structure emulates the behavior of the hardware page
	53	* table walker.
	54	*/
	55	struct guest_walker {
	56	int level;
	57	pt_element_t *table;
	58	pt_element_t inherited_ar;
	59	};
	60
	61	static void FNAME(init_walker)(struct guest_walker *walker,
	62	struct kvm_vcpu *vcpu)
	63	{
	64	hpa_t hpa;
	65	struct kvm_memory_slot *slot;
	66
	67	walker->level = vcpu->mmu.root_level;
	68	slot = gfn_to_memslot(vcpu->kvm,
	69	(vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	70	hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
	71	walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
	72
	73	ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) ||
	74	(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
	75
	76	walker->table = (pt_element_t *)( (unsigned long)walker->table |
	77	(unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
	78	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
	79	}
	80
	81	static void FNAME(release_walker)(struct guest_walker *walker)
	82	{
	83	kunmap_atomic(walker->table, KM_USER0);
	84	}
	85
	86	static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
	87	u64 *shadow_pte, u64 access_bits)
	88	{
	89	ASSERT(*shadow_pte == 0);
	90	access_bits &= guest_pte;
	91	*shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
	92	set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
	93	guest_pte & PT_DIRTY_MASK, access_bits);
	94	}
	95
	96	static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
	97	u64 *shadow_pte, u64 access_bits,
	98	int index)
	99	{
	100	gpa_t gaddr;
	101
	102	ASSERT(*shadow_pte == 0);
	103	access_bits &= guest_pde;
	104	gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
	105	if (PTTYPE == 32 && is_cpuid_PSE36())
	106	gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
	107	(32 - PT32_DIR_PSE36_SHIFT);
	108	*shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) |
	109	((guest_pde & PT_DIR_PAT_MASK) >>
	110	(PT_DIR_PAT_SHIFT - PT_PAT_SHIFT));
	111	set_pte_common(vcpu, shadow_pte, gaddr,
	112	guest_pde & PT_DIRTY_MASK, access_bits);
	113	}
	114
	115	/*
	116	* Fetch a guest pte from a specific level in the paging hierarchy.
	117	*/
	118	static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
	119	struct guest_walker *walker,
	120	int level,
	121	gva_t addr)
	122	{
	123
	124	ASSERT(level > 0 && level <= walker->level);
	125
	126	for (;;) {
	127	int index = PT_INDEX(addr, walker->level);
	128	hpa_t paddr;
	129
	130	ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
	131	((unsigned long)&walker->table[index] & PAGE_MASK));
	132	if (level == walker->level ||
	133	!is_present_pte(walker->table[index]) ||
	134	(walker->level == PT_DIRECTORY_LEVEL &&
	135	(walker->table[index] & PT_PAGE_SIZE_MASK) &&
	136	(PTTYPE == 64 || is_pse(vcpu))))
	137	return &walker->table[index];
	138	if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu))
	139	walker->inherited_ar &= walker->table[index];
	140	paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
	141	kunmap_atomic(walker->table, KM_USER0);
	142	walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
	143	KM_USER0);
	144	--walker->level;
	145	}
	146	}
	147
	148	/*
	149	* Fetch a shadow pte for a specific level in the paging hierarchy.
	150	*/
	151	static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
	152	struct guest_walker *walker)
	153	{
	154	hpa_t shadow_addr;
	155	int level;
	156	u64 *prev_shadow_ent = NULL;
	157
	158	shadow_addr = vcpu->mmu.root_hpa;
	159	level = vcpu->mmu.shadow_root_level;
	160
	161	for (; ; level--) {
	162	u32 index = SHADOW_PT_INDEX(addr, level);
	163	u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
	164	pt_element_t *guest_ent;
	165
	166	if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
	167	if (level == PT_PAGE_TABLE_LEVEL)
	168	return shadow_ent;
	169	shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
	170	prev_shadow_ent = shadow_ent;
	171	continue;
	172	}
	173
	174	if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
	175	ASSERT(level == PT32E_ROOT_LEVEL);
	176	guest_ent = FNAME(fetch_guest)(vcpu, walker,
	177	PT32_ROOT_LEVEL, addr);
	178	} else
	179	guest_ent = FNAME(fetch_guest)(vcpu, walker,
	180	level, addr);
	181
	182	if (!is_present_pte(*guest_ent))
	183	return NULL;
	184
	185	/* Don't set accessed bit on PAE PDPTRs */
	186	if (vcpu->mmu.root_level != 3 || walker->level != 3)
	187	*guest_ent |= PT_ACCESSED_MASK;
	188
	189	if (level == PT_PAGE_TABLE_LEVEL) {
	190
	191	if (walker->level == PT_DIRECTORY_LEVEL) {
	192	if (prev_shadow_ent)
	193	*prev_shadow_ent |= PT_SHADOW_PS_MARK;
	194	FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
	195	walker->inherited_ar,
	196	PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
	197	} else {
	198	ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
	199	FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
	200	}
	201	return shadow_ent;
	202	}
	203
	204	shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
	205	if (!VALID_PAGE(shadow_addr))
	206	return ERR_PTR(-ENOMEM);
	207	if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3)
	208	*shadow_ent = shadow_addr |
	209	(*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
	210	else {
	211	*shadow_ent = shadow_addr |
	212	(*guest_ent & PT_NON_PTE_COPY_MASK);
	213	*shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
	214	}
	215	prev_shadow_ent = shadow_ent;
	216	}
	217	}
	218
	219	/*
	220	* The guest faulted for write. We need to
	221	*
	222	* - check write permissions
	223	* - update the guest pte dirty bit
	224	* - update our own dirty page tracking structures
	225	*/
	226	static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
	227	u64 *shadow_ent,
	228	struct guest_walker *walker,
	229	gva_t addr,
	230	int user)
	231	{
	232	pt_element_t *guest_ent;
	233	int writable_shadow;
	234	gfn_t gfn;
	235
	236	if (is_writeble_pte(*shadow_ent))
	237	return 0;
	238
	239	writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
	240	if (user) {
	241	/*
	242	* User mode access. Fail if it's a kernel page or a read-only
	243	* page.
	244	*/
	245	if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
	246	return 0;
	247	ASSERT(*shadow_ent & PT_USER_MASK);
	248	} else
	249	/*
	250	* Kernel mode access. Fail if it's a read-only page and
	251	* supervisor write protection is enabled.
	252	*/
	253	if (!writable_shadow) {
	254	if (is_write_protection(vcpu))
	255	return 0;
	256	*shadow_ent &= ~PT_USER_MASK;
	257	}
	258
	259	guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);
	260
	261	if (!is_present_pte(*guest_ent)) {
	262	*shadow_ent = 0;
	263	return 0;
	264	}
	265
	266	gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
	267	mark_page_dirty(vcpu->kvm, gfn);
	268	*shadow_ent |= PT_WRITABLE_MASK;
	269	*guest_ent |= PT_DIRTY_MASK;
	270
	271	return 1;
	272	}
	273
	274	/*
	275	* Page fault handler. There are several causes for a page fault:
	276	* - there is no shadow pte for the guest pte
	277	* - write access through a shadow pte marked read only so that we can set
	278	* the dirty bit
	279	* - write access to a shadow pte marked read only so we can update the page
	280	* dirty bitmap, when userspace requests it
	281	* - mmio access; in this case we will never install a present shadow pte
	282	* - normal guest page fault due to the guest pte marked not present, not
	283	* writable, or not executable
	284	*
	285	* Returns: 1 if we need to emulate the instruction, 0 otherwise
	286	*/
	287	static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
	288	u32 error_code)
	289	{
	290	int write_fault = error_code & PFERR_WRITE_MASK;
	291	int pte_present = error_code & PFERR_PRESENT_MASK;
	292	int user_fault = error_code & PFERR_USER_MASK;
	293	struct guest_walker walker;
	294	u64 *shadow_pte;
	295	int fixed;
	296
	297	/*
	298	* Look up the shadow pte for the faulting address.
	299	*/
	300	for (;;) {
	301	FNAME(init_walker)(&walker, vcpu);
	302	shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
	303	if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
	304	nonpaging_flush(vcpu);
	305	FNAME(release_walker)(&walker);
	306	continue;
	307	}
	308	break;
	309	}
	310
	311	/*
	312	* The page is not mapped by the guest. Let the guest handle it.
	313	*/
	314	if (!shadow_pte) {
	315	inject_page_fault(vcpu, addr, error_code);
	316	FNAME(release_walker)(&walker);
	317	return 0;
	318	}
	319
	320	/*
	321	* Update the shadow pte.
	322	*/
	323	if (write_fault)
	324	fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
	325	user_fault);
	326	else
	327	fixed = fix_read_pf(shadow_pte);
	328
	329	FNAME(release_walker)(&walker);
	330
	331	/*
	332	* mmio: emulate if accessible, otherwise its a guest fault.
	333	*/
	334	if (is_io_pte(*shadow_pte)) {
	335	if (may_access(*shadow_pte, write_fault, user_fault))
	336	return 1;
	337	pgprintk("%s: io work, no access\n", __FUNCTION__);
	338	inject_page_fault(vcpu, addr,
	339	error_code | PFERR_PRESENT_MASK);
	340	return 0;
	341	}
	342
	343	/*
	344	* pte not present, guest page fault.
	345	*/
	346	if (pte_present && !fixed) {
	347	inject_page_fault(vcpu, addr, error_code);
	348	return 0;
	349	}
	350
	351	++kvm_stat.pf_fixed;
	352
	353	return 0;
	354	}
	355
	356	static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
	357	{
	358	struct guest_walker walker;
	359	pt_element_t guest_pte;
	360	gpa_t gpa;
	361
	362	FNAME(init_walker)(&walker, vcpu);
	363	guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
	364	vaddr);
	365	FNAME(release_walker)(&walker);
	366
	367	if (!is_present_pte(guest_pte))
	368	return UNMAPPED_GVA;
	369
	370	if (walker.level == PT_DIRECTORY_LEVEL) {
	371	ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
	372	ASSERT(PTTYPE == 64 || is_pse(vcpu));
	373
	374	gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr &
	375	(PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));
	376
	377	if (PTTYPE == 32 && is_cpuid_PSE36())
	378	gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
	379	(32 - PT32_DIR_PSE36_SHIFT);
	380	} else {
	381	gpa = (guest_pte & PT_BASE_ADDR_MASK);
	382	gpa |= (vaddr & ~PAGE_MASK);
	383	}
	384
	385	return gpa;
	386	}
	387
	388	#undef pt_element_t
	389	#undef guest_walker
	390	#undef FNAME
	391	#undef PT_BASE_ADDR_MASK
	392	#undef PT_INDEX
	393	#undef SHADOW_PT_INDEX
	394	#undef PT_LEVEL_MASK
	395	#undef PT_PTE_COPY_MASK
	396	#undef PT_NON_PTE_COPY_MASK
	397	#undef PT_DIR_BASE_ADDR_MASK