author	Avi Kivity <avi@qumranet.com>	2006-12-10 05:21:36 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-10 12:57:22 -0500
commit	6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7
tree	23fcbe6f4918cacdae26d513a2bd13e91d8b4c38 /drivers/kvm/paging_tmpl.h
parent	f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7
[PATCH] kvm: userspace interface
web site: http://kvm.sourceforge.net
mailing list: kvm-devel@lists.sourceforge.net
  (http://lists.sourceforge.net/lists/listinfo/kvm-devel)

The following patchset adds a driver for Intel's hardware virtualization
extensions to the x86 architecture.  The driver adds a character device
(/dev/kvm) that exposes the virtualization capabilities to userspace.  Using
this driver, a process can run a virtual machine (a "guest") in a fully
virtualized PC containing its own virtual hard disks, network adapters, and
display.

Using this driver, one can start multiple virtual machines on a host.

Each virtual machine is a process on the host; a virtual cpu is a thread in
that process.  kill(1), nice(1), top(1) work as expected.

In effect, the driver adds a third execution mode to the existing two: we now
have kernel mode, user mode, and guest mode.  Guest mode has its own address
space mapping guest physical memory (which is accessible to user mode by
mmap()ing /dev/kvm).  Guest mode has no access to any I/O devices; any such
access is intercepted and directed to user mode for emulation.

The driver supports i386 and x86_64 hosts and guests.  All combinations are
allowed except x86_64 guest on i386 host.  For i386 guests and hosts, both pae
and non-pae paging modes are supported.

SMP hosts and UP guests are supported.  At the moment only Intel hardware is
supported, but AMD virtualization support is being worked on.

Performance currently is non-stellar due to the naive implementation of the
mmu virtualization, which throws away most of the shadow page table entries
every context switch.  We plan to address this in two ways:

- cache shadow page tables across tlb flushes
- wait until AMD and Intel release processors with nested page tables

Currently a virtual desktop is responsive but consumes a lot of CPU.  Under
Windows I tried playing pinball and watching a few flash movies; with a recent
CPU one can hardly feel the virtualization.  Linux/X is slower, probably due
to X being in a separate process.

In addition to the driver, you need a slightly modified qemu to provide I/O
device emulation and the BIOS.

Caveats (akpm: might no longer be true):

- The Windows install currently bluescreens due to a problem with the virtual
  APIC.  We are working on a fix.  A temporary workaround is to use an
  existing image or install through qemu
- Windows 64-bit does not work.  That's also true for qemu, so it's probably
  a problem with the device model.

[bero@arklinux.org: build fix]
[simon.kagstrom@bth.se: build fix, other fixes]
[uril@qumranet.com: KVM: Expose interrupt bitmap]
[akpm@osdl.org: i386 build fix]
[mingo@elte.hu: i386 fixes]
[rdreier@cisco.com: add log levels to all printks]
[randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings]
[anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support]
Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Cc: Simon Kagstrom <simon.kagstrom@bth.se>
Cc: Bernhard Rosenkraenzer <bero@arklinux.org>
Signed-off-by: Uri Lublin <uril@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
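As a concrete sketch of the interface described above, a minimal userspace
client looks roughly like this.  This uses the ioctl surface as it later
stabilized (KVM_CREATE_VM, KVM_CREATE_VCPU, KVM_RUN, and separate vm/vcpu
file descriptors); the initial patch multiplexed these operations over the
/dev/kvm fd itself, so treat the names below as illustrative rather than as
this patch's exact ABI.  Error checking on the vm/vcpu fds is elided.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm, vm, vcpu;

		kvm = open("/dev/kvm", O_RDWR);		/* the capability device */
		if (kvm < 0) {
			perror("/dev/kvm");
			return 1;
		}
		vm = ioctl(kvm, KVM_CREATE_VM, 0);	/* a VM is owned by a process */
		vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);	/* a vcpu maps to a host thread */
		/* ... set up guest memory and registers, then loop on ... */
		ioctl(vcpu, KVM_RUN, 0);
		return 0;
	}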
Diffstat (limited to 'drivers/kvm/paging_tmpl.h')
-rw-r--r--	drivers/kvm/paging_tmpl.h	397
1 file changed, 397 insertions(+), 0 deletions(-)
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..765c2e1a048e
--- /dev/null
+++ b/drivers/kvm/paging_tmpl.h
@@ -0,0 +1,397 @@
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
	#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
	#define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
	#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
	#define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
#else
	#error Invalid PTTYPE value
#endif

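/*
 * For reference, the includer (mmu.c in this same patchset) instantiates
 * the template for both pte widths along these lines:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * Note that SHADOW_PT_INDEX maps to PT64_INDEX in both instantiations:
 * the shadow page tables always use 64-bit entries, whatever the guest's
 * pte size.
 */
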
/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	pt_element_t *table;
	pt_element_t inherited_ar;
};

static void FNAME(init_walker)(struct guest_walker *walker,
			       struct kvm_vcpu *vcpu)
{
	hpa_t hpa;
	struct kvm_memory_slot *slot;

	walker->level = vcpu->mmu.root_level;
	slot = gfn_to_memslot(vcpu->kvm,
			      (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
	walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);

	ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);

	walker->table = (pt_element_t *)( (unsigned long)walker->table |
		(unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
}

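/*
 * Note on the pointer arithmetic above: in 32-bit pae mode cr3 is only
 * 32-byte aligned and points at four pdptes somewhere inside a page, so
 * the low cr3 bits are folded back into the kmap'ed table pointer.  In
 * every other mode those bits must be zero, which is what the ASSERT
 * checks.
 */
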
static void FNAME(release_walker)(struct guest_walker *walker)
{
	kunmap_atomic(walker->table, KM_USER0);
}

static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
			   u64 *shadow_pte, u64 access_bits)
{
	ASSERT(*shadow_pte == 0);
	access_bits &= guest_pte;
	*shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
	set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
		       guest_pte & PT_DIRTY_MASK, access_bits);
}

static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
			   u64 *shadow_pte, u64 access_bits,
			   int index)
{
	gpa_t gaddr;

	ASSERT(*shadow_pte == 0);
	access_bits &= guest_pde;
	gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
	if (PTTYPE == 32 && is_cpuid_PSE36())
		gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
			(32 - PT32_DIR_PSE36_SHIFT);
	*shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) |
		((guest_pde & PT_DIR_PAT_MASK) >>
		 (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT));
	set_pte_common(vcpu, shadow_pte, gaddr,
		       guest_pde & PT_DIRTY_MASK, access_bits);
}

/*
 * Fetch a guest pte from a specific level in the paging hierarchy.
 */
static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
					struct guest_walker *walker,
					int level,
					gva_t addr)
{

	ASSERT(level > 0 && level <= walker->level);

	for (;;) {
		int index = PT_INDEX(addr, walker->level);
		hpa_t paddr;

		ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
		       ((unsigned long)&walker->table[index] & PAGE_MASK));
		if (level == walker->level ||
		    !is_present_pte(walker->table[index]) ||
		    (walker->level == PT_DIRECTORY_LEVEL &&
		     (walker->table[index] & PT_PAGE_SIZE_MASK) &&
		     (PTTYPE == 64 || is_pse(vcpu))))
			return &walker->table[index];
		if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu))
			walker->inherited_ar &= walker->table[index];
		paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
		kunmap_atomic(walker->table, KM_USER0);
		walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
					    KM_USER0);
		--walker->level;
	}
}

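/*
 * Worked example (illustration, not part of the patch): in the 64-bit
 * instantiation PT_INDEX decodes nine index bits per level above the
 * 12-bit page offset, so a hypothetical helper would be:
 *
 *	static unsigned pt64_index(unsigned long addr, int level)
 *	{
 *		return (addr >> (12 + 9 * (level - 1))) & 0x1ff;
 *	}
 *
 * i.e. level 1 takes bits 12..20, level 2 bits 21..29, and so on.
 */
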
/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *walker)
{
	hpa_t shadow_addr;
	int level;
	u64 *prev_shadow_ent = NULL;

	shadow_addr = vcpu->mmu.root_hpa;
	level = vcpu->mmu.shadow_root_level;

	for (; ; level--) {
		u32 index = SHADOW_PT_INDEX(addr, level);
		u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
		pt_element_t *guest_ent;

		if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
			if (level == PT_PAGE_TABLE_LEVEL)
				return shadow_ent;
			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
			prev_shadow_ent = shadow_ent;
			continue;
		}

		if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
			ASSERT(level == PT32E_ROOT_LEVEL);
			guest_ent = FNAME(fetch_guest)(vcpu, walker,
						       PT32_ROOT_LEVEL, addr);
		} else
			guest_ent = FNAME(fetch_guest)(vcpu, walker,
						       level, addr);

		if (!is_present_pte(*guest_ent))
			return NULL;

		/* Don't set accessed bit on PAE PDPTRs */
		if (vcpu->mmu.root_level != 3 || walker->level != 3)
			*guest_ent |= PT_ACCESSED_MASK;

		if (level == PT_PAGE_TABLE_LEVEL) {

			if (walker->level == PT_DIRECTORY_LEVEL) {
				if (prev_shadow_ent)
					*prev_shadow_ent |= PT_SHADOW_PS_MARK;
				FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
					       walker->inherited_ar,
					       PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
			} else {
				ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
				FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
			}
			return shadow_ent;
		}

		shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
		if (!VALID_PAGE(shadow_addr))
			return ERR_PTR(-ENOMEM);
		if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3)
			*shadow_ent = shadow_addr |
				(*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
		else {
			*shadow_ent = shadow_addr |
				(*guest_ent & PT_NON_PTE_COPY_MASK);
			*shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
		}
		prev_shadow_ent = shadow_ent;
	}
}

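/*
 * Overview of the walk above: the shadow and guest hierarchies are
 * descended in lockstep.  Present shadow entries are simply followed;
 * at the first non-present level the matching guest entry is fetched,
 * a shadow page is allocated and linked, and the non-leaf shadow copy
 * is widened to writable+user (except the pae root level, which keeps
 * only present/pwt/pcd) so that permissions are enforced at the leaf.
 */
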
/*
 * The guest faulted for write.  We need to
 *
 * - check write permissions
 * - update the guest pte dirty bit
 * - update our own dirty page tracking structures
 */
static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
			       u64 *shadow_ent,
			       struct guest_walker *walker,
			       gva_t addr,
			       int user)
{
	pt_element_t *guest_ent;
	int writable_shadow;
	gfn_t gfn;

	if (is_writeble_pte(*shadow_ent))
		return 0;

	writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
	if (user) {
		/*
		 * User mode access.  Fail if it's a kernel page or a read-only
		 * page.
		 */
		if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
			return 0;
		ASSERT(*shadow_ent & PT_USER_MASK);
	} else
		/*
		 * Kernel mode access.  Fail if it's a read-only page and
		 * supervisor write protection is enabled.
		 */
		if (!writable_shadow) {
			if (is_write_protection(vcpu))
				return 0;
			*shadow_ent &= ~PT_USER_MASK;
		}

	guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);

	if (!is_present_pte(*guest_ent)) {
		*shadow_ent = 0;
		return 0;
	}

	gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
	mark_page_dirty(vcpu->kvm, gfn);
	*shadow_ent |= PT_WRITABLE_MASK;
	*guest_ent |= PT_DIRTY_MASK;

	return 1;
}

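/*
 * The permission check above, reduced to a pure function (hypothetical
 * helper, for illustration only):
 *
 *	static int may_fix_write(int user, int shadow_user,
 *				 int writable_shadow, int wp)
 *	{
 *		if (user)
 *			return shadow_user && writable_shadow;
 *		return writable_shadow || !wp;
 *	}
 *
 * with the twist that in the kernel-mode !writable_shadow case the pte
 * also loses its user bit, so user mode cannot abuse the now-writable
 * mapping.
 */
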
/*
 * Page fault handler.  There are several causes for a page fault:
 * - there is no shadow pte for the guest pte
 * - write access through a shadow pte marked read only so that we can set
 *   the dirty bit
 * - write access to a shadow pte marked read only so we can update the page
 *   dirty bitmap, when userspace requests it
 * - mmio access; in this case we will never install a present shadow pte
 * - normal guest page fault due to the guest pte marked not present, not
 *   writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
			     u32 error_code)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int pte_present = error_code & PFERR_PRESENT_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	u64 *shadow_pte;
	int fixed;

	/*
	 * Look up the shadow pte for the faulting address.
	 */
	for (;;) {
		FNAME(init_walker)(&walker, vcpu);
		shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
		if (IS_ERR(shadow_pte)) {  /* must be -ENOMEM */
			nonpaging_flush(vcpu);
			FNAME(release_walker)(&walker);
			continue;
		}
		break;
	}

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!shadow_pte) {
		inject_page_fault(vcpu, addr, error_code);
		FNAME(release_walker)(&walker);
		return 0;
	}

	/*
	 * Update the shadow pte.
	 */
	if (write_fault)
		fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
					    user_fault);
	else
		fixed = fix_read_pf(shadow_pte);

	FNAME(release_walker)(&walker);

	/*
	 * mmio: emulate if accessible, otherwise it's a guest fault.
	 */
	if (is_io_pte(*shadow_pte)) {
		if (may_access(*shadow_pte, write_fault, user_fault))
			return 1;
		pgprintk("%s: io work, no access\n", __FUNCTION__);
		inject_page_fault(vcpu, addr,
				  error_code | PFERR_PRESENT_MASK);
		return 0;
	}

	/*
	 * pte not present, guest page fault.
	 */
	if (pte_present && !fixed) {
		inject_page_fault(vcpu, addr, error_code);
		return 0;
	}

	++kvm_stat.pf_fixed;

	return 0;
}

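/*
 * For reference, the error_code bits tested above are the architectural
 * x86 page-fault error code: bit 0 (PFERR_PRESENT_MASK) means the fault
 * was a protection violation on a present pte rather than a non-present
 * one, bit 1 (PFERR_WRITE_MASK) means the access was a write, and bit 2
 * (PFERR_USER_MASK) means it originated in user mode.
 */
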
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	struct guest_walker walker;
	pt_element_t guest_pte;
	gpa_t gpa;

	FNAME(init_walker)(&walker, vcpu);
	guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
					vaddr);
	FNAME(release_walker)(&walker);

	if (!is_present_pte(guest_pte))
		return UNMAPPED_GVA;

	if (walker.level == PT_DIRECTORY_LEVEL) {
		ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
		ASSERT(PTTYPE == 64 || is_pse(vcpu));

		gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr &
			(PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));

		if (PTTYPE == 32 && is_cpuid_PSE36())
			gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
			       (32 - PT32_DIR_PSE36_SHIFT);
	} else {
		gpa = (guest_pte & PT_BASE_ADDR_MASK);
		gpa |= (vaddr & ~PAGE_MASK);
	}

	return gpa;
}

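/*
 * Worked example (illustration only): for a 32-bit guest 4MB page the
 * directory-level branch above composes the gpa as
 *
 *	gpa = (pde & 0xffc00000)	bits 22..31 from the pde
 *	    | (vaddr & 0x003fffff);	bits 0..21 from the address
 *
 * and with pse36, pde bits 13..16 additionally supply physical bits
 * 32..35:
 *
 *	gpa |= ((u64)pde & 0x0001e000) << (32 - 13);
 */
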
#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK