path: root/drivers/kvm/mmu.c
author	Avi Kivity <avi@qumranet.com>	2006-12-10 05:21:36 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-10 12:57:22 -0500
commit	6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7 (patch)
tree	23fcbe6f4918cacdae26d513a2bd13e91d8b4c38 /drivers/kvm/mmu.c
parent	f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 (diff)
[PATCH] kvm: userspace interface
web site: http://kvm.sourceforge.net

mailing list: kvm-devel@lists.sourceforge.net
  (http://lists.sourceforge.net/lists/listinfo/kvm-devel)

The following patchset adds a driver for Intel's hardware virtualization extensions to the x86 architecture.  The driver adds a character device (/dev/kvm) that exposes the virtualization capabilities to userspace.  Using this driver, a process can run a virtual machine (a "guest") in a fully virtualized PC containing its own virtual hard disks, network adapters, and display.

Using this driver, one can start multiple virtual machines on a host.

Each virtual machine is a process on the host; a virtual cpu is a thread in that process.  kill(1), nice(1), top(1) work as expected.

In effect, the driver adds a third execution mode to the existing two: we now have kernel mode, user mode, and guest mode.  Guest mode has its own address space mapping guest physical memory (which is accessible to user mode by mmap()ing /dev/kvm).  Guest mode has no access to any I/O devices; any such access is intercepted and directed to user mode for emulation.

The driver supports i386 and x86_64 hosts and guests.  All combinations are allowed except x86_64 guest on i386 host.  For i386 guests and hosts, both pae and non-pae paging modes are supported.

SMP hosts and UP guests are supported.  At the moment only Intel hardware is supported, but AMD virtualization support is being worked on.

Performance currently is non-stellar due to the naive implementation of the mmu virtualization, which throws away most of the shadow page table entries every context switch.  We plan to address this in two ways:

- cache shadow page tables across tlb flushes
- wait until AMD and Intel release processors with nested page tables

Currently a virtual desktop is responsive but consumes a lot of CPU.  Under Windows I tried playing pinball and watching a few flash movies; with a recent CPU one can hardly feel the virtualization.  Linux/X is slower, probably due to X being in a separate process.

In addition to the driver, you need a slightly modified qemu to provide I/O device emulation and the BIOS.

Caveats (akpm: might no longer be true):

- The Windows install currently bluescreens due to a problem with the virtual APIC.  We are working on a fix.  A temporary workaround is to use an existing image or install through qemu
- Windows 64-bit does not work.  That's also true for qemu, so it's probably a problem with the device model.

[bero@arklinux.org: build fix]
[simon.kagstrom@bth.se: build fix, other fixes]
[uril@qumranet.com: KVM: Expose interrupt bitmap]
[akpm@osdl.org: i386 build fix]
[mingo@elte.hu: i386 fixes]
[rdreier@cisco.com: add log levels to all printks]
[randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings]
[anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support]
Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Cc: Simon Kagstrom <simon.kagstrom@bth.se>
Cc: Bernhard Rosenkraenzer <bero@arklinux.org>
Signed-off-by: Uri Lublin <uril@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
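As a rough illustration of the userspace interface described above, the sketch below opens the /dev/kvm character device and mmap()s guest physical memory into the user process. It is not part of this patch: the memory size, the mmap offset, and the omission of the ioctl-based vcpu creation and run loop (which the rest of the patchset, not mmu.c, defines) are placeholder assumptions for illustration only.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* The driver exposes virtualization capabilities via a char device. */
	int kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Guest physical memory is made visible to user mode by mmap()ing
	 * the device, as the commit message states; the 16 MB size and the
	 * zero offset used here are assumptions, not values defined by
	 * this patch. */
	size_t guest_mem_size = 16ul << 20;
	void *guest_mem = mmap(NULL, guest_mem_size, PROT_READ | PROT_WRITE,
			       MAP_SHARED, kvm_fd, 0);
	if (guest_mem == MAP_FAILED) {
		perror("mmap guest memory");
		close(kvm_fd);
		return 1;
	}

	/* A real user (such as the modified qemu mentioned above) would now
	 * create vcpus and run the guest through the driver's ioctl
	 * interface, which is outside the scope of this file. */
	munmap(guest_mem, guest_mem_size);
	close(kvm_fd);
	return 0;
}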
Diffstat (limited to 'drivers/kvm/mmu.c')
-rw-r--r--  drivers/kvm/mmu.c  699
1 files changed, 699 insertions, 0 deletions
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
new file mode 100644
index 000000000000..4e29d9b7211c
--- /dev/null
+++ b/drivers/kvm/mmu.c
@@ -0,0 +1,699 @@
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include "vmx.h"
#include "kvm.h"

#define pgprintk(x...) do { } while (0)

#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}

#define PT64_ENT_PER_PAGE 512
#define PT32_ENT_PER_PAGE 1024

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT32_PTE_COPY_MASK \
	(PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \
	PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_PAT_MASK | \
	PT_GLOBAL_MASK )

#define PT32_NON_PTE_COPY_MASK \
	(PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \
	PT_ACCESSED_MASK | PT_DIRTY_MASK)


#define PT64_PTE_COPY_MASK \
	(PT64_NX_MASK | PT32_PTE_COPY_MASK)

#define PT64_NON_PTE_COPY_MASK \
	(PT64_NX_MASK | PT32_NON_PTE_COPY_MASK)



#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)

#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))

#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))


#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
	return pte & PT_SHADOW_IO_MARK;
}

static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
	struct kvm_mmu_page *page_head = page_header(page_hpa);

	list_del(&page_head->link);
	page_head->page_hpa = page_hpa;
	list_add(&page_head->link, &vcpu->free_pages);
}

static int is_empty_shadow_page(hpa_t page_hpa)
{
	u32 *pos;
	u32 *end;
	for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
	      pos != end; pos++)
		if (*pos != 0)
			return 0;
	return 1;
}

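/*
 * Take a shadow page from the per-vcpu free list, move it to the VM's
 * active list and remember which parent PTE points at it.  Returns
 * INVALID_PAGE when the preallocated pool is exhausted.
 */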
static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
{
	struct kvm_mmu_page *page;

	if (list_empty(&vcpu->free_pages))
		return INVALID_PAGE;

	page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
	list_del(&page->link);
	list_add(&page->link, &vcpu->kvm->active_mmu_pages);
	ASSERT(is_empty_shadow_page(page->page_hpa));
	page->slot_bitmap = 0;
	page->global = 1;
	page->parent_pte = parent_pte;
	return page->page_hpa;
}

static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
	struct kvm_mmu_page *page_head = page_header(__pa(pte));

	__set_bit(slot, &page_head->slot_bitmap);
}

hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	hpa_t hpa = gpa_to_hpa(vcpu, gpa);

	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
}

hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct kvm_memory_slot *slot;
	struct page *page;

	ASSERT((gpa & HPA_ERR_MASK) == 0);
	slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!slot)
		return gpa | HPA_ERR_MASK;
	page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
	return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
		| (gpa & (PAGE_SIZE-1));
}

hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return UNMAPPED_GVA;
	return gpa_to_hpa(vcpu, gpa);
}


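/*
 * Recursively tear down a shadow page table subtree: clear each entry,
 * free any lower-level shadow page it pointed to, then return the page
 * itself to the free list.  At level 1 the page is simply zeroed.
 */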
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
			       int level)
{
	ASSERT(vcpu);
	ASSERT(VALID_PAGE(page_hpa));
	ASSERT(level <= PT64_ROOT_LEVEL && level > 0);

	if (level == 1)
		memset(__va(page_hpa), 0, PAGE_SIZE);
	else {
		u64 *pos;
		u64 *end;

		for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
		     pos != end; pos++) {
			u64 current_ent = *pos;

			*pos = 0;
			if (is_present_pte(current_ent))
				release_pt_page_64(vcpu,
						   current_ent &
						   PT64_BASE_ADDR_MASK,
						   level - 1);
		}
	}
	kvm_mmu_free_page(vcpu, page_hpa);
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

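/*
 * With paging disabled in the guest, guest addresses are treated as guest
 * physical addresses; walk (and build, as needed) the shadow page table
 * that maps virtual address 'v' to host physical address 'p'.
 */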
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
	int level = PT32E_ROOT_LEVEL;
	hpa_t table_addr = vcpu->mmu.root_hpa;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
			page_header_update_slot(vcpu->kvm, table, v);
			table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
								PT_USER_MASK;
			return 0;
		}

		if (table[index] == 0) {
			hpa_t new_table = kvm_mmu_alloc_page(vcpu,
							     &table[index]);

			if (!VALID_PAGE(new_table)) {
				pgprintk("nonpaging_map: ENOMEM\n");
				return -ENOMEM;
			}

			if (level == PT32E_ROOT_LEVEL)
				table[index] = new_table | PT_PRESENT_MASK;
			else
				table[index] = new_table | PT_PRESENT_MASK |
						PT_WRITABLE_MASK | PT_USER_MASK;
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}

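/*
 * Throw away the entire shadow page table and start over with a fresh
 * root; used when nonpaging_map() runs out of preallocated shadow pages.
 */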
static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
	hpa_t root = vcpu->mmu.root_hpa;

	++kvm_stat.tlb_flush;
	pgprintk("nonpaging_flush\n");
	ASSERT(VALID_PAGE(root));
	release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
	root = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(root));
	vcpu->mmu.root_hpa = root;
	if (is_paging(vcpu))
		root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
	kvm_arch_ops->set_cr3(vcpu, root);
	kvm_arch_ops->tlb_flush(vcpu);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
			       u32 error_code)
{
	int ret;
	gpa_t addr = gva;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

	for (;;) {
		hpa_t paddr;

		paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);

		if (is_error_hpa(paddr))
			return 1;

		ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
		if (ret) {
			nonpaging_flush(vcpu);
			continue;
		}
		break;
	}
	return ret;
}

static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	hpa_t root;

	ASSERT(vcpu);
	root = vcpu->mmu.root_hpa;
	if (VALID_PAGE(root))
		release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
	vcpu->mmu.root_hpa = INVALID_PAGE;
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->inval_page = nonpaging_inval_page;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->root_level = PT32E_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
	return 0;
}


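/*
 * A guest TLB flush (e.g. a CR3 load) invalidates all non-global shadow
 * pages: unlink each one from its parent PTE and release it.
 */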
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *page, *npage;

	list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
				 link) {
		if (page->global)
			continue;

		if (!page->parent_pte)
			continue;

		*page->parent_pte = 0;
		release_pt_page_64(vcpu, page->page_hpa, 1);
	}
	++kvm_stat.tlb_flush;
	kvm_arch_ops->tlb_flush(vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	kvm_mmu_flush_tlb(vcpu);
}

static void mark_pagetable_nonglobal(void *shadow_pte)
{
	page_header(__pa(shadow_pte))->global = 0;
}

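/*
 * Fill in a shadow PTE: stash the guest access bits in the shadow-only
 * bit range, map not-yet-dirty pages read-only, translate the guest
 * physical address, and, if no memory slot backs it, mark the entry as
 * an I/O page instead of making it present.
 */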
static inline void set_pte_common(struct kvm_vcpu *vcpu,
			     u64 *shadow_pte,
			     gpa_t gaddr,
			     int dirty,
			     u64 access_bits)
{
	hpa_t paddr;

	*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
	if (!dirty)
		access_bits &= ~PT_WRITABLE_MASK;

	if (access_bits & PT_WRITABLE_MASK)
		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);

	*shadow_pte |= access_bits;

	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);

	if (!(*shadow_pte & PT_GLOBAL_MASK))
		mark_pagetable_nonglobal(shadow_pte);

	if (is_error_hpa(paddr)) {
		*shadow_pte |= gaddr;
		*shadow_pte |= PT_SHADOW_IO_MARK;
		*shadow_pte &= ~PT_PRESENT_MASK;
	} else {
		*shadow_pte |= paddr;
		page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
	}
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}

static inline int fix_read_pf(u64 *shadow_ent)
{
	if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
	    !(*shadow_ent & PT_USER_MASK)) {
		/*
		 * If supervisor write protect is disabled, we shadow kernel
		 * pages as user pages so we can trap the write access.
		 */
		*shadow_ent |= PT_USER_MASK;
		*shadow_ent &= ~PT_WRITABLE_MASK;

		return 1;

	}
	return 0;
}

static int may_access(u64 pte, int write, int user)
{

	if (user && !(pte & PT_USER_MASK))
		return 0;
	if (write && !(pte & PT_WRITABLE_MASK))
		return 0;
	return 1;
}

/*
 * Remove a shadow pte.
 */
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
	hpa_t page_addr = vcpu->mmu.root_hpa;
	int level = vcpu->mmu.shadow_root_level;

	++kvm_stat.invlpg;

	for (; ; level--) {
		u32 index = PT64_INDEX(addr, level);
		u64 *table = __va(page_addr);

		if (level == PT_PAGE_TABLE_LEVEL ) {
			table[index] = 0;
			return;
		}

		if (!is_present_pte(table[index]))
			return;

		page_addr = table[index] & PT64_BASE_ADDR_MASK;

		if (level == PT_DIRECTORY_LEVEL &&
			  (table[index] & PT_SHADOW_PS_MARK)) {
			table[index] = 0;
			release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);

			kvm_arch_ops->tlb_flush(vcpu);
			return;
		}
	}
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

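/*
 * paging_tmpl.h is instantiated twice to generate the 64-bit and 32-bit
 * guest page table walkers, e.g. paging64_page_fault() and
 * paging32_page_fault() used below.
 */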
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->inval_page = paging_inval_page;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->free = paging_free;
	context->root_level = PT64_ROOT_LEVEL;
	context->shadow_root_level = PT64_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
			    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
	return 0;
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->inval_page = paging_inval_page;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
			    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	int ret;

	if ((ret = paging64_init_context(vcpu)))
		return ret;

	vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
	vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
	return 0;
}

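/*
 * Pick the MMU context that matches the guest's current paging mode:
 * no paging, 64-bit (long mode), PAE, or plain 32-bit paging.
 */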
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
	else if (kvm_arch_ops->is_long_mode(vcpu))
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	if (VALID_PAGE(vcpu->mmu.root_hpa)) {
		vcpu->mmu.free(vcpu);
		vcpu->mmu.root_hpa = INVALID_PAGE;
	}
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	while (!list_empty(&vcpu->free_pages)) {
		struct kvm_mmu_page *page;

		page = list_entry(vcpu->free_pages.next,
				  struct kvm_mmu_page, link);
		list_del(&page->link);
		__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
		page->page_hpa = INVALID_PAGE;
	}
}

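/*
 * Preallocate the fixed pool of KVM_NUM_MMU_PAGES shadow pages for this
 * vcpu and link each page's struct page back to its kvm_mmu_page header
 * via page->private.
 */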
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	int i;

	ASSERT(vcpu);

	for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
		struct page *page;
		struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];

		INIT_LIST_HEAD(&page_header->link);
		if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
			goto error_1;
		page->private = (unsigned long)page_header;
		page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
		memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
		list_add(&page_header->link, &vcpu->free_pages);
	}
	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}

int kvm_mmu_init(struct kvm_vcpu *vcpu)
{
	int r;

	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
	ASSERT(list_empty(&vcpu->free_pages));

	if ((r = alloc_mmu_pages(vcpu)))
		return r;

	if ((r = init_kvm_mmu(vcpu))) {
		free_mmu_pages(vcpu);
		return r;
	}
	return 0;
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
}

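/*
 * Clear the writable bit in every shadow PTE that maps memory belonging
 * to the given memory slot, so that subsequent guest writes to that slot
 * fault into the hypervisor.
 */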
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *page;

	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, &page->slot_bitmap))
			continue;

		pt = __va(page->page_hpa);
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
			if (pt[i] & PT_WRITABLE_MASK)
				pt[i] &= ~PT_WRITABLE_MASK;

	}
}