aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAvi Kivity <avi@qumranet.com>2007-01-05 19:36:43 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2007-01-06 02:55:24 -0500
commitcea0f0e7ea54753c3265dc77f605a6dad1912cfc (patch)
treee0a3e64b45fe83f1f0ae89556e1f6fcf92f07185
parent25c0de2cc6c26cb99553c2444936a7951c120c09 (diff)
[PATCH] KVM: MMU: Shadow page table caching
Define a hashtable for caching shadow page tables. Look up the cache on context switch (cr3 change) or during page faults.

The key to the cache is a combination of:
- the guest page table frame number
- the number of paging levels in the guest
  * We can cache real mode, 32-bit mode, PAE, and long mode page tables simultaneously. This is useful for SMP bootup.
- the guest page table level
  * Some kernels use a page as both a page table and a page directory. This allows multiple shadow pages to exist for that page, one per level.
- the "quadrant"
  * 32-bit mode page tables span 4MB, whereas a shadow page table spans 2MB. Similarly, a 32-bit page directory spans 4GB, while a shadow page directory spans 1GB. The quadrant allows caching up to 4 shadow page tables for one guest page in one level.
- a "metaphysical" bit
  * For real mode, and for PSE pages, there is no guest page table, so set the bit to avoid write-protecting the page.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/kvm/kvm.h45
-rw-r--r--drivers/kvm/mmu.c207
-rw-r--r--drivers/kvm/paging_tmpl.h62
3 files changed, 280 insertions, 34 deletions
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index abe40dd34eea..58b9deb0bc0e 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -89,14 +89,53 @@ typedef unsigned long hva_t;
89typedef u64 hpa_t; 89typedef u64 hpa_t;
90typedef unsigned long hfn_t; 90typedef unsigned long hfn_t;
91 91
92#define NR_PTE_CHAIN_ENTRIES 5
93
94struct kvm_pte_chain {
95 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
96 struct hlist_node link;
97};
98
99/*
100 * kvm_mmu_page_role, below, is defined as:
101 *
102 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
103 * bits 4:7 - page table level for this shadow (1-4)
104 * bits 8:9 - page table quadrant for 2-level guests
105 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
106 */
107union kvm_mmu_page_role {
108 unsigned word;
109 struct {
110 unsigned glevels : 4;
111 unsigned level : 4;
112 unsigned quadrant : 2;
113 unsigned pad_for_nice_hex_output : 6;
114 unsigned metaphysical : 1;
115 };
116};
117
92struct kvm_mmu_page { 118struct kvm_mmu_page {
93 struct list_head link; 119 struct list_head link;
120 struct hlist_node hash_link;
121
122 /*
123 * The following two entries are used to key the shadow page in the
124 * hash table.
125 */
126 gfn_t gfn;
127 union kvm_mmu_page_role role;
128
94 hpa_t page_hpa; 129 hpa_t page_hpa;
95 unsigned long slot_bitmap; /* One bit set per slot which has memory 130 unsigned long slot_bitmap; /* One bit set per slot which has memory
96 * in this shadow page. 131 * in this shadow page.
97 */ 132 */
98 int global; /* Set if all ptes in this page are global */ 133 int global; /* Set if all ptes in this page are global */
99 u64 *parent_pte; 134 int multimapped; /* More than one parent_pte? */
135 union {
136 u64 *parent_pte; /* !multimapped */
137 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
138 };
100}; 139};
101 140
102struct vmcs { 141struct vmcs {
@@ -235,7 +274,11 @@ struct kvm {
235 spinlock_t lock; /* protects everything except vcpus */ 274 spinlock_t lock; /* protects everything except vcpus */
236 int nmemslots; 275 int nmemslots;
237 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 276 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
277 /*
278 * Hash table of struct kvm_mmu_page.
279 */
238 struct list_head active_mmu_pages; 280 struct list_head active_mmu_pages;
281 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
239 struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; 282 struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
240 int memory_config_version; 283 int memory_config_version;
241 int busy; 284 int busy;
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index da4d7ddb9bdc..47c699c21c08 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -26,8 +26,8 @@
26#include "vmx.h" 26#include "vmx.h"
27#include "kvm.h" 27#include "kvm.h"
28 28
29#define pgprintk(x...) do { } while (0) 29#define pgprintk(x...) do { printk(x); } while (0)
30#define rmap_printk(x...) do { } while (0) 30#define rmap_printk(x...) do { printk(x); } while (0)
31 31
32#define ASSERT(x) \ 32#define ASSERT(x) \
33 if (!(x)) { \ 33 if (!(x)) { \
@@ -35,8 +35,10 @@
35 __FILE__, __LINE__, #x); \ 35 __FILE__, __LINE__, #x); \
36 } 36 }
37 37
38#define PT64_ENT_PER_PAGE 512 38#define PT64_PT_BITS 9
39#define PT32_ENT_PER_PAGE 1024 39#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
40#define PT32_PT_BITS 10
41#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
40 42
41#define PT_WRITABLE_SHIFT 1 43#define PT_WRITABLE_SHIFT 1
42 44
@@ -292,6 +294,11 @@ static int is_empty_shadow_page(hpa_t page_hpa)
292 return 1; 294 return 1;
293} 295}
294 296
297static unsigned kvm_page_table_hashfn(gfn_t gfn)
298{
299 return gfn;
300}
301
295static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 302static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
296 u64 *parent_pte) 303 u64 *parent_pte)
297{ 304{
@@ -306,10 +313,147 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
306 ASSERT(is_empty_shadow_page(page->page_hpa)); 313 ASSERT(is_empty_shadow_page(page->page_hpa));
307 page->slot_bitmap = 0; 314 page->slot_bitmap = 0;
308 page->global = 1; 315 page->global = 1;
316 page->multimapped = 0;
309 page->parent_pte = parent_pte; 317 page->parent_pte = parent_pte;
310 return page; 318 return page;
311} 319}
312 320
321static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte)
322{
323 struct kvm_pte_chain *pte_chain;
324 struct hlist_node *node;
325 int i;
326
327 if (!parent_pte)
328 return;
329 if (!page->multimapped) {
330 u64 *old = page->parent_pte;
331
332 if (!old) {
333 page->parent_pte = parent_pte;
334 return;
335 }
336 page->multimapped = 1;
337 pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
338 BUG_ON(!pte_chain);
339 INIT_HLIST_HEAD(&page->parent_ptes);
340 hlist_add_head(&pte_chain->link, &page->parent_ptes);
341 pte_chain->parent_ptes[0] = old;
342 }
343 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
344 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
345 continue;
346 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
347 if (!pte_chain->parent_ptes[i]) {
348 pte_chain->parent_ptes[i] = parent_pte;
349 return;
350 }
351 }
352 pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
353 BUG_ON(!pte_chain);
354 hlist_add_head(&pte_chain->link, &page->parent_ptes);
355 pte_chain->parent_ptes[0] = parent_pte;
356}
357
358static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
359 u64 *parent_pte)
360{
361 struct kvm_pte_chain *pte_chain;
362 struct hlist_node *node;
363 int i;
364
365 if (!page->multimapped) {
366 BUG_ON(page->parent_pte != parent_pte);
367 page->parent_pte = NULL;
368 return;
369 }
370 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
371 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
372 if (!pte_chain->parent_ptes[i])
373 break;
374 if (pte_chain->parent_ptes[i] != parent_pte)
375 continue;
376 while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
377 pte_chain->parent_ptes[i]
378 = pte_chain->parent_ptes[i + 1];
379 ++i;
380 }
381 pte_chain->parent_ptes[i] = NULL;
382 return;
383 }
384 BUG();
385}
386
387static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
388 gfn_t gfn)
389{
390 unsigned index;
391 struct hlist_head *bucket;
392 struct kvm_mmu_page *page;
393 struct hlist_node *node;
394
395 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
396 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
397 bucket = &vcpu->kvm->mmu_page_hash[index];
398 hlist_for_each_entry(page, node, bucket, hash_link)
399 if (page->gfn == gfn && !page->role.metaphysical) {
400 pgprintk("%s: found role %x\n",
401 __FUNCTION__, page->role.word);
402 return page;
403 }
404 return NULL;
405}
406
407static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
408 gfn_t gfn,
409 gva_t gaddr,
410 unsigned level,
411 int metaphysical,
412 u64 *parent_pte)
413{
414 union kvm_mmu_page_role role;
415 unsigned index;
416 unsigned quadrant;
417 struct hlist_head *bucket;
418 struct kvm_mmu_page *page;
419 struct hlist_node *node;
420
421 role.word = 0;
422 role.glevels = vcpu->mmu.root_level;
423 role.level = level;
424 role.metaphysical = metaphysical;
425 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
426 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
427 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
428 role.quadrant = quadrant;
429 }
430 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
431 gfn, role.word);
432 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
433 bucket = &vcpu->kvm->mmu_page_hash[index];
434 hlist_for_each_entry(page, node, bucket, hash_link)
435 if (page->gfn == gfn && page->role.word == role.word) {
436 mmu_page_add_parent_pte(page, parent_pte);
437 pgprintk("%s: found\n", __FUNCTION__);
438 return page;
439 }
440 page = kvm_mmu_alloc_page(vcpu, parent_pte);
441 if (!page)
442 return page;
443 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
444 page->gfn = gfn;
445 page->role = role;
446 hlist_add_head(&page->hash_link, bucket);
447 return page;
448}
449
450static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
451 struct kvm_mmu_page *page,
452 u64 *parent_pte)
453{
454 mmu_page_remove_parent_pte(page, parent_pte);
455}
456
313static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) 457static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
314{ 458{
315 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); 459 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
@@ -389,11 +533,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
389 for (; ; level--) { 533 for (; ; level--) {
390 u32 index = PT64_INDEX(v, level); 534 u32 index = PT64_INDEX(v, level);
391 u64 *table; 535 u64 *table;
536 u64 pte;
392 537
393 ASSERT(VALID_PAGE(table_addr)); 538 ASSERT(VALID_PAGE(table_addr));
394 table = __va(table_addr); 539 table = __va(table_addr);
395 540
396 if (level == 1) { 541 if (level == 1) {
542 pte = table[index];
543 if (is_present_pte(pte) && is_writeble_pte(pte))
544 return 0;
397 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); 545 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
398 page_header_update_slot(vcpu->kvm, table, v); 546 page_header_update_slot(vcpu->kvm, table, v);
399 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | 547 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
@@ -404,8 +552,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
404 552
405 if (table[index] == 0) { 553 if (table[index] == 0) {
406 struct kvm_mmu_page *new_table; 554 struct kvm_mmu_page *new_table;
555 gfn_t pseudo_gfn;
407 556
408 new_table = kvm_mmu_alloc_page(vcpu, &table[index]); 557 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
558 >> PAGE_SHIFT;
559 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
560 v, level - 1,
561 1, &table[index]);
409 if (!new_table) { 562 if (!new_table) {
410 pgprintk("nonpaging_map: ENOMEM\n"); 563 pgprintk("nonpaging_map: ENOMEM\n");
411 return -ENOMEM; 564 return -ENOMEM;
@@ -427,7 +580,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
427 hpa_t root = vcpu->mmu.root_hpa; 580 hpa_t root = vcpu->mmu.root_hpa;
428 581
429 ASSERT(VALID_PAGE(root)); 582 ASSERT(VALID_PAGE(root));
430 release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
431 vcpu->mmu.root_hpa = INVALID_PAGE; 583 vcpu->mmu.root_hpa = INVALID_PAGE;
432 return; 584 return;
433 } 585 }
@@ -437,7 +589,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
437 589
438 ASSERT(VALID_PAGE(root)); 590 ASSERT(VALID_PAGE(root));
439 root &= PT64_BASE_ADDR_MASK; 591 root &= PT64_BASE_ADDR_MASK;
440 release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
441 vcpu->mmu.pae_root[i] = INVALID_PAGE; 592 vcpu->mmu.pae_root[i] = INVALID_PAGE;
442 } 593 }
443 vcpu->mmu.root_hpa = INVALID_PAGE; 594 vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -446,13 +597,16 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
446static void mmu_alloc_roots(struct kvm_vcpu *vcpu) 597static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
447{ 598{
448 int i; 599 int i;
600 gfn_t root_gfn;
601 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
449 602
450#ifdef CONFIG_X86_64 603#ifdef CONFIG_X86_64
451 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { 604 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
452 hpa_t root = vcpu->mmu.root_hpa; 605 hpa_t root = vcpu->mmu.root_hpa;
453 606
454 ASSERT(!VALID_PAGE(root)); 607 ASSERT(!VALID_PAGE(root));
455 root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa; 608 root = kvm_mmu_get_page(vcpu, root_gfn, 0,
609 PT64_ROOT_LEVEL, 0, NULL)->page_hpa;
456 vcpu->mmu.root_hpa = root; 610 vcpu->mmu.root_hpa = root;
457 return; 611 return;
458 } 612 }
@@ -461,7 +615,13 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
461 hpa_t root = vcpu->mmu.pae_root[i]; 615 hpa_t root = vcpu->mmu.pae_root[i];
462 616
463 ASSERT(!VALID_PAGE(root)); 617 ASSERT(!VALID_PAGE(root));
464 root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa; 618 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
619 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
620 else if (vcpu->mmu.root_level == 0)
621 root_gfn = 0;
622 root = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
623 PT32_ROOT_LEVEL, !is_paging(vcpu),
624 NULL)->page_hpa;
465 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; 625 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
466 } 626 }
467 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); 627 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
@@ -529,7 +689,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
529 context->inval_page = nonpaging_inval_page; 689 context->inval_page = nonpaging_inval_page;
530 context->gva_to_gpa = nonpaging_gva_to_gpa; 690 context->gva_to_gpa = nonpaging_gva_to_gpa;
531 context->free = nonpaging_free; 691 context->free = nonpaging_free;
532 context->root_level = PT32E_ROOT_LEVEL; 692 context->root_level = 0;
533 context->shadow_root_level = PT32E_ROOT_LEVEL; 693 context->shadow_root_level = PT32E_ROOT_LEVEL;
534 mmu_alloc_roots(vcpu); 694 mmu_alloc_roots(vcpu);
535 ASSERT(VALID_PAGE(context->root_hpa)); 695 ASSERT(VALID_PAGE(context->root_hpa));
@@ -537,29 +697,18 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
537 return 0; 697 return 0;
538} 698}
539 699
540
541static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 700static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
542{ 701{
543 struct kvm_mmu_page *page, *npage;
544
545 list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
546 link) {
547 if (page->global)
548 continue;
549
550 if (!page->parent_pte)
551 continue;
552
553 *page->parent_pte = 0;
554 release_pt_page_64(vcpu, page->page_hpa, 1);
555 }
556 ++kvm_stat.tlb_flush; 702 ++kvm_stat.tlb_flush;
557 kvm_arch_ops->tlb_flush(vcpu); 703 kvm_arch_ops->tlb_flush(vcpu);
558} 704}
559 705
560static void paging_new_cr3(struct kvm_vcpu *vcpu) 706static void paging_new_cr3(struct kvm_vcpu *vcpu)
561{ 707{
708 mmu_free_roots(vcpu);
709 mmu_alloc_roots(vcpu);
562 kvm_mmu_flush_tlb(vcpu); 710 kvm_mmu_flush_tlb(vcpu);
711 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
563} 712}
564 713
565static void mark_pagetable_nonglobal(void *shadow_pte) 714static void mark_pagetable_nonglobal(void *shadow_pte)
@@ -578,6 +727,16 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
578 *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; 727 *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
579 if (!dirty) 728 if (!dirty)
580 access_bits &= ~PT_WRITABLE_MASK; 729 access_bits &= ~PT_WRITABLE_MASK;
730 if (access_bits & PT_WRITABLE_MASK) {
731 struct kvm_mmu_page *shadow;
732
733 shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
734 if (shadow)
735 pgprintk("%s: found shadow page for %lx, marking ro\n",
736 __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
737 if (shadow)
738 access_bits &= ~PT_WRITABLE_MASK;
739 }
581 740
582 if (access_bits & PT_WRITABLE_MASK) 741 if (access_bits & PT_WRITABLE_MASK)
583 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); 742 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 11cac9ddf26a..f7cce443ca6f 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -32,6 +32,11 @@
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK 34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #else
38 #define PT_MAX_FULL_LEVELS 2
39 #endif
35#elif PTTYPE == 32 40#elif PTTYPE == 32
36 #define pt_element_t u32 41 #define pt_element_t u32
37 #define guest_walker guest_walker32 42 #define guest_walker guest_walker32
@@ -42,6 +47,7 @@
42 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 47 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
43 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
44 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK 49 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
50 #define PT_MAX_FULL_LEVELS 2
45#else 51#else
46 #error Invalid PTTYPE value 52 #error Invalid PTTYPE value
47#endif 53#endif
@@ -52,7 +58,7 @@
52 */ 58 */
53struct guest_walker { 59struct guest_walker {
54 int level; 60 int level;
55 gfn_t table_gfn; 61 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
56 pt_element_t *table; 62 pt_element_t *table;
57 pt_element_t *ptep; 63 pt_element_t *ptep;
58 pt_element_t inherited_ar; 64 pt_element_t inherited_ar;
@@ -68,7 +74,9 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
68 struct kvm_memory_slot *slot; 74 struct kvm_memory_slot *slot;
69 pt_element_t *ptep; 75 pt_element_t *ptep;
70 pt_element_t root; 76 pt_element_t root;
77 gfn_t table_gfn;
71 78
79 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
72 walker->level = vcpu->mmu.root_level; 80 walker->level = vcpu->mmu.root_level;
73 walker->table = NULL; 81 walker->table = NULL;
74 root = vcpu->cr3; 82 root = vcpu->cr3;
@@ -81,8 +89,11 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
81 --walker->level; 89 --walker->level;
82 } 90 }
83#endif 91#endif
84 walker->table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 92 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
85 slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn); 93 walker->table_gfn[walker->level - 1] = table_gfn;
94 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
95 walker->level - 1, table_gfn);
96 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
86 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); 97 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
87 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); 98 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
88 99
@@ -111,12 +122,15 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
111 122
112 if (walker->level != 3 || is_long_mode(vcpu)) 123 if (walker->level != 3 || is_long_mode(vcpu))
113 walker->inherited_ar &= walker->table[index]; 124 walker->inherited_ar &= walker->table[index];
114 walker->table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 125 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
115 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); 126 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
116 kunmap_atomic(walker->table, KM_USER0); 127 kunmap_atomic(walker->table, KM_USER0);
117 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), 128 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
118 KM_USER0); 129 KM_USER0);
119 --walker->level; 130 --walker->level;
131 walker->table_gfn[walker->level - 1 ] = table_gfn;
132 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
133 walker->level - 1, table_gfn);
120 } 134 }
121 walker->ptep = ptep; 135 walker->ptep = ptep;
122} 136}
@@ -181,6 +195,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
181 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; 195 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
182 struct kvm_mmu_page *shadow_page; 196 struct kvm_mmu_page *shadow_page;
183 u64 shadow_pte; 197 u64 shadow_pte;
198 int metaphysical;
199 gfn_t table_gfn;
184 200
185 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { 201 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
186 if (level == PT_PAGE_TABLE_LEVEL) 202 if (level == PT_PAGE_TABLE_LEVEL)
@@ -205,7 +221,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
205 return shadow_ent; 221 return shadow_ent;
206 } 222 }
207 223
208 shadow_page = kvm_mmu_alloc_page(vcpu, shadow_ent); 224 if (level - 1 == PT_PAGE_TABLE_LEVEL
225 && walker->level == PT_DIRECTORY_LEVEL) {
226 metaphysical = 1;
227 table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
228 >> PAGE_SHIFT;
229 } else {
230 metaphysical = 0;
231 table_gfn = walker->table_gfn[level - 2];
232 }
233 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
234 metaphysical, shadow_ent);
209 if (!shadow_page) 235 if (!shadow_page)
210 return ERR_PTR(-ENOMEM); 236 return ERR_PTR(-ENOMEM);
211 shadow_addr = shadow_page->page_hpa; 237 shadow_addr = shadow_page->page_hpa;
@@ -227,7 +253,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
227 u64 *shadow_ent, 253 u64 *shadow_ent,
228 struct guest_walker *walker, 254 struct guest_walker *walker,
229 gva_t addr, 255 gva_t addr,
230 int user) 256 int user,
257 int *write_pt)
231{ 258{
232 pt_element_t *guest_ent; 259 pt_element_t *guest_ent;
233 int writable_shadow; 260 int writable_shadow;
@@ -264,6 +291,12 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
264 } 291 }
265 292
266 gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 293 gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
294 if (kvm_mmu_lookup_page(vcpu, gfn)) {
295 pgprintk("%s: found shadow page for %lx, marking ro\n",
296 __FUNCTION__, gfn);
297 *write_pt = 1;
298 return 0;
299 }
267 mark_page_dirty(vcpu->kvm, gfn); 300 mark_page_dirty(vcpu->kvm, gfn);
268 *shadow_ent |= PT_WRITABLE_MASK; 301 *shadow_ent |= PT_WRITABLE_MASK;
269 *guest_ent |= PT_DIRTY_MASK; 302 *guest_ent |= PT_DIRTY_MASK;
@@ -294,7 +327,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
294 struct guest_walker walker; 327 struct guest_walker walker;
295 u64 *shadow_pte; 328 u64 *shadow_pte;
296 int fixed; 329 int fixed;
330 int write_pt = 0;
297 331
332 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
298 /* 333 /*
299 * Look up the shadow pte for the faulting address. 334 * Look up the shadow pte for the faulting address.
300 */ 335 */
@@ -302,6 +337,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
302 FNAME(walk_addr)(&walker, vcpu, addr); 337 FNAME(walk_addr)(&walker, vcpu, addr);
303 shadow_pte = FNAME(fetch)(vcpu, addr, &walker); 338 shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
304 if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ 339 if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
340 printk("%s: oom\n", __FUNCTION__);
305 nonpaging_flush(vcpu); 341 nonpaging_flush(vcpu);
306 FNAME(release_walker)(&walker); 342 FNAME(release_walker)(&walker);
307 continue; 343 continue;
@@ -313,20 +349,27 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
313 * The page is not mapped by the guest. Let the guest handle it. 349 * The page is not mapped by the guest. Let the guest handle it.
314 */ 350 */
315 if (!shadow_pte) { 351 if (!shadow_pte) {
352 pgprintk("%s: not mapped\n", __FUNCTION__);
316 inject_page_fault(vcpu, addr, error_code); 353 inject_page_fault(vcpu, addr, error_code);
317 FNAME(release_walker)(&walker); 354 FNAME(release_walker)(&walker);
318 return 0; 355 return 0;
319 } 356 }
320 357
358 pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
359 shadow_pte, *shadow_pte);
360
321 /* 361 /*
322 * Update the shadow pte. 362 * Update the shadow pte.
323 */ 363 */
324 if (write_fault) 364 if (write_fault)
325 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, 365 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
326 user_fault); 366 user_fault, &write_pt);
327 else 367 else
328 fixed = fix_read_pf(shadow_pte); 368 fixed = fix_read_pf(shadow_pte);
329 369
370 pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
371 shadow_pte, *shadow_pte);
372
330 FNAME(release_walker)(&walker); 373 FNAME(release_walker)(&walker);
331 374
332 /* 375 /*
@@ -344,14 +387,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
344 /* 387 /*
345 * pte not present, guest page fault. 388 * pte not present, guest page fault.
346 */ 389 */
347 if (pte_present && !fixed) { 390 if (pte_present && !fixed && !write_pt) {
348 inject_page_fault(vcpu, addr, error_code); 391 inject_page_fault(vcpu, addr, error_code);
349 return 0; 392 return 0;
350 } 393 }
351 394
352 ++kvm_stat.pf_fixed; 395 ++kvm_stat.pf_fixed;
353 396
354 return 0; 397 return write_pt;
355} 398}
356 399
357static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 400static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -395,3 +438,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
395#undef PT_PTE_COPY_MASK 438#undef PT_PTE_COPY_MASK
396#undef PT_NON_PTE_COPY_MASK 439#undef PT_NON_PTE_COPY_MASK
397#undef PT_DIR_BASE_ADDR_MASK 440#undef PT_DIR_BASE_ADDR_MASK
441#undef PT_MAX_FULL_LEVELS