diff options
author | Avi Kivity <avi@qumranet.com> | 2007-01-05 19:36:43 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2007-01-06 02:55:24 -0500 |
commit | cea0f0e7ea54753c3265dc77f605a6dad1912cfc (patch) | |
tree | e0a3e64b45fe83f1f0ae89556e1f6fcf92f07185 | |
parent | 25c0de2cc6c26cb99553c2444936a7951c120c09 (diff) |
[PATCH] KVM: MMU: Shadow page table caching
Define a hashtable for caching shadow page tables. Look up the cache on
context switch (cr3 change) or during page faults.
The key to the cache is a combination of
- the guest page table frame number
- the number of paging levels in the guest
* we can cache real mode, 32-bit mode, pae, and long mode page
tables simultaneously. this is useful for smp bootup.
- the guest page table level
* some kernels use a page as both a page table and a page directory. This
allows multiple shadow pages to exist for that page, one per level
- the "quadrant"
* 32-bit mode page tables span 4MB, whereas a shadow page table spans
2MB. Similarly, a 32-bit page directory spans 4GB, while a shadow
page directory spans 1GB. The quadrant allows caching up to 4 shadow page
tables for one guest page in one level.
- a "metaphysical" bit
* for real mode, and for PSE pages, there is no guest page table, so set
the bit to avoid write-protecting the page.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/kvm/kvm.h | 45 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 207 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 62 |
3 files changed, 280 insertions, 34 deletions
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index abe40dd34eea..58b9deb0bc0e 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h | |||
@@ -89,14 +89,53 @@ typedef unsigned long hva_t; | |||
89 | typedef u64 hpa_t; | 89 | typedef u64 hpa_t; |
90 | typedef unsigned long hfn_t; | 90 | typedef unsigned long hfn_t; |
91 | 91 | ||
92 | #define NR_PTE_CHAIN_ENTRIES 5 | ||
93 | |||
94 | struct kvm_pte_chain { | ||
95 | u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; | ||
96 | struct hlist_node link; | ||
97 | }; | ||
98 | |||
99 | /* | ||
100 | * kvm_mmu_page_role, below, is defined as: | ||
101 | * | ||
102 | * bits 0:3 - total guest paging levels (2-4, or zero for real mode) | ||
103 | * bits 4:7 - page table level for this shadow (1-4) | ||
104 | * bits 8:9 - page table quadrant for 2-level guests | ||
105 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) | ||
106 | */ | ||
107 | union kvm_mmu_page_role { | ||
108 | unsigned word; | ||
109 | struct { | ||
110 | unsigned glevels : 4; | ||
111 | unsigned level : 4; | ||
112 | unsigned quadrant : 2; | ||
113 | unsigned pad_for_nice_hex_output : 6; | ||
114 | unsigned metaphysical : 1; | ||
115 | }; | ||
116 | }; | ||
117 | |||
92 | struct kvm_mmu_page { | 118 | struct kvm_mmu_page { |
93 | struct list_head link; | 119 | struct list_head link; |
120 | struct hlist_node hash_link; | ||
121 | |||
122 | /* | ||
123 | * The following two entries are used to key the shadow page in the | ||
124 | * hash table. | ||
125 | */ | ||
126 | gfn_t gfn; | ||
127 | union kvm_mmu_page_role role; | ||
128 | |||
94 | hpa_t page_hpa; | 129 | hpa_t page_hpa; |
95 | unsigned long slot_bitmap; /* One bit set per slot which has memory | 130 | unsigned long slot_bitmap; /* One bit set per slot which has memory |
96 | * in this shadow page. | 131 | * in this shadow page. |
97 | */ | 132 | */ |
98 | int global; /* Set if all ptes in this page are global */ | 133 | int global; /* Set if all ptes in this page are global */ |
99 | u64 *parent_pte; | 134 | int multimapped; /* More than one parent_pte? */ |
135 | union { | ||
136 | u64 *parent_pte; /* !multimapped */ | ||
137 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ | ||
138 | }; | ||
100 | }; | 139 | }; |
101 | 140 | ||
102 | struct vmcs { | 141 | struct vmcs { |
@@ -235,7 +274,11 @@ struct kvm { | |||
235 | spinlock_t lock; /* protects everything except vcpus */ | 274 | spinlock_t lock; /* protects everything except vcpus */ |
236 | int nmemslots; | 275 | int nmemslots; |
237 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; | 276 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; |
277 | /* | ||
278 | * Hash table of struct kvm_mmu_page. | ||
279 | */ | ||
238 | struct list_head active_mmu_pages; | 280 | struct list_head active_mmu_pages; |
281 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
239 | struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; | 282 | struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; |
240 | int memory_config_version; | 283 | int memory_config_version; |
241 | int busy; | 284 | int busy; |
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index da4d7ddb9bdc..47c699c21c08 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c | |||
@@ -26,8 +26,8 @@ | |||
26 | #include "vmx.h" | 26 | #include "vmx.h" |
27 | #include "kvm.h" | 27 | #include "kvm.h" |
28 | 28 | ||
29 | #define pgprintk(x...) do { } while (0) | 29 | #define pgprintk(x...) do { printk(x); } while (0) |
30 | #define rmap_printk(x...) do { } while (0) | 30 | #define rmap_printk(x...) do { printk(x); } while (0) |
31 | 31 | ||
32 | #define ASSERT(x) \ | 32 | #define ASSERT(x) \ |
33 | if (!(x)) { \ | 33 | if (!(x)) { \ |
@@ -35,8 +35,10 @@ | |||
35 | __FILE__, __LINE__, #x); \ | 35 | __FILE__, __LINE__, #x); \ |
36 | } | 36 | } |
37 | 37 | ||
38 | #define PT64_ENT_PER_PAGE 512 | 38 | #define PT64_PT_BITS 9 |
39 | #define PT32_ENT_PER_PAGE 1024 | 39 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) |
40 | #define PT32_PT_BITS 10 | ||
41 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
40 | 42 | ||
41 | #define PT_WRITABLE_SHIFT 1 | 43 | #define PT_WRITABLE_SHIFT 1 |
42 | 44 | ||
@@ -292,6 +294,11 @@ static int is_empty_shadow_page(hpa_t page_hpa) | |||
292 | return 1; | 294 | return 1; |
293 | } | 295 | } |
294 | 296 | ||
297 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
298 | { | ||
299 | return gfn; | ||
300 | } | ||
301 | |||
295 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | 302 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, |
296 | u64 *parent_pte) | 303 | u64 *parent_pte) |
297 | { | 304 | { |
@@ -306,10 +313,147 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
306 | ASSERT(is_empty_shadow_page(page->page_hpa)); | 313 | ASSERT(is_empty_shadow_page(page->page_hpa)); |
307 | page->slot_bitmap = 0; | 314 | page->slot_bitmap = 0; |
308 | page->global = 1; | 315 | page->global = 1; |
316 | page->multimapped = 0; | ||
309 | page->parent_pte = parent_pte; | 317 | page->parent_pte = parent_pte; |
310 | return page; | 318 | return page; |
311 | } | 319 | } |
312 | 320 | ||
321 | static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte) | ||
322 | { | ||
323 | struct kvm_pte_chain *pte_chain; | ||
324 | struct hlist_node *node; | ||
325 | int i; | ||
326 | |||
327 | if (!parent_pte) | ||
328 | return; | ||
329 | if (!page->multimapped) { | ||
330 | u64 *old = page->parent_pte; | ||
331 | |||
332 | if (!old) { | ||
333 | page->parent_pte = parent_pte; | ||
334 | return; | ||
335 | } | ||
336 | page->multimapped = 1; | ||
337 | pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT); | ||
338 | BUG_ON(!pte_chain); | ||
339 | INIT_HLIST_HEAD(&page->parent_ptes); | ||
340 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
341 | pte_chain->parent_ptes[0] = old; | ||
342 | } | ||
343 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { | ||
344 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
345 | continue; | ||
346 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
347 | if (!pte_chain->parent_ptes[i]) { | ||
348 | pte_chain->parent_ptes[i] = parent_pte; | ||
349 | return; | ||
350 | } | ||
351 | } | ||
352 | pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT); | ||
353 | BUG_ON(!pte_chain); | ||
354 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
355 | pte_chain->parent_ptes[0] = parent_pte; | ||
356 | } | ||
357 | |||
358 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, | ||
359 | u64 *parent_pte) | ||
360 | { | ||
361 | struct kvm_pte_chain *pte_chain; | ||
362 | struct hlist_node *node; | ||
363 | int i; | ||
364 | |||
365 | if (!page->multimapped) { | ||
366 | BUG_ON(page->parent_pte != parent_pte); | ||
367 | page->parent_pte = NULL; | ||
368 | return; | ||
369 | } | ||
370 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) | ||
371 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
372 | if (!pte_chain->parent_ptes[i]) | ||
373 | break; | ||
374 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
375 | continue; | ||
376 | while (i + 1 < NR_PTE_CHAIN_ENTRIES) { | ||
377 | pte_chain->parent_ptes[i] | ||
378 | = pte_chain->parent_ptes[i + 1]; | ||
379 | ++i; | ||
380 | } | ||
381 | pte_chain->parent_ptes[i] = NULL; | ||
382 | return; | ||
383 | } | ||
384 | BUG(); | ||
385 | } | ||
386 | |||
387 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, | ||
388 | gfn_t gfn) | ||
389 | { | ||
390 | unsigned index; | ||
391 | struct hlist_head *bucket; | ||
392 | struct kvm_mmu_page *page; | ||
393 | struct hlist_node *node; | ||
394 | |||
395 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
396 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
397 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
398 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
399 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
400 | pgprintk("%s: found role %x\n", | ||
401 | __FUNCTION__, page->role.word); | ||
402 | return page; | ||
403 | } | ||
404 | return NULL; | ||
405 | } | ||
406 | |||
407 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
408 | gfn_t gfn, | ||
409 | gva_t gaddr, | ||
410 | unsigned level, | ||
411 | int metaphysical, | ||
412 | u64 *parent_pte) | ||
413 | { | ||
414 | union kvm_mmu_page_role role; | ||
415 | unsigned index; | ||
416 | unsigned quadrant; | ||
417 | struct hlist_head *bucket; | ||
418 | struct kvm_mmu_page *page; | ||
419 | struct hlist_node *node; | ||
420 | |||
421 | role.word = 0; | ||
422 | role.glevels = vcpu->mmu.root_level; | ||
423 | role.level = level; | ||
424 | role.metaphysical = metaphysical; | ||
425 | if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { | ||
426 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
427 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
428 | role.quadrant = quadrant; | ||
429 | } | ||
430 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
431 | gfn, role.word); | ||
432 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
433 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
434 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
435 | if (page->gfn == gfn && page->role.word == role.word) { | ||
436 | mmu_page_add_parent_pte(page, parent_pte); | ||
437 | pgprintk("%s: found\n", __FUNCTION__); | ||
438 | return page; | ||
439 | } | ||
440 | page = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
441 | if (!page) | ||
442 | return page; | ||
443 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
444 | page->gfn = gfn; | ||
445 | page->role = role; | ||
446 | hlist_add_head(&page->hash_link, bucket); | ||
447 | return page; | ||
448 | } | ||
449 | |||
450 | static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, | ||
451 | struct kvm_mmu_page *page, | ||
452 | u64 *parent_pte) | ||
453 | { | ||
454 | mmu_page_remove_parent_pte(page, parent_pte); | ||
455 | } | ||
456 | |||
313 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) | 457 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) |
314 | { | 458 | { |
315 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); | 459 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); |
@@ -389,11 +533,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
389 | for (; ; level--) { | 533 | for (; ; level--) { |
390 | u32 index = PT64_INDEX(v, level); | 534 | u32 index = PT64_INDEX(v, level); |
391 | u64 *table; | 535 | u64 *table; |
536 | u64 pte; | ||
392 | 537 | ||
393 | ASSERT(VALID_PAGE(table_addr)); | 538 | ASSERT(VALID_PAGE(table_addr)); |
394 | table = __va(table_addr); | 539 | table = __va(table_addr); |
395 | 540 | ||
396 | if (level == 1) { | 541 | if (level == 1) { |
542 | pte = table[index]; | ||
543 | if (is_present_pte(pte) && is_writeble_pte(pte)) | ||
544 | return 0; | ||
397 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | 545 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); |
398 | page_header_update_slot(vcpu->kvm, table, v); | 546 | page_header_update_slot(vcpu->kvm, table, v); |
399 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | 547 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | |
@@ -404,8 +552,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
404 | 552 | ||
405 | if (table[index] == 0) { | 553 | if (table[index] == 0) { |
406 | struct kvm_mmu_page *new_table; | 554 | struct kvm_mmu_page *new_table; |
555 | gfn_t pseudo_gfn; | ||
407 | 556 | ||
408 | new_table = kvm_mmu_alloc_page(vcpu, &table[index]); | 557 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) |
558 | >> PAGE_SHIFT; | ||
559 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
560 | v, level - 1, | ||
561 | 1, &table[index]); | ||
409 | if (!new_table) { | 562 | if (!new_table) { |
410 | pgprintk("nonpaging_map: ENOMEM\n"); | 563 | pgprintk("nonpaging_map: ENOMEM\n"); |
411 | return -ENOMEM; | 564 | return -ENOMEM; |
@@ -427,7 +580,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
427 | hpa_t root = vcpu->mmu.root_hpa; | 580 | hpa_t root = vcpu->mmu.root_hpa; |
428 | 581 | ||
429 | ASSERT(VALID_PAGE(root)); | 582 | ASSERT(VALID_PAGE(root)); |
430 | release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL); | ||
431 | vcpu->mmu.root_hpa = INVALID_PAGE; | 583 | vcpu->mmu.root_hpa = INVALID_PAGE; |
432 | return; | 584 | return; |
433 | } | 585 | } |
@@ -437,7 +589,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
437 | 589 | ||
438 | ASSERT(VALID_PAGE(root)); | 590 | ASSERT(VALID_PAGE(root)); |
439 | root &= PT64_BASE_ADDR_MASK; | 591 | root &= PT64_BASE_ADDR_MASK; |
440 | release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1); | ||
441 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | 592 | vcpu->mmu.pae_root[i] = INVALID_PAGE; |
442 | } | 593 | } |
443 | vcpu->mmu.root_hpa = INVALID_PAGE; | 594 | vcpu->mmu.root_hpa = INVALID_PAGE; |
@@ -446,13 +597,16 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
446 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | 597 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) |
447 | { | 598 | { |
448 | int i; | 599 | int i; |
600 | gfn_t root_gfn; | ||
601 | root_gfn = vcpu->cr3 >> PAGE_SHIFT; | ||
449 | 602 | ||
450 | #ifdef CONFIG_X86_64 | 603 | #ifdef CONFIG_X86_64 |
451 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 604 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { |
452 | hpa_t root = vcpu->mmu.root_hpa; | 605 | hpa_t root = vcpu->mmu.root_hpa; |
453 | 606 | ||
454 | ASSERT(!VALID_PAGE(root)); | 607 | ASSERT(!VALID_PAGE(root)); |
455 | root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa; | 608 | root = kvm_mmu_get_page(vcpu, root_gfn, 0, |
609 | PT64_ROOT_LEVEL, 0, NULL)->page_hpa; | ||
456 | vcpu->mmu.root_hpa = root; | 610 | vcpu->mmu.root_hpa = root; |
457 | return; | 611 | return; |
458 | } | 612 | } |
@@ -461,7 +615,13 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
461 | hpa_t root = vcpu->mmu.pae_root[i]; | 615 | hpa_t root = vcpu->mmu.pae_root[i]; |
462 | 616 | ||
463 | ASSERT(!VALID_PAGE(root)); | 617 | ASSERT(!VALID_PAGE(root)); |
464 | root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa; | 618 | if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) |
619 | root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; | ||
620 | else if (vcpu->mmu.root_level == 0) | ||
621 | root_gfn = 0; | ||
622 | root = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
623 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
624 | NULL)->page_hpa; | ||
465 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; | 625 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; |
466 | } | 626 | } |
467 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); | 627 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); |
@@ -529,7 +689,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
529 | context->inval_page = nonpaging_inval_page; | 689 | context->inval_page = nonpaging_inval_page; |
530 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 690 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
531 | context->free = nonpaging_free; | 691 | context->free = nonpaging_free; |
532 | context->root_level = PT32E_ROOT_LEVEL; | 692 | context->root_level = 0; |
533 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 693 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
534 | mmu_alloc_roots(vcpu); | 694 | mmu_alloc_roots(vcpu); |
535 | ASSERT(VALID_PAGE(context->root_hpa)); | 695 | ASSERT(VALID_PAGE(context->root_hpa)); |
@@ -537,29 +697,18 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
537 | return 0; | 697 | return 0; |
538 | } | 698 | } |
539 | 699 | ||
540 | |||
541 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 700 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
542 | { | 701 | { |
543 | struct kvm_mmu_page *page, *npage; | ||
544 | |||
545 | list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, | ||
546 | link) { | ||
547 | if (page->global) | ||
548 | continue; | ||
549 | |||
550 | if (!page->parent_pte) | ||
551 | continue; | ||
552 | |||
553 | *page->parent_pte = 0; | ||
554 | release_pt_page_64(vcpu, page->page_hpa, 1); | ||
555 | } | ||
556 | ++kvm_stat.tlb_flush; | 702 | ++kvm_stat.tlb_flush; |
557 | kvm_arch_ops->tlb_flush(vcpu); | 703 | kvm_arch_ops->tlb_flush(vcpu); |
558 | } | 704 | } |
559 | 705 | ||
560 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 706 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
561 | { | 707 | { |
708 | mmu_free_roots(vcpu); | ||
709 | mmu_alloc_roots(vcpu); | ||
562 | kvm_mmu_flush_tlb(vcpu); | 710 | kvm_mmu_flush_tlb(vcpu); |
711 | kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); | ||
563 | } | 712 | } |
564 | 713 | ||
565 | static void mark_pagetable_nonglobal(void *shadow_pte) | 714 | static void mark_pagetable_nonglobal(void *shadow_pte) |
@@ -578,6 +727,16 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, | |||
578 | *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; | 727 | *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; |
579 | if (!dirty) | 728 | if (!dirty) |
580 | access_bits &= ~PT_WRITABLE_MASK; | 729 | access_bits &= ~PT_WRITABLE_MASK; |
730 | if (access_bits & PT_WRITABLE_MASK) { | ||
731 | struct kvm_mmu_page *shadow; | ||
732 | |||
733 | shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT); | ||
734 | if (shadow) | ||
735 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
736 | __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT)); | ||
737 | if (shadow) | ||
738 | access_bits &= ~PT_WRITABLE_MASK; | ||
739 | } | ||
581 | 740 | ||
582 | if (access_bits & PT_WRITABLE_MASK) | 741 | if (access_bits & PT_WRITABLE_MASK) |
583 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | 742 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); |
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 11cac9ddf26a..f7cce443ca6f 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h | |||
@@ -32,6 +32,11 @@ | |||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK | 34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK |
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #else | ||
38 | #define PT_MAX_FULL_LEVELS 2 | ||
39 | #endif | ||
35 | #elif PTTYPE == 32 | 40 | #elif PTTYPE == 32 |
36 | #define pt_element_t u32 | 41 | #define pt_element_t u32 |
37 | #define guest_walker guest_walker32 | 42 | #define guest_walker guest_walker32 |
@@ -42,6 +47,7 @@ | |||
42 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 47 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
43 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 48 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
44 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK | 49 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK |
50 | #define PT_MAX_FULL_LEVELS 2 | ||
45 | #else | 51 | #else |
46 | #error Invalid PTTYPE value | 52 | #error Invalid PTTYPE value |
47 | #endif | 53 | #endif |
@@ -52,7 +58,7 @@ | |||
52 | */ | 58 | */ |
53 | struct guest_walker { | 59 | struct guest_walker { |
54 | int level; | 60 | int level; |
55 | gfn_t table_gfn; | 61 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; |
56 | pt_element_t *table; | 62 | pt_element_t *table; |
57 | pt_element_t *ptep; | 63 | pt_element_t *ptep; |
58 | pt_element_t inherited_ar; | 64 | pt_element_t inherited_ar; |
@@ -68,7 +74,9 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
68 | struct kvm_memory_slot *slot; | 74 | struct kvm_memory_slot *slot; |
69 | pt_element_t *ptep; | 75 | pt_element_t *ptep; |
70 | pt_element_t root; | 76 | pt_element_t root; |
77 | gfn_t table_gfn; | ||
71 | 78 | ||
79 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
72 | walker->level = vcpu->mmu.root_level; | 80 | walker->level = vcpu->mmu.root_level; |
73 | walker->table = NULL; | 81 | walker->table = NULL; |
74 | root = vcpu->cr3; | 82 | root = vcpu->cr3; |
@@ -81,8 +89,11 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
81 | --walker->level; | 89 | --walker->level; |
82 | } | 90 | } |
83 | #endif | 91 | #endif |
84 | walker->table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 92 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
85 | slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn); | 93 | walker->table_gfn[walker->level - 1] = table_gfn; |
94 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
95 | walker->level - 1, table_gfn); | ||
96 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
86 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | 97 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); |
87 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); | 98 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); |
88 | 99 | ||
@@ -111,12 +122,15 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
111 | 122 | ||
112 | if (walker->level != 3 || is_long_mode(vcpu)) | 123 | if (walker->level != 3 || is_long_mode(vcpu)) |
113 | walker->inherited_ar &= walker->table[index]; | 124 | walker->inherited_ar &= walker->table[index]; |
114 | walker->table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 125 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; |
115 | paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); | 126 | paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); |
116 | kunmap_atomic(walker->table, KM_USER0); | 127 | kunmap_atomic(walker->table, KM_USER0); |
117 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), | 128 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), |
118 | KM_USER0); | 129 | KM_USER0); |
119 | --walker->level; | 130 | --walker->level; |
131 | walker->table_gfn[walker->level - 1 ] = table_gfn; | ||
132 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
133 | walker->level - 1, table_gfn); | ||
120 | } | 134 | } |
121 | walker->ptep = ptep; | 135 | walker->ptep = ptep; |
122 | } | 136 | } |
@@ -181,6 +195,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
181 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; | 195 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; |
182 | struct kvm_mmu_page *shadow_page; | 196 | struct kvm_mmu_page *shadow_page; |
183 | u64 shadow_pte; | 197 | u64 shadow_pte; |
198 | int metaphysical; | ||
199 | gfn_t table_gfn; | ||
184 | 200 | ||
185 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | 201 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { |
186 | if (level == PT_PAGE_TABLE_LEVEL) | 202 | if (level == PT_PAGE_TABLE_LEVEL) |
@@ -205,7 +221,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
205 | return shadow_ent; | 221 | return shadow_ent; |
206 | } | 222 | } |
207 | 223 | ||
208 | shadow_page = kvm_mmu_alloc_page(vcpu, shadow_ent); | 224 | if (level - 1 == PT_PAGE_TABLE_LEVEL |
225 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
226 | metaphysical = 1; | ||
227 | table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) | ||
228 | >> PAGE_SHIFT; | ||
229 | } else { | ||
230 | metaphysical = 0; | ||
231 | table_gfn = walker->table_gfn[level - 2]; | ||
232 | } | ||
233 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
234 | metaphysical, shadow_ent); | ||
209 | if (!shadow_page) | 235 | if (!shadow_page) |
210 | return ERR_PTR(-ENOMEM); | 236 | return ERR_PTR(-ENOMEM); |
211 | shadow_addr = shadow_page->page_hpa; | 237 | shadow_addr = shadow_page->page_hpa; |
@@ -227,7 +253,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
227 | u64 *shadow_ent, | 253 | u64 *shadow_ent, |
228 | struct guest_walker *walker, | 254 | struct guest_walker *walker, |
229 | gva_t addr, | 255 | gva_t addr, |
230 | int user) | 256 | int user, |
257 | int *write_pt) | ||
231 | { | 258 | { |
232 | pt_element_t *guest_ent; | 259 | pt_element_t *guest_ent; |
233 | int writable_shadow; | 260 | int writable_shadow; |
@@ -264,6 +291,12 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
264 | } | 291 | } |
265 | 292 | ||
266 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 293 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
294 | if (kvm_mmu_lookup_page(vcpu, gfn)) { | ||
295 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
296 | __FUNCTION__, gfn); | ||
297 | *write_pt = 1; | ||
298 | return 0; | ||
299 | } | ||
267 | mark_page_dirty(vcpu->kvm, gfn); | 300 | mark_page_dirty(vcpu->kvm, gfn); |
268 | *shadow_ent |= PT_WRITABLE_MASK; | 301 | *shadow_ent |= PT_WRITABLE_MASK; |
269 | *guest_ent |= PT_DIRTY_MASK; | 302 | *guest_ent |= PT_DIRTY_MASK; |
@@ -294,7 +327,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
294 | struct guest_walker walker; | 327 | struct guest_walker walker; |
295 | u64 *shadow_pte; | 328 | u64 *shadow_pte; |
296 | int fixed; | 329 | int fixed; |
330 | int write_pt = 0; | ||
297 | 331 | ||
332 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
298 | /* | 333 | /* |
299 | * Look up the shadow pte for the faulting address. | 334 | * Look up the shadow pte for the faulting address. |
300 | */ | 335 | */ |
@@ -302,6 +337,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
302 | FNAME(walk_addr)(&walker, vcpu, addr); | 337 | FNAME(walk_addr)(&walker, vcpu, addr); |
303 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); | 338 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); |
304 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ | 339 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ |
340 | printk("%s: oom\n", __FUNCTION__); | ||
305 | nonpaging_flush(vcpu); | 341 | nonpaging_flush(vcpu); |
306 | FNAME(release_walker)(&walker); | 342 | FNAME(release_walker)(&walker); |
307 | continue; | 343 | continue; |
@@ -313,20 +349,27 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
313 | * The page is not mapped by the guest. Let the guest handle it. | 349 | * The page is not mapped by the guest. Let the guest handle it. |
314 | */ | 350 | */ |
315 | if (!shadow_pte) { | 351 | if (!shadow_pte) { |
352 | pgprintk("%s: not mapped\n", __FUNCTION__); | ||
316 | inject_page_fault(vcpu, addr, error_code); | 353 | inject_page_fault(vcpu, addr, error_code); |
317 | FNAME(release_walker)(&walker); | 354 | FNAME(release_walker)(&walker); |
318 | return 0; | 355 | return 0; |
319 | } | 356 | } |
320 | 357 | ||
358 | pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, | ||
359 | shadow_pte, *shadow_pte); | ||
360 | |||
321 | /* | 361 | /* |
322 | * Update the shadow pte. | 362 | * Update the shadow pte. |
323 | */ | 363 | */ |
324 | if (write_fault) | 364 | if (write_fault) |
325 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, | 365 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, |
326 | user_fault); | 366 | user_fault, &write_pt); |
327 | else | 367 | else |
328 | fixed = fix_read_pf(shadow_pte); | 368 | fixed = fix_read_pf(shadow_pte); |
329 | 369 | ||
370 | pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, | ||
371 | shadow_pte, *shadow_pte); | ||
372 | |||
330 | FNAME(release_walker)(&walker); | 373 | FNAME(release_walker)(&walker); |
331 | 374 | ||
332 | /* | 375 | /* |
@@ -344,14 +387,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
344 | /* | 387 | /* |
345 | * pte not present, guest page fault. | 388 | * pte not present, guest page fault. |
346 | */ | 389 | */ |
347 | if (pte_present && !fixed) { | 390 | if (pte_present && !fixed && !write_pt) { |
348 | inject_page_fault(vcpu, addr, error_code); | 391 | inject_page_fault(vcpu, addr, error_code); |
349 | return 0; | 392 | return 0; |
350 | } | 393 | } |
351 | 394 | ||
352 | ++kvm_stat.pf_fixed; | 395 | ++kvm_stat.pf_fixed; |
353 | 396 | ||
354 | return 0; | 397 | return write_pt; |
355 | } | 398 | } |
356 | 399 | ||
357 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 400 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
@@ -395,3 +438,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | |||
395 | #undef PT_PTE_COPY_MASK | 438 | #undef PT_PTE_COPY_MASK |
396 | #undef PT_NON_PTE_COPY_MASK | 439 | #undef PT_NON_PTE_COPY_MASK |
397 | #undef PT_DIR_BASE_ADDR_MASK | 440 | #undef PT_DIR_BASE_ADDR_MASK |
441 | #undef PT_MAX_FULL_LEVELS | ||