author		Marcelo Tosatti <marcelo@kvack.org>	2008-02-23 09:44:30 -0500
committer	Avi Kivity <avi@qumranet.com>	2008-04-27 04:53:25 -0400
commit		05da45583de9b383dc81dd695fe248431d6c9f2b (patch)
tree		a76d699e60aca4f775d5f67254214654235e2e17
parent		2e53d63acba75795aa226febd140f67c58c6a353 (diff)
KVM: MMU: large page support
Create large page mappings if the guest PTEs are marked as such and
the underlying memory is hugetlbfs backed. If the largepage contains
write-protected pages, a large pte is not used.
Gives a consistent 2% improvement for data copies on a RAM-mounted
filesystem, without NPT/EPT.
Anthony measured a 4% improvement on 4-way kernbench, with NPT.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
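As a rough illustration of the decision described above, the fault paths touched by this patch behave like the sketch below. This is not part of the patch itself: is_largepage_backed(), KVM_PAGES_PER_HPAGE and __direct_map() are the helpers the patch adds or extends, while the wrapper function here is invented purely for explanation.

/* Illustrative only: how a fault path picks a 2MB shadow mapping.
 * The wrapper is hypothetical; the helpers come from this patch. */
static int example_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int largepage = 0;

	/* A large spte is used only if the whole 2MB frame qualifies:
	 * no write-protected (shadowed) page inside it, hugetlbfs-backed
	 * memory on the host, and no dirty logging on the slot. */
	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE - 1))) {
		gfn &= ~(KVM_PAGES_PER_HPAGE - 1);	/* round down to the 2MB frame */
		largepage = 1;
	}

	/* __direct_map() then installs the spte at the PDE level (with
	 * PT_PAGE_SIZE_MASK set) instead of descending to a 4kB PTE. */
	return __direct_map(vcpu, gpa, write, largepage, gfn,
			    gfn_to_page(vcpu->kvm, gfn), PT32E_ROOT_LEVEL);
}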
-rw-r--r--	arch/x86/kvm/mmu.c         | 222
-rw-r--r--	arch/x86/kvm/paging_tmpl.h |  32
-rw-r--r--	arch/x86/kvm/x86.c         |   1
-rw-r--r--	include/asm-x86/kvm_host.h |   9
-rw-r--r--	include/linux/kvm_host.h   |   5
-rw-r--r--	virt/kvm/kvm_main.c        |  25
6 files changed, 262 insertions, 32 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 103d008dab8b..1932a3aeda1d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte)
 		&& pte != shadow_notrap_nonpresent_pte;
 }
 
+static int is_large_pte(u64 pte)
+{
+	return pte & PT_PAGE_SIZE_MASK;
+}
+
 static int is_writeble_pte(unsigned long pte)
 {
 	return pte & PT_WRITABLE_MASK;
@@ -350,16 +356,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 }
 
 /*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+	unsigned long idx;
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+	return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count += 1;
+	WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count -= 1;
+	WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int *largepage_idx;
+
+	if (slot) {
+		largepage_idx = slot_largepage_idx(gfn, slot);
+		return *largepage_idx;
+	}
+
+	return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return 0;
+
+	vma = find_vma(current->mm, addr);
+	if (vma && is_vm_hugetlb_page(vma))
+		return 1;
+
+	return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	if (has_wrprotected_page(vcpu->kvm, large_gfn))
+		return 0;
+
+	if (!host_largepage_backed(vcpu->kvm, large_gfn))
+		return 0;
+
+	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+	if (slot && slot->dirty_bitmap)
+		return 0;
+
+	return 1;
+}
+
+/*
  * Take gfn and return the reverse mapping to it.
  * Note: gfn must be unaliased before this function get called
  */
 
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long idx;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return &slot->rmap[gfn - slot->base_gfn];
+	if (!lpage)
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+	return &slot->lpage_info[idx].rmap_pde;
 }
 
 /*
@@ -371,7 +461,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
  * containing more mappings.
  */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_desc *desc;
@@ -383,7 +473,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp->gfns[spte - sp->spt] = gfn;
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 		*rmapp = (unsigned long)spte;
@@ -449,7 +539,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		kvm_release_page_dirty(page);
 	else
 		kvm_release_page_clean(page);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
@@ -515,7 +605,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	int write_protected = 0;
 
 	gfn = unalias_gfn(kvm, gfn);
-	rmapp = gfn_to_rmap(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn, 0);
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -528,8 +618,27 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
+	/* check for huge page mappings */
+	rmapp = gfn_to_rmap(kvm, gfn, 1);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+		if (is_writeble_pte(*spte)) {
+			rmap_remove(kvm, spte);
+			--kvm->stat.lpages;
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
+
+	account_shadowed(kvm, gfn);
 }
 
 #ifdef MMU_DEBUG
@@ -747,11 +856,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 		ent = pt[i];
 
+		if (is_shadow_present_pte(ent)) {
+			if (!is_large_pte(ent)) {
+				ent &= PT64_BASE_ADDR_MASK;
+				mmu_page_remove_parent_pte(page_header(ent),
+							   &pt[i]);
+			} else {
+				--kvm->stat.lpages;
+				rmap_remove(kvm, &pt[i]);
+			}
+		}
 		pt[i] = shadow_trap_nonpresent_pte;
-		if (!is_shadow_present_pte(ent))
-			continue;
-		ent &= PT64_BASE_ADDR_MASK;
-		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -791,6 +906,8 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 	kvm_mmu_page_unlink_children(kvm, sp);
 	if (!sp->root_count) {
+		if (!sp->role.metaphysical)
+			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
@@ -894,7 +1011,8 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn, struct page *page)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 struct page *page)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -907,15 +1025,29 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		 write_fault, user_fault, gfn);
 
 	if (is_rmap_pte(*shadow_pte)) {
-		if (host_pfn != page_to_pfn(page)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (host_pfn != page_to_pfn(page)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 host_pfn, page_to_pfn(page));
 			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
 		}
-		else
-			was_rmapped = 1;
 	}
 
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -930,6 +1062,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
+	if (largepage)
+		spte |= PT_PAGE_SIZE_MASK;
 
 	spte |= page_to_phys(page);
 
@@ -944,7 +1078,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 
 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (shadow ||
+		    (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __FUNCTION__, gfn);
 			pte_access &= ~ACC_WRITE_MASK;
@@ -963,10 +1098,17 @@ unshadowed:
 		mark_page_dirty(vcpu->kvm, gfn);
 
 	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
 	set_shadow_pte(shadow_pte, spte);
+	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+	    && (spte & PT_PRESENT_MASK))
+		++vcpu->kvm->stat.lpages;
+
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn);
+		rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
 			kvm_release_page_clean(page);
 	} else {
@@ -984,7 +1126,8 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			gfn_t gfn, struct page *page, int level)
+			int largepage, gfn_t gfn, struct page *page,
+			int level)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -998,7 +1141,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn, page);
+				     0, write, 1, &pt_write, 0, gfn, page);
+			return pt_write;
+		}
+
+		if (largepage && level == 2) {
+			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+				     0, write, 1, &pt_write, 1, gfn, page);
 			return pt_write;
 		}
 
@@ -1027,12 +1176,18 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
+	int largepage = 0;
 
 	struct page *page;
 
 	down_read(&vcpu->kvm->slots_lock);
 
 	down_read(&current->mm->mmap_sem);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -1045,7 +1200,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, page,
+			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	up_read(&vcpu->kvm->slots_lock);
@@ -1180,6 +1336,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 {
 	struct page *page;
 	int r;
+	int largepage = 0;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1189,7 +1347,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		return r;
 
 	down_read(&current->mm->mmap_sem);
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+	page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
 		up_read(&current->mm->mmap_sem);
@@ -1198,7 +1360,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+			 largepage, gfn, page, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	up_read(&current->mm->mmap_sem);
 
@@ -1397,7 +1559,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+		    is_large_pte(pte))
 			rmap_remove(vcpu->kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1405,6 +1568,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 		}
 	}
 	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	if (is_large_pte(pte))
+		--vcpu->kvm->stat.lpages;
 }
 
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
@@ -1412,7 +1577,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  u64 *spte,
 				  const void *new)
 {
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+	if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+	    && !vcpu->arch.update_pte.largepage) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
 		return;
 	}
@@ -1460,6 +1626,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	u64 gpte = 0;
 	struct page *page;
 
+	vcpu->arch.update_pte.largepage = 0;
+
 	if (bytes != 4 && bytes != 8)
 		return;
 
@@ -1487,9 +1655,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&vcpu->kvm->slots_lock);
+	down_read(&current->mm->mmap_sem);
+	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		vcpu->arch.update_pte.largepage = 1;
+	}
 	page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&vcpu->kvm->slots_lock);
+	up_read(&current->mm->mmap_sem);
 
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4b55f462e2b3..17f9d160ca34 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -248,6 +248,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	struct page *npage;
+	int largepage = vcpu->arch.update_pte.largepage;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -264,7 +265,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	get_page(npage);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     npage);
 }
 
 /*
@@ -272,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int *ptwrite,
-			 struct page *page)
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, struct page *page)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -301,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
 		if (level == PT_PAGE_TABLE_LEVEL)
 			break;
-		if (is_shadow_present_pte(*shadow_ent)) {
+
+		if (largepage && level == PT_DIRECTORY_LEVEL)
+			break;
+
+		if (is_shadow_present_pte(*shadow_ent)
+		    && !is_large_pte(*shadow_ent)) {
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
 			continue;
 		}
 
+		if (is_large_pte(*shadow_ent))
+			rmap_remove(vcpu->kvm, shadow_ent);
+
 		if (level - 1 == PT_PAGE_TABLE_LEVEL
 		    && walker->level == PT_DIRECTORY_LEVEL) {
 			metaphysical = 1;
@@ -339,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn, page);
+		     ptwrite, largepage, walker->gfn, page);
 
 	return shadow_ent;
 }
@@ -369,6 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int write_pt = 0;
 	int r;
 	struct page *page;
+	int largepage = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -396,6 +407,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	down_read(&current->mm->mmap_sem);
+	if (walker.level == PT_DIRECTORY_LEVEL) {
+		gfn_t large_gfn;
+		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+		if (is_largepage_backed(vcpu, large_gfn)) {
+			walker.gfn = large_gfn;
+			largepage = 1;
+		}
+	}
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -410,7 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt, page);
+				  largepage, &write_pt, page);
+
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
 		 shadow_pte, *shadow_pte, write_pt);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8e64927bddc..0458bd516185 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
 };
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 8c3f74b73524..95473ef5a906 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -39,6 +39,13 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
 #define DE_VECTOR 0
 #define UD_VECTOR 6
 #define NM_VECTOR 7
@@ -230,6 +237,7 @@ struct kvm_vcpu_arch {
 	struct {
 		gfn_t gfn;	/* presumed gfn during guest pte update */
 		struct page *page;	/* page corresponding to that gfn */
+		int largepage;
 	} update_pte;
 
 	struct i387_fxsave_struct host_fx_image;
@@ -307,6 +315,7 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 remote_tlb_flush;
+	u32 lpages;
 };
 
 struct kvm_vcpu_stat {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 994278fb5883..9750bb3c5a75 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -103,6 +103,10 @@ struct kvm_memory_slot {
 	unsigned long flags;
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
+	struct {
+		unsigned long rmap_pde;
+		int write_count;
+	} *lpage_info;
 	unsigned long userspace_addr;
 	int user_alloc;
 };
@@ -169,6 +173,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c41eb57ce29b..31db9b4d3016 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,9 +212,13 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		vfree(free->dirty_bitmap);
 
+	if (!dont || free->lpage_info != dont->lpage_info)
+		vfree(free->lpage_info);
+
 	free->npages = 0;
 	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
+	free->lpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -324,6 +328,25 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 	}
+	if (npages && !new.lpage_info) {
+		int largepages = npages / KVM_PAGES_PER_HPAGE;
+		if (npages % KVM_PAGES_PER_HPAGE)
+			largepages++;
+		if (base_gfn % KVM_PAGES_PER_HPAGE)
+			largepages++;
+
+		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+
+		if (!new.lpage_info)
+			goto out_free;
+
+		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+
+		if (base_gfn % KVM_PAGES_PER_HPAGE)
+			new.lpage_info[0].write_count = 1;
+		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
+			new.lpage_info[largepages-1].write_count = 1;
+	}
 
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
@@ -467,7 +490,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 