author	Marcelo Tosatti <marcelo@kvack.org>	2008-02-23 09:44:30 -0500
committer	Avi Kivity <avi@qumranet.com>	2008-04-27 04:53:25 -0400
commit	05da45583de9b383dc81dd695fe248431d6c9f2b (patch)
tree	a76d699e60aca4f775d5f67254214654235e2e17
parent	2e53d63acba75795aa226febd140f67c58c6a353 (diff)
KVM: MMU: large page support

Create large page mappings if the guest PTEs are marked as such and the
underlying memory is hugetlbfs backed.  If the large page contains
write-protected pages, a large pte is not used.

Gives a consistent 2% improvement for data copies on a ram-mounted
filesystem, without NPT/EPT.

Anthony measures a 4% improvement on 4-way kernbench, with NPT.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
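For orientation (not part of the patch): the hugetlbfs condition above is satisfied entirely by userspace, which only has to register a memory slot whose backing mmap comes from a hugetlbfs file. A minimal, illustrative sketch using the existing KVM_SET_USER_MEMORY_REGION ioctl, assuming a hugetlbfs mount at /dev/hugepages and a vm_fd obtained via KVM_CREATE_VM:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Back a guest memory slot with hugetlbfs so that host_largepage_backed()
 * (added below) sees a hugetlb vma.  size must be a multiple of the host
 * huge page size; flags is 0 because dirty logging disables large pages. */
static void *map_guest_ram(int vm_fd, unsigned long size)
{
	struct kvm_userspace_memory_region region;
	void *ram;
	int fd;

	fd = open("/dev/hugepages/guest-ram", O_CREAT | O_RDWR, 0600);
	if (fd < 0 || ftruncate(fd, size) < 0)
		return NULL;
	ram = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ram == MAP_FAILED)
		return NULL;

	region.slot = 0;
	region.flags = 0;
	region.guest_phys_addr = 0;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)ram;
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		return NULL;
	return ram;
}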
-rw-r--r--	arch/x86/kvm/mmu.c	222
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	32
-rw-r--r--	arch/x86/kvm/x86.c	1
-rw-r--r--	include/asm-x86/kvm_host.h	9
-rw-r--r--	include/linux/kvm_host.h	5
-rw-r--r--	virt/kvm/kvm_main.c	25
6 files changed, 262 insertions(+), 32 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 103d008dab8b..1932a3aeda1d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte)
 	       && pte != shadow_notrap_nonpresent_pte;
 }
 
+static int is_large_pte(u64 pte)
+{
+	return pte & PT_PAGE_SIZE_MASK;
+}
+
 static int is_writeble_pte(unsigned long pte)
 {
 	return pte & PT_WRITABLE_MASK;
@@ -350,16 +356,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 }
 
 /*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+	unsigned long idx;
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+	return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count += 1;
+	WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count -= 1;
+	WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int *largepage_idx;
+
+	if (slot) {
+		largepage_idx = slot_largepage_idx(gfn, slot);
+		return *largepage_idx;
+	}
+
+	return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return 0;
+
+	vma = find_vma(current->mm, addr);
+	if (vma && is_vm_hugetlb_page(vma))
+		return 1;
+
+	return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	if (has_wrprotected_page(vcpu->kvm, large_gfn))
+		return 0;
+
+	if (!host_largepage_backed(vcpu->kvm, large_gfn))
+		return 0;
+
+	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+	if (slot && slot->dirty_bitmap)
+		return 0;
+
+	return 1;
+}
+
+/*
  * Take gfn and return the reverse mapping to it.
  * Note: gfn must be unaliased before this function get called
  */
 
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long idx;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return &slot->rmap[gfn - slot->base_gfn];
+	if (!lpage)
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+	return &slot->lpage_info[idx].rmap_pde;
 }
 
 /*
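The index arithmetic in slot_largepage_idx() and gfn_to_rmap() floors gfn and base_gfn separately before subtracting, so it stays correct for slots that do not start on a huge page boundary. A standalone sketch of the same math, assuming the x86 value KVM_PAGES_PER_HPAGE == 512 introduced later in this patch:

#include <assert.h>

#define KVM_PAGES_PER_HPAGE 512	/* 2 MiB / 4 KiB; see kvm_host.h below */

/* Mirrors slot_largepage_idx()'s index computation, for illustration only. */
static unsigned long lpage_idx(unsigned long gfn, unsigned long base_gfn)
{
	return gfn / KVM_PAGES_PER_HPAGE - base_gfn / KVM_PAGES_PER_HPAGE;
}

int main(void)
{
	/*
	 * A slot starting at gfn 1000 is not 2 MiB aligned (1000 % 512 != 0),
	 * yet gfn 1600 still lands in the right bucket:
	 * 1600/512 = 3, 1000/512 = 1, so idx = 2.
	 */
	assert(lpage_idx(1600, 1000) == 2);
	return 0;
}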
@@ -371,7 +461,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
  * containing more mappings.
  */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_desc *desc;
@@ -383,7 +473,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp->gfns[spte - sp->spt] = gfn;
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 		*rmapp = (unsigned long)spte;
@@ -449,7 +539,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		kvm_release_page_dirty(page);
 	else
 		kvm_release_page_clean(page);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
@@ -515,7 +605,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	int write_protected = 0;
 
 	gfn = unalias_gfn(kvm, gfn);
-	rmapp = gfn_to_rmap(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn, 0);
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -528,8 +618,27 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
+	/* check for huge page mappings */
+	rmapp = gfn_to_rmap(kvm, gfn, 1);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+		if (is_writeble_pte(*spte)) {
+			rmap_remove(kvm, spte);
+			--kvm->stat.lpages;
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
+
+	account_shadowed(kvm, gfn);
 }
 
 #ifdef MMU_DEBUG
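Taken together, account_shadowed()/unaccount_shadowed() and the has_wrprotected_page() check maintain one invariant: while any page inside a 2 MB frame is shadowed (and therefore write-protected), no large spte may map that frame. A toy model of the invariant, not kernel code:

#include <assert.h>

/* One lpage_info entry's write_count, modelled as a plain counter. */
static int write_count;

static void account_shadowed(void)   { write_count++; }
static void unaccount_shadowed(void) { write_count--; }
static int  may_map_large(void)      { return write_count == 0; }

int main(void)
{
	assert(may_map_large());
	account_shadowed();		/* a guest page table in this 2 MB frame
					   got shadowed; rmap_write_protect() ran */
	assert(!may_map_large());	/* is_largepage_backed() now fails */
	unaccount_shadowed();		/* the shadow page was zapped */
	assert(may_map_large());
	return 0;
}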
@@ -747,11 +856,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 		ent = pt[i];
 
+		if (is_shadow_present_pte(ent)) {
+			if (!is_large_pte(ent)) {
+				ent &= PT64_BASE_ADDR_MASK;
+				mmu_page_remove_parent_pte(page_header(ent),
+							   &pt[i]);
+			} else {
+				--kvm->stat.lpages;
+				rmap_remove(kvm, &pt[i]);
+			}
+		}
 		pt[i] = shadow_trap_nonpresent_pte;
-		if (!is_shadow_present_pte(ent))
-			continue;
-		ent &= PT64_BASE_ADDR_MASK;
-		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -791,6 +906,8 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 	kvm_mmu_page_unlink_children(kvm, sp);
 	if (!sp->root_count) {
+		if (!sp->role.metaphysical)
+			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
@@ -894,7 +1011,8 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn, struct page *page)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 struct page *page)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -907,15 +1025,29 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		 write_fault, user_fault, gfn);
 
 	if (is_rmap_pte(*shadow_pte)) {
-		if (host_pfn != page_to_pfn(page)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (host_pfn != page_to_pfn(page)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 host_pfn, page_to_pfn(page));
 			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
 		}
-		else
-			was_rmapped = 1;
 	}
 
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -930,6 +1062,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
+	if (largepage)
+		spte |= PT_PAGE_SIZE_MASK;
 
 	spte |= page_to_phys(page);
 
@@ -944,7 +1078,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	}
 
 	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-	if (shadow) {
+	if (shadow ||
+	   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 		pgprintk("%s: found shadow page for %lx, marking ro\n",
 			 __FUNCTION__, gfn);
 		pte_access &= ~ACC_WRITE_MASK;
@@ -963,10 +1098,17 @@ unshadowed:
 	mark_page_dirty(vcpu->kvm, gfn);
 
 	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
 	set_shadow_pte(shadow_pte, spte);
+	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+	    && (spte & PT_PRESENT_MASK))
+		++vcpu->kvm->stat.lpages;
+
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn);
+		rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
 			kvm_release_page_clean(page);
 	} else {
@@ -984,7 +1126,8 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			gfn_t gfn, struct page *page, int level)
+			int largepage, gfn_t gfn, struct page *page,
+			int level)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -998,7 +1141,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn, page);
+				     0, write, 1, &pt_write, 0, gfn, page);
+			return pt_write;
+		}
+
+		if (largepage && level == 2) {
+			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+				     0, write, 1, &pt_write, 1, gfn, page);
 			return pt_write;
 		}
 
@@ -1027,12 +1176,18 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
+	int largepage = 0;
 
 	struct page *page;
 
 	down_read(&vcpu->kvm->slots_lock);
 
 	down_read(&current->mm->mmap_sem);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -1045,7 +1200,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, page,
+			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	up_read(&vcpu->kvm->slots_lock);
@@ -1180,6 +1336,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 {
 	struct page *page;
 	int r;
+	int largepage = 0;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1189,7 +1347,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		return r;
 
 	down_read(&current->mm->mmap_sem);
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+	page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
 		up_read(&current->mm->mmap_sem);
@@ -1198,7 +1360,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+			 largepage, gfn, page, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	up_read(&current->mm->mmap_sem);
 
@@ -1397,7 +1559,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+		    is_large_pte(pte))
 			rmap_remove(vcpu->kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1405,6 +1568,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 		}
 	}
 	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	if (is_large_pte(pte))
+		--vcpu->kvm->stat.lpages;
 }
 
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
@@ -1412,7 +1577,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  u64 *spte,
 				  const void *new)
 {
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+	if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+	    && !vcpu->arch.update_pte.largepage) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
 		return;
 	}
@@ -1460,6 +1626,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	u64 gpte = 0;
 	struct page *page;
 
+	vcpu->arch.update_pte.largepage = 0;
+
 	if (bytes != 4 && bytes != 8)
 		return;
 
@@ -1487,9 +1655,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&vcpu->kvm->slots_lock);
+	down_read(&current->mm->mmap_sem);
+	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		vcpu->arch.update_pte.largepage = 1;
+	}
 	page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&vcpu->kvm->slots_lock);
+	up_read(&current->mm->mmap_sem);
 
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4b55f462e2b3..17f9d160ca34 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -248,6 +248,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	struct page *npage;
+	int largepage = vcpu->arch.update_pte.largepage;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -264,7 +265,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	get_page(npage);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     npage);
 }
 
 /*
@@ -272,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int *ptwrite,
-			 struct page *page)
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, struct page *page)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -301,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
 		if (level == PT_PAGE_TABLE_LEVEL)
 			break;
-		if (is_shadow_present_pte(*shadow_ent)) {
+
+		if (largepage && level == PT_DIRECTORY_LEVEL)
+			break;
+
+		if (is_shadow_present_pte(*shadow_ent)
+		    && !is_large_pte(*shadow_ent)) {
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
 			continue;
 		}
 
+		if (is_large_pte(*shadow_ent))
+			rmap_remove(vcpu->kvm, shadow_ent);
+
 		if (level - 1 == PT_PAGE_TABLE_LEVEL
 		    && walker->level == PT_DIRECTORY_LEVEL) {
 			metaphysical = 1;
@@ -339,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn, page);
+		     ptwrite, largepage, walker->gfn, page);
 
 	return shadow_ent;
 }
@@ -369,6 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int write_pt = 0;
 	int r;
 	struct page *page;
+	int largepage = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -396,6 +407,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	down_read(&current->mm->mmap_sem);
+	if (walker.level == PT_DIRECTORY_LEVEL) {
+		gfn_t large_gfn;
+		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+		if (is_largepage_backed(vcpu, large_gfn)) {
+			walker.gfn = large_gfn;
+			largepage = 1;
+		}
+	}
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -410,7 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt, page);
+				  largepage, &write_pt, page);
+
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
 		 shadow_pte, *shadow_pte, write_pt);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8e64927bddc..0458bd516185 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
88 { "mmu_recycled", VM_STAT(mmu_recycled) }, 88 { "mmu_recycled", VM_STAT(mmu_recycled) },
89 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 89 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
90 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 90 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
91 { "largepages", VM_STAT(lpages) },
91 { NULL } 92 { NULL }
92}; 93};
93 94
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 8c3f74b73524..95473ef5a906 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -39,6 +39,13 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
 #define DE_VECTOR 0
 #define UD_VECTOR 6
 #define NM_VECTOR 7
@@ -230,6 +237,7 @@ struct kvm_vcpu_arch {
 	struct {
 		gfn_t gfn;	/* presumed gfn during guest pte update */
 		struct page *page;	/* page corresponding to that gfn */
+		int largepage;
 	} update_pte;
 
 	struct i387_fxsave_struct host_fx_image;
@@ -307,6 +315,7 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 remote_tlb_flush;
+	u32 lpages;
 };
 
 struct kvm_vcpu_stat {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 994278fb5883..9750bb3c5a75 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -103,6 +103,10 @@ struct kvm_memory_slot {
 	unsigned long flags;
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
+	struct {
+		unsigned long rmap_pde;
+		int write_count;
+	} *lpage_info;
 	unsigned long userspace_addr;
 	int user_alloc;
 };
@@ -169,6 +173,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c41eb57ce29b..31db9b4d3016 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,9 +212,13 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		vfree(free->dirty_bitmap);
 
+	if (!dont || free->lpage_info != dont->lpage_info)
+		vfree(free->lpage_info);
+
 	free->npages = 0;
 	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
+	free->lpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -324,6 +328,25 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 	}
+	if (npages && !new.lpage_info) {
+		int largepages = npages / KVM_PAGES_PER_HPAGE;
+		if (npages % KVM_PAGES_PER_HPAGE)
+			largepages++;
+		if (base_gfn % KVM_PAGES_PER_HPAGE)
+			largepages++;
+
+		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+
+		if (!new.lpage_info)
+			goto out_free;
+
+		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+
+		if (base_gfn % KVM_PAGES_PER_HPAGE)
+			new.lpage_info[0].write_count = 1;
+		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
+			new.lpage_info[largepages-1].write_count = 1;
+	}
 
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
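The rounding in the lpage_info sizing can be checked by hand: an unaligned head and a partial tail each cost one extra entry, and those boundary entries are created with write_count = 1 so a partial huge frame is never mapped by a large pte. A sketch of the same arithmetic with assumed numbers:

#include <assert.h>

#define KVM_PAGES_PER_HPAGE 512

/* Mirrors the sizing logic in __kvm_set_memory_region(), for illustration. */
static int lpage_entries(unsigned long base_gfn, unsigned long npages)
{
	int largepages = npages / KVM_PAGES_PER_HPAGE;

	if (npages % KVM_PAGES_PER_HPAGE)
		largepages++;
	if (base_gfn % KVM_PAGES_PER_HPAGE)
		largepages++;
	return largepages;
}

int main(void)
{
	/* gfns 1000..2099 touch huge frames 1 through 4:
	 * 1100/512 = 2, +1 for the unaligned tail, +1 for the unaligned head. */
	assert(lpage_entries(1000, 1100) == 4);
	/* An aligned slot of exactly two huge frames needs no extra entries. */
	assert(lpage_entries(1024, 1024) == 2);
	return 0;
}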
@@ -467,7 +490,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 