author		Andrea Arcangeli <aarcange@redhat.com>	2011-01-13 18:46:48 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-13 20:32:41 -0500
commit		936a5fe6e6148c0b3ea0d792b903847d9b9931a1 (patch)
tree		f0326a5877a89251ff2cb727d5e832e911ed1d18
parent		47ad8475c000141eacb3ecda5e5ce4b43a9cd04d (diff)
thp: kvm mmu transparent hugepage support

This should work for both hugetlbfs and transparent hugepages.

[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	arch/x86/kvm/mmu.c		91
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	 9
-rw-r--r--	include/linux/page-flags.h	12
-rw-r--r--	virt/kvm/kvm_main.c		32
4 files changed, 125 insertions(+), 19 deletions(-)
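
For orientation before the diff itself: the core of the change is transparent_hugepage_adjust(), which, when the page backing a 4k guest frame turns out to be part of a transparent hugepage, rounds both the guest frame number and the host frame number down to the hugepage boundary so the fault can be mapped at the 2MB level. A minimal user-space sketch of that alignment step follows; the names, the 512-pages-per-hugepage constant and the simplified level handling are illustrative assumptions, not code from the commit.

#include <inttypes.h>
#include <stdio.h>

/* Illustrative only: a 2MB hugepage covers 512 4KB base pages on x86. */
#define PAGES_PER_HPAGE 512ULL

/*
 * Rough analogue of transparent_hugepage_adjust(): if the guest frame
 * and the host frame are misaligned by the same offset inside a
 * hugepage, round both down to the hugepage boundary so a single
 * 2MB mapping can cover the whole range.
 */
static void hugepage_adjust(uint64_t *gfn, uint64_t *pfn, int *level)
{
	uint64_t mask = PAGES_PER_HPAGE - 1;

	if ((*gfn & mask) != (*pfn & mask))
		return;			/* offsets differ: keep the 4KB mapping */

	*level = 2;			/* PT_DIRECTORY_LEVEL in the real code */
	*gfn &= ~mask;
	*pfn &= ~mask;
}

int main(void)
{
	uint64_t gfn = 0x12345, pfn = 0xabd45;
	int level = 1;

	hugepage_adjust(&gfn, &pfn, &level);
	printf("gfn=0x%" PRIx64 " pfn=0x%" PRIx64 " level=%d\n", gfn, pfn, level);
	return 0;
}

Where the kernel function uses VM_BUG_ON to assert that gfn and pfn share the same offset inside the hugepage, the sketch simply bails out.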
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb499813..47b2c3288b6b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
 	struct kvm_memory_slot *slot;
-	int host_level, level, max_level;
-
 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 	if (slot && slot->dirty_bitmap)
-		return PT_PAGE_TABLE_LEVEL;
+		return true;
+	return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	int host_level, level, max_level;
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
+	int level = *levelp;
+
+	/*
+	 * Check if it's a transparent hugepage. If this would be an
+	 * hugetlbfs page, level wouldn't be set to
+	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+	 * here.
+	 */
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    PageTransCompound(pfn_to_page(pfn)) &&
+	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+		unsigned long mask;
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn to tail to
+		 * head.
+		 */
+		*levelp = level = PT_DIRECTORY_LEVEL;
+		mask = KVM_PAGES_PER_HPAGE(level) - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*gfnp = gfn;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			if (!get_page_unless_zero(pfn_to_page(pfn)))
+				BUG();
+			*pfnp = pfn;
+		}
+	}
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
 	int r;
 	int level;
+	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable;
 
-	level = mapping_level(vcpu, gfn);
-
-	/*
-	 * This path builds a PAE pagetable - so we can map 2mb pages at
-	 * maximum. Therefore check if the level is larger than that.
-	 */
-	if (level > PT_DIRECTORY_LEVEL)
-		level = PT_DIRECTORY_LEVEL;
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		/*
+		 * This path builds a PAE pagetable - so we can map
+		 * 2mb pages at maximum. Therefore check if the level
+		 * is larger than that.
+		 */
+		if (level > PT_DIRECTORY_LEVEL)
+			level = PT_DIRECTORY_LEVEL;
 
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	int r;
 	int level;
+	int force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	level = mapping_level(vcpu, gfn);
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c2..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
+	int force_pt_level;
 	unsigned long mmu_seq;
 	bool map_writable;
 
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
-	if (walker.level >= PT_DIRECTORY_LEVEL) {
+	if (walker.level >= PT_DIRECTORY_LEVEL)
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+	else
+		force_pt_level = 1;
+	if (!force_pt_level) {
 		level = min(walker.level, mapping_level(vcpu, walker.gfn));
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
+	if (!force_pt_level)
+		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4835cae71047..907f1605926b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransCompound(struct page *page)
+{
+	return PageCompound(page);
+}
+#else
+static inline int PageTransCompound(struct page *page)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1 << PG_mlocked)
 #else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f686251f711..85ab7db0d366 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
-		return PageReserved(page);
+		struct page *head;
+		struct page *tail = pfn_to_page(pfn);
+		head = compound_head(tail);
+		if (head != tail) {
+			smp_rmb();
+			/*
+			 * head may be a dangling pointer.
+			 * __split_huge_page_refcount clears PageTail
+			 * before overwriting first_page, so if
+			 * PageTail is still there it means the head
+			 * pointer isn't dangling.
+			 */
+			if (PageTail(tail)) {
+				/*
+				 * the "head" is not a dangling
+				 * pointer but the hugepage may have
+				 * been splitted from under us (and we
+				 * may not hold a reference count on
+				 * the head page so it can be reused
+				 * before we run PageReferenced), so
+				 * we've to recheck PageTail before
+				 * returning what we just read.
+				 */
+				int reserved = PageReserved(head);
+				smp_rmb();
+				if (PageTail(tail))
+					return reserved;
+			}
+		}
+		return PageReserved(tail);
 	}
 
 	return true;
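
The kvm_is_mmio_pfn() change above relies on a check/read/recheck pattern instead of taking a reference on the compound head. The sketch below (plain user-space C, every name invented for illustration) shows only the control flow of that pattern; the real code additionally orders the loads with smp_rmb(), which is omitted here.

#include <stdbool.h>
#include <stdio.h>

/* Stands in for the head page: holds the flag the reader wants. */
struct demo_head {
	bool reserved;			/* analogue of PageReserved(head) */
};

/* Stands in for the tail page: a guard flag plus a pointer to the head. */
struct demo_tail {
	bool is_tail;			/* analogue of PageTail(tail) */
	struct demo_head *head;		/* analogue of tail->first_page */
};

/*
 * Read through the head pointer without holding a reference on it:
 * the value read is trusted only if the guard flag was set both
 * before and after the read.
 */
static bool read_reserved(struct demo_tail *tail, bool *reserved)
{
	if (!tail->is_tail)
		return false;		/* not a tail page: nothing to read */

	bool val = tail->head->reserved;	/* speculative read through head */

	if (tail->is_tail) {		/* head was still valid for the read */
		*reserved = val;
		return true;
	}
	return false;			/* a concurrent split raced with us */
}

int main(void)
{
	struct demo_head head = { .reserved = true };
	struct demo_tail tail = { .is_tail = true, .head = &head };
	bool reserved;

	if (read_reserved(&tail, &reserved))
		printf("reserved=%d\n", reserved);
	return 0;
}

If the recheck fails, the patch simply falls back to the per-page information, returning PageReserved(tail), which is always safe to read.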