author     Andrea Arcangeli <aarcange@redhat.com>            2011-01-13 18:46:48 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:41 -0500
commit     936a5fe6e6148c0b3ea0d792b903847d9b9931a1
tree       f0326a5877a89251ff2cb727d5e832e911ed1d18
parent     47ad8475c000141eacb3ecda5e5ce4b43a9cd04d
thp: kvm mmu transparent hugepage support
This should work for both hugetlbfs and transparent hugepages.
[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/kvm/mmu.c         | 91
-rw-r--r--  arch/x86/kvm/paging_tmpl.h |  9
-rw-r--r--  include/linux/page-flags.h | 12
-rw-r--r--  virt/kvm/kvm_main.c        | 32
4 files changed, 125 insertions, 19 deletions
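All three fault paths touched below (nonpaging_map(), tdp_page_fault() and FNAME(page_fault)) gain the same shape: first ask mapping_level_dirty_bitmap() whether dirty logging forces 4k mappings; if not, pick the level from the hugetlbfs-aware mapping_level() as before; and only once mmu_notifier_retry() has succeeded under mmu_lock, let transparent_hugepage_adjust() promote a 4k pfn that turns out to live inside a transparent hugepage. A condensed restatement of that shared shape, stitched together from the hunks below (not compilable on its own; the surrounding fault handling is elided):

	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
	if (likely(!force_pt_level)) {
		level = mapping_level(vcpu, gfn);	/* hugetlbfs-aware, as before */
		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
	} else
		level = PT_PAGE_TABLE_LEVEL;

	/* ... pfn lookup, then under mmu_lock ... */

	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	if (likely(!force_pt_level))
		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
	/* map at 'level' with the (possibly re-aligned) gfn/pfn */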
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb499813..47b2c3288b6b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
 	struct kvm_memory_slot *slot;
-	int host_level, level, max_level;
-
 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 	if (slot && slot->dirty_bitmap)
-		return PT_PAGE_TABLE_LEVEL;
+		return true;
+	return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	int host_level, level, max_level;
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
+	int level = *levelp;
+
+	/*
+	 * Check if it's a transparent hugepage. If this would be an
+	 * hugetlbfs page, level wouldn't be set to
+	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+	 * here.
+	 */
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    PageTransCompound(pfn_to_page(pfn)) &&
+	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+		unsigned long mask;
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn to tail to
+		 * head.
+		 */
+		*levelp = level = PT_DIRECTORY_LEVEL;
+		mask = KVM_PAGES_PER_HPAGE(level) - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*gfnp = gfn;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			if (!get_page_unless_zero(pfn_to_page(pfn)))
+				BUG();
+			*pfnp = pfn;
+		}
+	}
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
 	int r;
 	int level;
+	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable;
 
-	level = mapping_level(vcpu, gfn);
-
-	/*
-	 * This path builds a PAE pagetable - so we can map 2mb pages at
-	 * maximum. Therefore check if the level is larger than that.
-	 */
-	if (level > PT_DIRECTORY_LEVEL)
-		level = PT_DIRECTORY_LEVEL;
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		/*
+		 * This path builds a PAE pagetable - so we can map
+		 * 2mb pages at maximum. Therefore check if the level
+		 * is larger than that.
+		 */
+		if (level > PT_DIRECTORY_LEVEL)
+			level = PT_DIRECTORY_LEVEL;
 
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	int r;
 	int level;
+	int force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	level = mapping_level(vcpu, gfn);
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
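The heart of transparent_hugepage_adjust() is the mask arithmetic: the gfn and the pfn must share the same offset inside the 2MB region (the VM_BUG_ON above), so masking both down retargets the fault at the hugepage head while keeping the guest/host correspondence intact. A stand-alone illustration of just that arithmetic (user-space; the 512-pages-per-2MB value is hard-coded for x86, and the refcount transfer from tail to head page is left out):

	#include <assert.h>
	#include <stdio.h>

	#define PAGES_PER_HPAGE 512UL		/* 2MB / 4kB on x86 */

	int main(void)
	{
		unsigned long mask = PAGES_PER_HPAGE - 1;
		unsigned long gfn = 0x12345;	/* guest frame inside a 2MB region */
		unsigned long pfn = 0x89d45;	/* host frame backing it; same offset 0x145 */

		/* Mirrors the VM_BUG_ON() in transparent_hugepage_adjust(). */
		assert((gfn & mask) == (pfn & mask));

		if (pfn & mask) {
			/* Re-base both at the hugepage head; the offset is
			 * reintroduced by the 2MB mapping, so nothing is lost. */
			gfn &= ~mask;
			pfn &= ~mask;
		}
		printf("head gfn=%#lx head pfn=%#lx\n", gfn, pfn);
		return 0;
	}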
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c2..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
+	int force_pt_level;
 	unsigned long mmu_seq;
 	bool map_writable;
 
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
-	if (walker.level >= PT_DIRECTORY_LEVEL) {
+	if (walker.level >= PT_DIRECTORY_LEVEL)
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+	else
+		force_pt_level = 1;
+	if (!force_pt_level) {
 		level = min(walker.level, mapping_level(vcpu, walker.gfn));
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
+	if (!force_pt_level)
+		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
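The shadow-paging path differs from the direct-map paths in one respect: the host level is additionally capped by the guest's own page size (walker.level), and a guest 4k pte forces force_pt_level outright. A toy user-space helper condensing that decision (choose_level() and its argument names are made up for illustration; the constants stand for PT_PAGE_TABLE_LEVEL and PT_DIRECTORY_LEVEL):

	#include <stdio.h>

	/* Illustrative only: condenses the level choice in FNAME(page_fault).
	 * guest_level is walker.level, host_level is mapping_level(); both use
	 * 1 for 4k (PT_PAGE_TABLE_LEVEL) and 2 for 2MB (PT_DIRECTORY_LEVEL). */
	static int choose_level(int guest_level, int host_level, int dirty_logging)
	{
		if (guest_level < 2 || dirty_logging)
			return 1;	/* force_pt_level: stay on 4k shadow ptes */
		return guest_level < host_level ? guest_level : host_level;
	}

	int main(void)
	{
		printf("%d\n", choose_level(2, 2, 0));	/* 2: 2MB shadow mapping */
		printf("%d\n", choose_level(1, 2, 0));	/* 1: guest used a 4k pte */
		printf("%d\n", choose_level(2, 2, 1));	/* 1: dirty logging active */
		return 0;
	}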
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4835cae71047..907f1605926b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransCompound(struct page *page)
+{
+	return PageCompound(page);
+}
+#else
+static inline int PageTransCompound(struct page *page)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1 << PG_mlocked)
 #else
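PageTransCompound() is intentionally trivial: it is PageCompound() when CONFIG_TRANSPARENT_HUGEPAGE is set and a constant 0 otherwise, so THP-only branches such as the one in transparent_hugepage_adjust() above compile away entirely on !THP kernels. A user-space sketch of the same compile-time stub pattern (FEATURE and feature_active() are illustrative stand-ins, not kernel names):

	#include <stdio.h>

	/* Stand-in for CONFIG_TRANSPARENT_HUGEPAGE; set to 0 and the guarded
	 * branch below becomes dead code the compiler can drop. */
	#define FEATURE 1

	#if FEATURE
	static inline int feature_active(int x) { return x != 0; }
	#else
	static inline int feature_active(int x) { (void)x; return 0; }
	#endif

	int main(void)
	{
		if (feature_active(42))
			printf("feature path taken\n");
		return 0;
	}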
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f686251f711..85ab7db0d366 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
-		return PageReserved(page);
+		struct page *head;
+		struct page *tail = pfn_to_page(pfn);
+		head = compound_head(tail);
+		if (head != tail) {
+			smp_rmb();
+			/*
+			 * head may be a dangling pointer.
+			 * __split_huge_page_refcount clears PageTail
+			 * before overwriting first_page, so if
+			 * PageTail is still there it means the head
+			 * pointer isn't dangling.
+			 */
+			if (PageTail(tail)) {
+				/*
+				 * the "head" is not a dangling
+				 * pointer but the hugepage may have
+				 * been splitted from under us (and we
+				 * may not hold a reference count on
+				 * the head page so it can be reused
+				 * before we run PageReferenced), so
+				 * we've to recheck PageTail before
+				 * returning what we just read.
+				 */
+				int reserved = PageReserved(head);
+				smp_rmb();
+				if (PageTail(tail))
+					return reserved;
+			}
+		}
+		return PageReserved(tail);
 	}
 
 	return true;
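The rework of kvm_is_mmio_pfn() is a lockless read-and-revalidate sequence: check PageTail, speculatively read PageReserved through the possibly stale head pointer, then recheck PageTail; only if the page was still a tail both times is the speculative value trusted, otherwise the function falls back to PageReserved(tail). A user-space analogue of that ordering, with C11 acquire loads standing in loosely for the smp_rmb() pairs (struct tail_ref and read_via_head() are illustrative names, not KVM code; a false return corresponds to the fall-back path):

	#include <stdatomic.h>
	#include <stdbool.h>

	/* "is_tail" plays the role of PageTail and "head_data" the role of the
	 * head page's PageReserved bit. The writer (think
	 * __split_huge_page_refcount) clears is_tail before it reuses the
	 * head, so a reader that still sees is_tail set after reading
	 * head_data knows the value came from a valid head. */
	struct tail_ref {
		atomic_bool is_tail;
		atomic_int head_data;
	};

	static bool read_via_head(struct tail_ref *t, int *out)
	{
		if (!atomic_load_explicit(&t->is_tail, memory_order_acquire))
			return false;		/* head pointer may already be stale */
		/* speculative read through the possibly dangling head */
		int data = atomic_load_explicit(&t->head_data, memory_order_acquire);
		/* revalidate: only trust the value if it was still a tail afterwards */
		if (!atomic_load_explicit(&t->is_tail, memory_order_acquire))
			return false;
		*out = data;
		return true;
	}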