author    Andrea Arcangeli <aarcange@redhat.com>    2011-01-13 18:47:10 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:46 -0500
commit    8ee53820edfd1f3b6554c593f337148dd3d7fc91 (patch)
tree      ca69957e928cd3efa1b47f92dcfb00591702684c
parent    4b7167b9ff9b7f3f528cbc4c7d02ebd275b9b10c (diff)
thp: mmu_notifier_test_young
For GRU and EPT, we need gup-fast to set the referenced bit too (this is why it's correct to return 0 when shadow_accessed_mask is zero: in that case gup-fast is required to set the referenced bit). A qemu-kvm access already sets the young bit in the pte if it isn't zero-copy; if it's zero-copy, or a shadow-paging/EPT minor fault, we rely on gup-fast to signal that the page is in use...

We also need to check the young bits on the secondary pagetables for NPT (but not for nested shadow MMU), as the data may never get accessed again through the primary pte.

Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage-sized virtual regions where not even a single subpage is in use.

->test_young is fully backwards compatible with GRU and other users that have no young bits set by hardware in their pagetables and that should nuke the secondary MMU mappings when ->clear_flush_young runs, just like EPT does.

Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page entirely probably wouldn't be so bad either, but I thought it was worth keeping, and this makes it reliable.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
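The backwards-compatibility claim is easiest to see from the driver side: a secondary MMU whose hardware sets no young bits simply leaves ->test_young NULL, __mmu_notifier_test_young() skips it, and the referenced signal for such users keeps coming from gup-fast's SetPageReferenced(). A minimal sketch of such a GRU-like registration, assuming a hypothetical gru_like driver (gru_like_teardown() is an invented stand-in; only the mmu_notifier_ops wiring is the API this patch defines):

    #include <linux/mmu_notifier.h>

    /* Invented helper standing in for the driver's real invalidation path. */
    static void gru_like_teardown(struct mm_struct *mm, unsigned long address);

    static int gru_like_clear_flush_young(struct mmu_notifier *mn,
                                          struct mm_struct *mm,
                                          unsigned long address)
    {
            /*
             * Nuke the secondary mapping; the next access refaults
             * through gup-fast, which now calls SetPageReferenced().
             */
            gru_like_teardown(mm, address);
            return 0;       /* no hardware young bit to report */
    }

    static const struct mmu_notifier_ops gru_like_notifier_ops = {
            .clear_flush_young = gru_like_clear_flush_young,
            /*
             * .test_young deliberately left NULL:
             * __mmu_notifier_test_young() skips notifiers without the
             * callback, so this notifier contributes 0 to the query.
             */
    };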
-rw-r--r--  arch/x86/include/asm/kvm_host.h    1
-rw-r--r--  arch/x86/kvm/mmu.c                34
-rw-r--r--  arch/x86/mm/gup.c                  3
-rw-r--r--  include/linux/mmu_notifier.h      26
-rw-r--r--  mm/huge_memory.c                   6
-rw-r--r--  mm/mmu_notifier.c                 20
-rw-r--r--  virt/kvm/kvm_main.c               17
7 files changed, 105 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21a9fba..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 47b2c3288b6b..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	return young;
 }
 
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			      unsigned long data)
+{
+	u64 *spte;
+	int young = 0;
+
+	/*
+	 * If there's no access bit in the secondary pte set by the
+	 * hardware it's up to gup-fast/gup to set the access bit in
+	 * the primary pte or in the page structure.
+	 */
+	if (!shadow_accessed_mask)
+		goto out;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		young = _spte & PT_ACCESSED_MASK;
+		if (young) {
+			young = 1;
+			break;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+out:
+	return young;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 }
 
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 269aa53932e0..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #include <asm/pgtable.h>
 
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 		get_page(page);
+		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
 
@@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	VM_BUG_ON(page != compound_head(page));
 	VM_BUG_ON(page_count(page) == 0);
 	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
 }
 
 static inline void get_huge_page_tail(struct page *page)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index cbfab1e9957d..cc2e7dfea9d7 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,16 @@ struct mmu_notifier_ops {
 			  unsigned long address);
 
 	/*
+	 * test_young is called to check the young/accessed bitflag in
+	 * the secondary pte. This is used to know if the page is
+	 * frequently used without actually clearing the flag or tearing
+	 * down the secondary mapping on the page.
+	 */
+	int (*test_young)(struct mmu_notifier *mn,
+			  struct mm_struct *mm,
+			  unsigned long address);
+
+	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
 	 */
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern int __mmu_notifier_test_young(struct mm_struct *mm,
+				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_young(mm, address);
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
@@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 915809b16edf..39d7df40c067 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON(PageLRU(page));
 
 		/* If there is no mapped pte young don't collapse the page */
-		if (pte_young(pteval))
+		if (pte_young(pteval) || PageReferenced(page) ||
+		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
 	if (unlikely(!referenced))
@@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		/* cannot use mapcount: can't collapse if there's a gup pin */
 		if (page_count(page) != 1)
 			goto out_unmap;
-		if (pte_young(pteval))
+		if (pte_young(pteval) || PageReferenced(page) ||
+		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
 	if (referenced)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_test_young(struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->test_young) {
+			young = mn->ops->test_young(mn, mm, address);
+			if (young)
+				break;
+		}
+	}
+	rcu_read_unlock();
+
+	return young;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 			       pte_t pte)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 85ab7db0d366..4286d4766510 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	young = kvm_test_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
@@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
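End to end, a khugepaged scan now resolves youth through mmu_notifier_test_young() -> kvm_mmu_notifier_test_young() -> kvm_test_age_hva() -> kvm_test_age_rmapp(), reading the EPT/NPT accessed bits without clearing them or tearing anything down. Distilled into one predicate, the heuristic the huge_memory.c hunks implement looks like this (a sketch for clarity, not code from the patch):

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>
    #include <linux/page-flags.h>

    /*
     * A subpage counts as referenced if any of the three signals fire:
     * the primary pte young bit, the page flag set by gup-fast, or the
     * secondary pagetable accessed bit reported by the new notifier.
     */
    static int subpage_referenced(struct vm_area_struct *vma,
                                  unsigned long address,
                                  pte_t pteval, struct page *page)
    {
            return pte_young(pteval) ||
                   PageReferenced(page) ||
                   mmu_notifier_test_young(vma->vm_mm, address);
    }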