author:    Paul Mackerras <paulus@samba.org>  2014-05-26 05:48:39 -0400
committer: Alexander Graf <agraf@suse.de>  2014-05-30 08:26:29 -0400
commit:    6c576e74fd91b93ca1eedcd9eb5200171d2ba32b (patch)
tree:      6b7563c0ef1a9b9f1166ea3b594220de96c7981a /arch/powerpc/kvm
parent:    687414bebe30d59c766b682cf86b1c5fa92d7af9 (diff)
KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
Currently, when testing whether a page is dirty (when constructing the bitmap
for the KVM_GET_DIRTY_LOG ioctl), we test the C (changed) bit in the HPT
entries mapping the page, and if it is 0, we consider the page to be clean.
However, the Power ISA doesn't require processors to set the C bit to 1
immediately when writing to a page, and in fact allows them to delay the
writeback of the C bit until they receive a TLB invalidation for the page.
Thus it is possible that the page could be dirty and we miss it.

Now, if there are vcpus running, this is not serious since the collection of
the dirty log is racy already - some vcpu could dirty the page just after we
check it.  But if there are no vcpus running we should return definitive
results, in case we are in the final phase of migrating the guest.

Also, if the permission bits in the HPTE don't allow writing, then we know
that no CPU can set C.  If the HPTE was previously writable and the page was
modified, any C bit writeback would have been flushed out by the tlbie that
we did when changing the HPTE to read-only.

Otherwise we need to do a TLB invalidation even if the C bit is 0, and then
check the C bit.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
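The decision the commit message describes can be read as a small predicate:
given the current C bit, the HPTE's writability, and whether any vcpus are
running, may the scan skip the TLB invalidation and trust C == 0 to mean
clean?  The following is a minimal illustrative sketch in plain C, not the
kernel code; the function name and the boolean parameters are hypothetical
stand-ins for the HPTE bit tests used in the diff below.

#include <stdbool.h>

/* Sketch only: when can C == 0 be trusted without doing a tlbie? */
static bool can_skip_tlbie(bool c_bit, bool writable, bool vcpus_running)
{
        if (c_bit)
                return false;   /* already known dirty; C must be cleared */
        if (!writable)
                return true;    /* no CPU can set C, and the tlbie done when
                                   the HPTE went read-only flushed any
                                   delayed C-bit writeback */
        if (vcpus_running)
                return true;    /* the dirty log is racy anyway; a later
                                   pass picks up delayed writeback */
        return false;           /* no vcpus: tlbie even though C == 0, then
                                   re-check C for a definitive answer */
}

This matches the new skip condition in the patch: an entry is skipped only
when C is clear and either the HPTE is read-only or vcpus are running.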
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 47
1 file changed, 37 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 96c90447d4bf..80561074078d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1060,6 +1060,11 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
         kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 }
 
+static int vcpus_running(struct kvm *kvm)
+{
+        return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
 /*
  * Returns the number of system pages that are dirty.
  * This can be more than 1 if we find a huge-page HPTE.
@@ -1069,6 +1074,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
         struct revmap_entry *rev = kvm->arch.revmap;
         unsigned long head, i, j;
         unsigned long n;
+        unsigned long v, r;
         unsigned long *hptep;
         int npages_dirty = 0;
 
@@ -1088,7 +1094,22 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                 j = rev[i].forw;
 
-                if (!(hptep[1] & HPTE_R_C))
+                /*
+                 * Checking the C (changed) bit here is racy since there
+                 * is no guarantee about when the hardware writes it back.
+                 * If the HPTE is not writable then it is stable since the
+                 * page can't be written to, and we would have done a tlbie
+                 * (which forces the hardware to complete any writeback)
+                 * when making the HPTE read-only.
+                 * If vcpus are running then this call is racy anyway
+                 * since the page could get dirtied subsequently, so we
+                 * expect there to be a further call which would pick up
+                 * any delayed C bit writeback.
+                 * Otherwise we need to do the tlbie even if C==0 in
+                 * order to pick up any delayed writeback of C.
+                 */
+                if (!(hptep[1] & HPTE_R_C) &&
+                    (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
                         continue;
 
                 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1100,23 +1121,29 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                 }
 
                 /* Now check and modify the HPTE */
-                if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
-                        /* need to make it temporarily absent to clear C */
-                        hptep[0] |= HPTE_V_ABSENT;
-                        kvmppc_invalidate_hpte(kvm, hptep, i);
-                        hptep[1] &= ~HPTE_R_C;
-                        eieio();
-                        hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+                if (!(hptep[0] & HPTE_V_VALID))
+                        continue;
+
+                /* need to make it temporarily absent so C is stable */
+                hptep[0] |= HPTE_V_ABSENT;
+                kvmppc_invalidate_hpte(kvm, hptep, i);
+                v = hptep[0];
+                r = hptep[1];
+                if (r & HPTE_R_C) {
+                        hptep[1] = r & ~HPTE_R_C;
                         if (!(rev[i].guest_rpte & HPTE_R_C)) {
                                 rev[i].guest_rpte |= HPTE_R_C;
                                 note_hpte_modification(kvm, &rev[i]);
                         }
-                        n = hpte_page_size(hptep[0], hptep[1]);
+                        n = hpte_page_size(v, r);
                         n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
                         if (n > npages_dirty)
                                 npages_dirty = n;
+                        eieio();
                 }
-                hptep[0] &= ~HPTE_V_HVLOCK;
+                v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+                v |= HPTE_V_VALID;
+                hptep[0] = v;
         } while ((i = j) != head);
 
         unlock_rmap(rmapp);
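For entries the scan cannot skip, the last hunk makes the C bit stable
before reading it: mark the HPTE absent, invalidate it (forcing completion
of any pending C-bit writeback), then read and clear C, and finally restore
the entry.  A condensed, annotated restatement of that sequence follows;
invalidate() is a hypothetical stand-in for kvmppc_invalidate_hpte(), and
this is a sketch, not the kernel code.

/* Stable-C protocol from the hunk above, sketch only. */
hptep[0] |= HPTE_V_ABSENT;      /* take the translation out of service */
invalidate(hptep);              /* hypothetical: tlbie + sync; afterwards any
                                   delayed C-bit writeback has completed */
v = hptep[0];                   /* snapshot V and R while C is stable */
r = hptep[1];
if (r & HPTE_R_C)
        hptep[1] = r & ~HPTE_R_C;       /* page was dirty: clear C */
v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
v |= HPTE_V_VALID;
hptep[0] = v;                   /* restore the entry and drop the lock */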