author	Paolo Bonzini <pbonzini@redhat.com>	2018-10-22 20:36:47 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2018-12-14 06:34:19 -0500
commit	2a31b9db153530df4aa02dac8c32837bf5f47019 (patch)
tree	0cd6fe156ec696e6a55a0d7117794f590ec76958 /virt
parent	8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 (diff)
kvm: introduce manual dirty log reprotect
There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this:

1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write-protects them.
2. The guest modifies the pages, causing them to be marked dirty.
3. Userspace actually copies the pages.
4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they have not been written to since (3).

This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring a sync of the full memslot; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
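For illustration only (not part of the commit): a minimal userspace sketch of the two-step flow this series enables. The helper name, vm_fd, slot, slot_pages and bitmap are hypothetical; the struct and ioctl names are the ones introduced by this series, and the example assumes the documented semantics where bit 0 of the bitmap passed to KVM_CLEAR_DIRTY_LOG corresponds to first_page and only num_pages bits are consumed. Clearing (and thus re-protecting) each 64-page chunk right before copying it is what shrinks the window in which false positives can accumulate.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: vm_fd, slot, slot_pages and bitmap are illustrative
 * names, not taken from the patch. bitmap must cover the whole memslot. */
static int harvest_dirty_pages(int vm_fd, __u32 slot, __u64 slot_pages,
			       unsigned long *bitmap)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
		.args = { 1 },
	};
	struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
	struct kvm_clear_dirty_log clear;
	__u64 first;

	/* Opt in once per VM; afterwards KVM_GET_DIRTY_LOG no longer
	 * write-protects the pages it reports. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;

	/* Snapshot the dirty bitmap for the whole memslot. */
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get) < 0)
		return -1;

	/* Re-protect and harvest in 64-page chunks, just before copying each
	 * chunk; first_page and num_pages must be multiples of 64 with this
	 * patch. A tail shorter than 64 pages is not handled in this sketch. */
	for (first = 0; first + 64 <= slot_pages; first += 64) {
		memset(&clear, 0, sizeof(clear));
		clear.slot = slot;
		clear.first_page = first;
		clear.num_pages = 64;
		/* Bit 0 of this buffer corresponds to first_page. */
		clear.dirty_bitmap = &bitmap[first / (8 * sizeof(long))];
		if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear) < 0)
			return -1;
		/* ... copy the guest pages reported dirty in this chunk ... */
	}
	return 0;
}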
Diffstat (limited to 'virt')
-rw-r--r--	virt/kvm/arm/arm.c	16
-rw-r--r--	virt/kvm/kvm_main.c	132
2 files changed, 131 insertions(+), 17 deletions(-)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 120a2663dab9..e91adf77d99a 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54f0fcfd431e..0041947b7390 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- * are dirty write protect them for next write.
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
+ *	and reenable dirty page tracking for the corresponding pages.
  * @kvm:	pointer to kvm instance
  * @log:	slot id and address to which we copy the log
  * @is_dirty:	flag set if any page is dirty
@@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		return -ENOENT;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	*flush = false;
+	if (kvm->manual_dirty_log_protect) {
+		/*
+		 * Unlike kvm_get_dirty_log, we always return false in *flush,
+		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
+		 * is some code duplication between this function and
+		 * kvm_get_dirty_log, but hopefully all architecture
+		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+		 * can be eliminated.
+		 */
+		dirty_bitmap_buffer = dirty_bitmap;
+	} else {
+		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+		memset(dirty_bitmap_buffer, 0, n);
 
-	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-	memset(dirty_bitmap_buffer, 0, n);
+		spin_lock(&kvm->mmu_lock);
+		for (i = 0; i < n / sizeof(long); i++) {
+			unsigned long mask;
+			gfn_t offset;
 
-	spin_lock(&kvm->mmu_lock);
+			if (!dirty_bitmap[i])
+				continue;
+
+			*flush = true;
+			mask = xchg(&dirty_bitmap[i], 0);
+			dirty_bitmap_buffer[i] = mask;
+
+			if (mask) {
+				offset = i * BITS_PER_LONG;
+				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+									offset, mask);
+			}
+		}
+		spin_unlock(&kvm->mmu_lock);
+	}
+
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *	and reenable dirty page tracking for the corresponding pages.
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address from which to fetch the bitmap of dirty pages
+ */
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int as_id, id, n;
+	gfn_t offset;
+	unsigned long i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return -EINVAL;
+
+	if ((log->first_page & 63) || (log->num_pages & 63))
+		return -EINVAL;
+
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	if (!dirty_bitmap)
+		return -ENOENT;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
 	*flush = false;
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
+	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+		return -EFAULT;
 
-		if (!dirty_bitmap[i])
+	spin_lock(&kvm->mmu_lock);
+	for (offset = log->first_page,
+	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	     i++, offset += BITS_PER_LONG) {
+		unsigned long mask = *dirty_bitmap_buffer++;
+		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+		if (!mask)
 			continue;
 
-		*flush = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
+		mask &= atomic_long_fetch_andnot(mask, p);
 
+		/*
+		 * mask contains the bits that really have been cleared. This
+		 * never includes any bits beyond the length of the memslot (if
+		 * the length is not aligned to 64 pages), therefore it is not
+		 * a problem if userspace sets them in log->dirty_bitmap.
+		 */
 		if (mask) {
-			offset = i * BITS_PER_LONG;
+			*flush = true;
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
 	}
-
 	spin_unlock(&kvm->mmu_lock);
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		return -EFAULT;
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
 #endif
 
 bool kvm_largepages_enabled(void)
@@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+#endif
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 					   struct kvm_enable_cap *cap)
 {
 	switch (cap->cap) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+		if (cap->flags || (cap->args[0] & ~1))
+			return -EINVAL;
+		kvm->manual_dirty_log_protect = cap->args[0];
+		return 0;
+#endif
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
@@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CLEAR_DIRTY_LOG: {
+		struct kvm_clear_dirty_log log;
+
+		r = -EFAULT;
+		if (copy_from_user(&log, argp, sizeof(log)))
+			goto out;
+		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+		break;
+	}
+#endif
 #ifdef CONFIG_KVM_MMIO
 	case KVM_REGISTER_COALESCED_MMIO: {
 		struct kvm_coalesced_mmio_zone zone;