author		Paolo Bonzini <pbonzini@redhat.com>	2018-10-22 20:36:47 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2018-12-14 06:34:19 -0500
commit		2a31b9db153530df4aa02dac8c32837bf5f47019 (patch)
tree		0cd6fe156ec696e6a55a0d7117794f590ec76958 /virt
parent		8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 (diff)
kvm: introduce manual dirty log reprotect
There are two problems with KVM_GET_DIRTY_LOG. First, and less important,
it can take kvm->mmu_lock for an extended period of time. Second, its user
can actually see many false positives in some cases. The latter is due
to a benign race like this:
1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects
them.
2. The guest modifies the pages, causing them to be marked dirty.
3. Userspace actually copies the pages.
4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though
they were not written to since (3).
This is especially a problem for large guests, where the time between
(1) and (3) can be substantial. This patch introduces a new
capability which, when enabled, makes KVM_GET_DIRTY_LOG not
write-protect the pages it returns. Instead, userspace has to
explicitly clear the dirty log bits just before using the content
of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a
64-page granularity rather than requiring a full memslot to be synced;
this way, the mmu_lock is taken for small amounts of time, and
only a small amount of time passes between the write protection
of pages and the sending of their content.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
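
For reference, the two-phase flow this enables on the userspace side looks
roughly as follows. This is a minimal sketch, not part of the patch: it
assumes KVM_CAP_MANUAL_DIRTY_LOG_PROTECT has already been enabled on the VM
(see the enable-cap hunk below), and vm_fd, slot and slot_pages are
placeholders for values the VMM already tracks.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Sketch of one dirty-log harvesting pass with manual reprotection.
 * 'bitmap' must hold at least slot_pages bits.
 */
static int harvest_dirty_pages(int vm_fd, __u32 slot, __u32 slot_pages,
			       void *bitmap)
{
	struct kvm_dirty_log get = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = 0,		/* must be a multiple of 64 */
		.num_pages = slot_pages,	/* must be a multiple of 64 */
		.dirty_bitmap = bitmap,
	};

	/* Snapshot the log; with the cap enabled this no longer write-protects. */
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get) < 0)
		return -1;

	/*
	 * Reprotect just before consuming the data: a write that races with
	 * the copy below re-dirties the page, so the page is sent again later
	 * instead of showing up as a false positive.
	 */
	if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear) < 0)
		return -1;

	/* ... now copy the content of every page whose bit is set in bitmap ... */
	return 0;
}

Because first_page/num_pages accept any 64-page-aligned sub-range, a VMM can
also issue many small KVM_CLEAR_DIRTY_LOG calls instead of the single
full-slot call shown here, keeping each mmu_lock hold time short.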
Diffstat (limited to 'virt')
-rw-r--r--	virt/kvm/arm/arm.c	 16
-rw-r--r--	virt/kvm/kvm_main.c	132
2 files changed, 131 insertions, 17 deletions
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 120a2663dab9..e91adf77d99a 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54f0fcfd431e..0041947b7390 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- *	are dirty write protect them for next write.
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
+ *	and reenable dirty page tracking for the corresponding pages.
  * @kvm: pointer to kvm instance
  * @log: slot id and address to which we copy the log
  * @is_dirty: flag set if any page is dirty
@@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		return -ENOENT;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	*flush = false;
+	if (kvm->manual_dirty_log_protect) {
+		/*
+		 * Unlike kvm_get_dirty_log, we always return false in *flush,
+		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
+		 * is some code duplication between this function and
+		 * kvm_get_dirty_log, but hopefully all architectures will
+		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+		 * can then be eliminated.
+		 */
+		dirty_bitmap_buffer = dirty_bitmap;
+	} else {
+		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+		memset(dirty_bitmap_buffer, 0, n);
 
-	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-	memset(dirty_bitmap_buffer, 0, n);
+		spin_lock(&kvm->mmu_lock);
+		for (i = 0; i < n / sizeof(long); i++) {
+			unsigned long mask;
+			gfn_t offset;
 
-	spin_lock(&kvm->mmu_lock);
+			if (!dirty_bitmap[i])
+				continue;
+
+			*flush = true;
+			mask = xchg(&dirty_bitmap[i], 0);
+			dirty_bitmap_buffer[i] = mask;
+
+			if (mask) {
+				offset = i * BITS_PER_LONG;
+				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+									offset, mask);
+			}
+		}
+		spin_unlock(&kvm->mmu_lock);
+	}
+
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *	and reenable dirty page tracking for the corresponding pages.
+ * @kvm: pointer to kvm instance
+ * @log: slot id and address from which to fetch the bitmap of dirty pages
+ */
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int as_id, id, n;
+	gfn_t offset;
+	unsigned long i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return -EINVAL;
+
+	if ((log->first_page & 63) || (log->num_pages & 63))
+		return -EINVAL;
+
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	if (!dirty_bitmap)
+		return -ENOENT;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
 	*flush = false;
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
-
-		if (!dirty_bitmap[i])
+	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+		return -EFAULT;
+
+	spin_lock(&kvm->mmu_lock);
+	for (offset = log->first_page,
+	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	     i++, offset += BITS_PER_LONG) {
+		unsigned long mask = *dirty_bitmap_buffer++;
+		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+		if (!mask)
 			continue;
 
-		*flush = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
+		mask &= atomic_long_fetch_andnot(mask, p);
 
+		/*
+		 * mask contains the bits that really have been cleared. This
+		 * never includes any bits beyond the length of the memslot (if
+		 * the length is not aligned to 64 pages), therefore it is not
+		 * a problem if userspace sets them in log->dirty_bitmap.
+		 */
 		if (mask) {
-			offset = i * BITS_PER_LONG;
+			*flush = true;
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
 	}
-
 	spin_unlock(&kvm->mmu_lock);
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		return -EFAULT;
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
 #endif
 
 bool kvm_largepages_enabled(void)
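
The mask &= atomic_long_fetch_andnot(mask, p) line above computes, in one
atomic step, which of the bits userspace asked to clear were actually still
set. A non-atomic userspace model of those semantics, for illustration only
(the kernel primitive performs the read-modify-write atomically):

#include <assert.h>

/* Model of atomic_long_fetch_andnot(): clear the 'mask' bits in *p and
 * return the value *p held beforehand. */
static unsigned long fetch_andnot(unsigned long *p, unsigned long mask)
{
	unsigned long old = *p;
	*p = old & ~mask;
	return old;
}

int main(void)
{
	unsigned long word = 0x0f;	/* pages 0-3 currently dirty */
	unsigned long mask = 0x3c;	/* userspace asks to clear pages 2-5 */

	/* As in kvm_clear_dirty_log_protect(): keep only the bits that were
	 * set and are now cleared (pages 2 and 3 here), so pages 4-5 are not
	 * needlessly reprotected and flushed. */
	mask &= fetch_andnot(&word, mask);
	assert(mask == 0x0c);
	assert(word == 0x03);	/* pages 0-1 stay dirty for the next pass */
	return 0;
}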
@@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+#endif
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 					   struct kvm_enable_cap *cap)
 {
 	switch (cap->cap) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+		if (cap->flags || (cap->args[0] & ~1))
+			return -EINVAL;
+		kvm->manual_dirty_log_protect = cap->args[0];
+		return 0;
+#endif
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
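
On the userspace side, opting in is a plain KVM_ENABLE_CAP on the VM file
descriptor. A hedged sketch (vm_fd is a placeholder); note that the
(cap->flags || (cap->args[0] & ~1)) check above means args[0] accepts only
0 or 1:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Enable manual dirty-log reprotection on a VM; returns 0 on success. */
static int enable_manual_reprotect(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
		.args = { 1 },	/* only 0 or 1 pass the validation above */
	};

	/* The capability is only advertised with this patch applied. */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT) <= 0)
		return -1;

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}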
@@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CLEAR_DIRTY_LOG: {
+		struct kvm_clear_dirty_log log;
+
+		r = -EFAULT;
+		if (copy_from_user(&log, argp, sizeof(log)))
+			goto out;
+		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+		break;
+	}
+#endif
 #ifdef CONFIG_KVM_MMIO
 	case KVM_REGISTER_COALESCED_MMIO: {
 		struct kvm_coalesced_mmio_zone zone;