diff options
| -rw-r--r-- | arch/x86/kvm/mmu.c | 100 | ||||
| -rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 12 | ||||
| -rw-r--r-- | include/asm-x86/kvm_host.h | 6 | ||||
| -rw-r--r-- | include/linux/kvm_host.h | 24 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 135 |
5 files changed, 277 insertions, 0 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fa231923cf7..0bfe2bd305eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
| @@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
| 653 | account_shadowed(kvm, gfn); | 653 | account_shadowed(kvm, gfn); |
| 654 | } | 654 | } |
| 655 | 655 | ||
| 656 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | ||
| 657 | { | ||
| 658 | u64 *spte; | ||
| 659 | int need_tlb_flush = 0; | ||
| 660 | |||
| 661 | while ((spte = rmap_next(kvm, rmapp, NULL))) { | ||
| 662 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
| 663 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | ||
| 664 | rmap_remove(kvm, spte); | ||
| 665 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
| 666 | need_tlb_flush = 1; | ||
| 667 | } | ||
| 668 | return need_tlb_flush; | ||
| 669 | } | ||
| 670 | |||
| 671 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | ||
| 672 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) | ||
| 673 | { | ||
| 674 | int i; | ||
| 675 | int retval = 0; | ||
| 676 | |||
| 677 | /* | ||
| 678 | * If mmap_sem isn't taken, we can look up the memslots with only | ||
| 679 | * the mmu_lock by skipping over the slots with userspace_addr == 0. | ||
| 680 | */ | ||
| 681 | for (i = 0; i < kvm->nmemslots; i++) { | ||
| 682 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
| 683 | unsigned long start = memslot->userspace_addr; | ||
| 684 | unsigned long end; | ||
| 685 | |||
| 686 | /* mmu_lock protects userspace_addr */ | ||
| 687 | if (!start) | ||
| 688 | continue; | ||
| 689 | |||
| 690 | end = start + (memslot->npages << PAGE_SHIFT); | ||
| 691 | if (hva >= start && hva < end) { | ||
| 692 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | ||
| 693 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); | ||
| 694 | retval |= handler(kvm, | ||
| 695 | &memslot->lpage_info[ | ||
| 696 | gfn_offset / | ||
| 697 | KVM_PAGES_PER_HPAGE].rmap_pde); | ||
| 698 | } | ||
| 699 | } | ||
| 700 | |||
| 701 | return retval; | ||
| 702 | } | ||
| 703 | |||
| 704 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) | ||
| 705 | { | ||
| 706 | return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); | ||
| 707 | } | ||
| 708 | |||
| 709 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | ||
| 710 | { | ||
| 711 | u64 *spte; | ||
| 712 | int young = 0; | ||
| 713 | |||
| 714 | spte = rmap_next(kvm, rmapp, NULL); | ||
| 715 | while (spte) { | ||
| 716 | int _young; | ||
| 717 | u64 _spte = *spte; | ||
| 718 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | ||
| 719 | _young = _spte & PT_ACCESSED_MASK; | ||
| 720 | if (_young) { | ||
| 721 | young = 1; | ||
| 722 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); | ||
| 723 | } | ||
| 724 | spte = rmap_next(kvm, rmapp, spte); | ||
| 725 | } | ||
| 726 | return young; | ||
| 727 | } | ||
| 728 | |||
| 729 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) | ||
| 730 | { | ||
| 731 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); | ||
| 732 | } | ||
| 733 | |||
| 656 | #ifdef MMU_DEBUG | 734 | #ifdef MMU_DEBUG |
| 657 | static int is_empty_shadow_page(u64 *spt) | 735 | static int is_empty_shadow_page(u64 *spt) |
| 658 | { | 736 | { |
| @@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
| 1203 | int r; | 1281 | int r; |
| 1204 | int largepage = 0; | 1282 | int largepage = 0; |
| 1205 | pfn_t pfn; | 1283 | pfn_t pfn; |
| 1284 | unsigned long mmu_seq; | ||
| 1206 | 1285 | ||
| 1207 | down_read(&current->mm->mmap_sem); | 1286 | down_read(&current->mm->mmap_sem); |
| 1208 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1287 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
| @@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
| 1210 | largepage = 1; | 1289 | largepage = 1; |
| 1211 | } | 1290 | } |
| 1212 | 1291 | ||
| 1292 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
| 1293 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
| 1213 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1294 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| 1214 | up_read(&current->mm->mmap_sem); | 1295 | up_read(&current->mm->mmap_sem); |
| 1215 | 1296 | ||
| @@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
| 1220 | } | 1301 | } |
| 1221 | 1302 | ||
| 1222 | spin_lock(&vcpu->kvm->mmu_lock); | 1303 | spin_lock(&vcpu->kvm->mmu_lock); |
| 1304 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
| 1305 | goto out_unlock; | ||
| 1223 | kvm_mmu_free_some_pages(vcpu); | 1306 | kvm_mmu_free_some_pages(vcpu); |
| 1224 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, | 1307 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, |
| 1225 | PT32E_ROOT_LEVEL); | 1308 | PT32E_ROOT_LEVEL); |
| @@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
| 1227 | 1310 | ||
| 1228 | 1311 | ||
| 1229 | return r; | 1312 | return r; |
| 1313 | |||
| 1314 | out_unlock: | ||
| 1315 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1316 | kvm_release_pfn_clean(pfn); | ||
| 1317 | return 0; | ||
| 1230 | } | 1318 | } |
| 1231 | 1319 | ||
| 1232 | 1320 | ||
| @@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 1345 | int r; | 1433 | int r; |
| 1346 | int largepage = 0; | 1434 | int largepage = 0; |
| 1347 | gfn_t gfn = gpa >> PAGE_SHIFT; | 1435 | gfn_t gfn = gpa >> PAGE_SHIFT; |
| 1436 | unsigned long mmu_seq; | ||
| 1348 | 1437 | ||
| 1349 | ASSERT(vcpu); | 1438 | ASSERT(vcpu); |
| 1350 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 1439 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
| @@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 1358 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1447 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
| 1359 | largepage = 1; | 1448 | largepage = 1; |
| 1360 | } | 1449 | } |
| 1450 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
| 1451 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
| 1361 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1452 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| 1362 | up_read(&current->mm->mmap_sem); | 1453 | up_read(&current->mm->mmap_sem); |
| 1363 | if (is_error_pfn(pfn)) { | 1454 | if (is_error_pfn(pfn)) { |
| @@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 1365 | return 1; | 1456 | return 1; |
| 1366 | } | 1457 | } |
| 1367 | spin_lock(&vcpu->kvm->mmu_lock); | 1458 | spin_lock(&vcpu->kvm->mmu_lock); |
| 1459 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
| 1460 | goto out_unlock; | ||
| 1368 | kvm_mmu_free_some_pages(vcpu); | 1461 | kvm_mmu_free_some_pages(vcpu); |
| 1369 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 1462 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
| 1370 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); | 1463 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); |
| 1371 | spin_unlock(&vcpu->kvm->mmu_lock); | 1464 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 1372 | 1465 | ||
| 1373 | return r; | 1466 | return r; |
| 1467 | |||
| 1468 | out_unlock: | ||
| 1469 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1470 | kvm_release_pfn_clean(pfn); | ||
| 1471 | return 0; | ||
| 1374 | } | 1472 | } |
| 1375 | 1473 | ||
| 1376 | static void nonpaging_free(struct kvm_vcpu *vcpu) | 1474 | static void nonpaging_free(struct kvm_vcpu *vcpu) |
| @@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
| 1670 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1768 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
| 1671 | vcpu->arch.update_pte.largepage = 1; | 1769 | vcpu->arch.update_pte.largepage = 1; |
| 1672 | } | 1770 | } |
| 1771 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
| 1772 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
| 1673 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1773 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| 1674 | up_read(&current->mm->mmap_sem); | 1774 | up_read(&current->mm->mmap_sem); |
| 1675 | 1775 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4d918220baeb..f72ac1fa35f0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
| @@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
| 263 | pfn = vcpu->arch.update_pte.pfn; | 263 | pfn = vcpu->arch.update_pte.pfn; |
| 264 | if (is_error_pfn(pfn)) | 264 | if (is_error_pfn(pfn)) |
| 265 | return; | 265 | return; |
| 266 | if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) | ||
| 267 | return; | ||
| 266 | kvm_get_pfn(pfn); | 268 | kvm_get_pfn(pfn); |
| 267 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
| 268 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), | 270 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), |
| @@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 380 | int r; | 382 | int r; |
| 381 | pfn_t pfn; | 383 | pfn_t pfn; |
| 382 | int largepage = 0; | 384 | int largepage = 0; |
| 385 | unsigned long mmu_seq; | ||
| 383 | 386 | ||
| 384 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 387 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
| 385 | kvm_mmu_audit(vcpu, "pre page fault"); | 388 | kvm_mmu_audit(vcpu, "pre page fault"); |
| @@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 413 | largepage = 1; | 416 | largepage = 1; |
| 414 | } | 417 | } |
| 415 | } | 418 | } |
| 419 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
| 420 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
| 416 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 421 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
| 417 | up_read(&current->mm->mmap_sem); | 422 | up_read(&current->mm->mmap_sem); |
| 418 | 423 | ||
| @@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 424 | } | 429 | } |
| 425 | 430 | ||
| 426 | spin_lock(&vcpu->kvm->mmu_lock); | 431 | spin_lock(&vcpu->kvm->mmu_lock); |
| 432 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
| 433 | goto out_unlock; | ||
| 427 | kvm_mmu_free_some_pages(vcpu); | 434 | kvm_mmu_free_some_pages(vcpu); |
| 428 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 435 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
| 429 | largepage, &write_pt, pfn); | 436 | largepage, &write_pt, pfn); |
| @@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 439 | spin_unlock(&vcpu->kvm->mmu_lock); | 446 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 440 | 447 | ||
| 441 | return write_pt; | 448 | return write_pt; |
| 449 | |||
| 450 | out_unlock: | ||
| 451 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 452 | kvm_release_pfn_clean(pfn); | ||
| 453 | return 0; | ||
| 442 | } | 454 | } |
| 443 | 455 | ||
| 444 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 456 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index bc34dc21f178..0f3c53114614 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | 13 | ||
| 14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 16 | #include <linux/mmu_notifier.h> | ||
| 16 | 17 | ||
| 17 | #include <linux/kvm.h> | 18 | #include <linux/kvm.h> |
| 18 | #include <linux/kvm_para.h> | 19 | #include <linux/kvm_para.h> |
| @@ -251,6 +252,7 @@ struct kvm_vcpu_arch { | |||
| 251 | gfn_t gfn; /* presumed gfn during guest pte update */ | 252 | gfn_t gfn; /* presumed gfn during guest pte update */ |
| 252 | pfn_t pfn; /* pfn corresponding to that gfn */ | 253 | pfn_t pfn; /* pfn corresponding to that gfn */ |
| 253 | int largepage; | 254 | int largepage; |
| 255 | unsigned long mmu_seq; | ||
| 254 | } update_pte; | 256 | } update_pte; |
| 255 | 257 | ||
| 256 | struct i387_fxsave_struct host_fx_image; | 258 | struct i387_fxsave_struct host_fx_image; |
| @@ -729,4 +731,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); | |||
| 729 | KVM_EX_ENTRY " 666b, 667b \n\t" \ | 731 | KVM_EX_ENTRY " 666b, 667b \n\t" \ |
| 730 | ".popsection" | 732 | ".popsection" |
| 731 | 733 | ||
| 734 | #define KVM_ARCH_WANT_MMU_NOTIFIER | ||
| 735 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | ||
| 736 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | ||
| 737 | |||
| 732 | #endif | 738 | #endif |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 07d68a8ae8e9..8525afc53107 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
| @@ -121,6 +121,12 @@ struct kvm { | |||
| 121 | struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; | 121 | struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; |
| 122 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; | 122 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; |
| 123 | #endif | 123 | #endif |
| 124 | |||
| 125 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | ||
| 126 | struct mmu_notifier mmu_notifier; | ||
| 127 | unsigned long mmu_notifier_seq; | ||
| 128 | long mmu_notifier_count; | ||
| 129 | #endif | ||
| 124 | }; | 130 | }; |
| 125 | 131 | ||
| 126 | /* The guest did something we don't support. */ | 132 | /* The guest did something we don't support. */ |
| @@ -332,4 +338,22 @@ int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) | |||
| 332 | #define kvm_trace_cleanup() ((void)0) | 338 | #define kvm_trace_cleanup() ((void)0) |
| 333 | #endif | 339 | #endif |
| 334 | 340 | ||
| 341 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | ||
| 342 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) | ||
| 343 | { | ||
| 344 | if (unlikely(vcpu->kvm->mmu_notifier_count)) | ||
| 345 | return 1; | ||
| 346 | /* | ||
| 347 | * Both reads happen under the mmu_lock and both values are | ||
| 348 | * modified under mmu_lock, so there's no need for smp_rmb() | ||
| 349 | * here in between, otherwise mmu_notifier_count should be | ||
| 350 | * read before mmu_notifier_seq, see | ||
| 351 | * mmu_notifier_invalidate_range_end write side. | ||
| 352 | */ | ||
| 353 | if (vcpu->kvm->mmu_notifier_seq != mmu_seq) | ||
| 354 | return 1; | ||
| 355 | return 0; | ||
| 356 | } | ||
| 357 | #endif | ||
| 358 | |||
| 335 | #endif | 359 | #endif |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3735212cd3f8..7dd9b0b85e4e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -192,6 +192,123 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
| 192 | } | 192 | } |
| 193 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | 193 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); |
| 194 | 194 | ||
| 195 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
| 196 | static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) | ||
| 197 | { | ||
| 198 | return container_of(mn, struct kvm, mmu_notifier); | ||
| 199 | } | ||
| 200 | |||
| 201 | static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | ||
| 202 | struct mm_struct *mm, | ||
| 203 | unsigned long address) | ||
| 204 | { | ||
| 205 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
| 206 | int need_tlb_flush; | ||
| 207 | |||
| 208 | /* | ||
| 209 | * When ->invalidate_page runs, the linux pte has been zapped | ||
| 210 | * already but the page is still allocated until | ||
| 211 | * ->invalidate_page returns. So if we increase the sequence | ||
| 212 | * here the kvm page fault will notice if the spte can't be | ||
| 213 | * established because the page is going to be freed. If | ||
| 214 | * instead the kvm page fault establishes the spte before | ||
| 215 | * ->invalidate_page runs, kvm_unmap_hva will release it | ||
| 216 | * before returning. | ||
| 217 | * | ||
| 218 | * The sequence increase only needs to be seen at spin_unlock | ||
| 219 | * time, and not at spin_lock time. | ||
| 220 | * | ||
| 221 | * Increasing the sequence after the spin_unlock would be | ||
| 222 | * unsafe because the kvm page fault could then establish the | ||
| 223 | * pte after kvm_unmap_hva returned, without noticing the page | ||
| 224 | * is going to be freed. | ||
| 225 | */ | ||
| 226 | spin_lock(&kvm->mmu_lock); | ||
| 227 | kvm->mmu_notifier_seq++; | ||
| 228 | need_tlb_flush = kvm_unmap_hva(kvm, address); | ||
| 229 | spin_unlock(&kvm->mmu_lock); | ||
| 230 | |||
| 231 | /* we've to flush the tlb before the pages can be freed */ | ||
| 232 | if (need_tlb_flush) | ||
| 233 | kvm_flush_remote_tlbs(kvm); | ||
| 234 | |||
| 235 | } | ||
| 236 | |||
| 237 | static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | ||
| 238 | struct mm_struct *mm, | ||
| 239 | unsigned long start, | ||
| 240 | unsigned long end) | ||
| 241 | { | ||
| 242 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
| 243 | int need_tlb_flush = 0; | ||
| 244 | |||
| 245 | spin_lock(&kvm->mmu_lock); | ||
| 246 | /* | ||
| 247 | * The count increase must become visible at unlock time as no | ||
| 248 | * spte can be established without taking the mmu_lock and | ||
| 249 | * count is also read inside the mmu_lock critical section. | ||
| 250 | */ | ||
| 251 | kvm->mmu_notifier_count++; | ||
| 252 | for (; start < end; start += PAGE_SIZE) | ||
| 253 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | ||
| 254 | spin_unlock(&kvm->mmu_lock); | ||
| 255 | |||
| 256 | /* we've to flush the tlb before the pages can be freed */ | ||
| 257 | if (need_tlb_flush) | ||
| 258 | kvm_flush_remote_tlbs(kvm); | ||
| 259 | } | ||
| 260 | |||
| 261 | static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, | ||
| 262 | struct mm_struct *mm, | ||
| 263 | unsigned long start, | ||
| 264 | unsigned long end) | ||
| 265 | { | ||
| 266 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
| 267 | |||
| 268 | spin_lock(&kvm->mmu_lock); | ||
| 269 | /* | ||
| 270 | * This sequence increase will notify the kvm page fault that | ||
| 271 | * the page that is going to be mapped in the spte could have | ||
| 272 | * been freed. | ||
| 273 | */ | ||
| 274 | kvm->mmu_notifier_seq++; | ||
| 275 | /* | ||
| 276 | * The above sequence increase must be visible before the | ||
| 277 | * below count decrease but both values are read by the kvm | ||
| 278 | * page fault under mmu_lock spinlock so we don't need to add | ||
| 279 | * a smb_wmb() here in between the two. | ||
| 280 | */ | ||
| 281 | kvm->mmu_notifier_count--; | ||
| 282 | spin_unlock(&kvm->mmu_lock); | ||
| 283 | |||
| 284 | BUG_ON(kvm->mmu_notifier_count < 0); | ||
| 285 | } | ||
| 286 | |||
| 287 | static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, | ||
| 288 | struct mm_struct *mm, | ||
| 289 | unsigned long address) | ||
| 290 | { | ||
| 291 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
| 292 | int young; | ||
| 293 | |||
| 294 | spin_lock(&kvm->mmu_lock); | ||
| 295 | young = kvm_age_hva(kvm, address); | ||
| 296 | spin_unlock(&kvm->mmu_lock); | ||
| 297 | |||
| 298 | if (young) | ||
| 299 | kvm_flush_remote_tlbs(kvm); | ||
| 300 | |||
| 301 | return young; | ||
| 302 | } | ||
| 303 | |||
| 304 | static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { | ||
| 305 | .invalidate_page = kvm_mmu_notifier_invalidate_page, | ||
| 306 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, | ||
| 307 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, | ||
| 308 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, | ||
| 309 | }; | ||
| 310 | #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ | ||
| 311 | |||
| 195 | static struct kvm *kvm_create_vm(void) | 312 | static struct kvm *kvm_create_vm(void) |
| 196 | { | 313 | { |
| 197 | struct kvm *kvm = kvm_arch_create_vm(); | 314 | struct kvm *kvm = kvm_arch_create_vm(); |
| @@ -212,6 +329,21 @@ static struct kvm *kvm_create_vm(void) | |||
| 212 | (struct kvm_coalesced_mmio_ring *)page_address(page); | 329 | (struct kvm_coalesced_mmio_ring *)page_address(page); |
| 213 | #endif | 330 | #endif |
| 214 | 331 | ||
| 332 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
| 333 | { | ||
| 334 | int err; | ||
| 335 | kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; | ||
| 336 | err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); | ||
| 337 | if (err) { | ||
| 338 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET | ||
| 339 | put_page(page); | ||
| 340 | #endif | ||
| 341 | kfree(kvm); | ||
| 342 | return ERR_PTR(err); | ||
| 343 | } | ||
| 344 | } | ||
| 345 | #endif | ||
| 346 | |||
| 215 | kvm->mm = current->mm; | 347 | kvm->mm = current->mm; |
| 216 | atomic_inc(&kvm->mm->mm_count); | 348 | atomic_inc(&kvm->mm->mm_count); |
| 217 | spin_lock_init(&kvm->mmu_lock); | 349 | spin_lock_init(&kvm->mmu_lock); |
| @@ -272,6 +404,9 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
| 272 | if (kvm->coalesced_mmio_ring != NULL) | 404 | if (kvm->coalesced_mmio_ring != NULL) |
| 273 | free_page((unsigned long)kvm->coalesced_mmio_ring); | 405 | free_page((unsigned long)kvm->coalesced_mmio_ring); |
| 274 | #endif | 406 | #endif |
| 407 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
| 408 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); | ||
| 409 | #endif | ||
| 275 | kvm_arch_destroy_vm(kvm); | 410 | kvm_arch_destroy_vm(kvm); |
| 276 | mmdrop(mm); | 411 | mmdrop(mm); |
| 277 | } | 412 | } |
