diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-08-01 15:48:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-08-01 15:48:16 -0400 |
commit | 84ff7a001270258f71d6ab0d164f351e32c9718a (patch) | |
tree | 6b2f33a9ad492186062de247d3572fa5efebe4bd | |
parent | 478735e42bfa047384afa72dceb408035532db20 (diff) | |
parent | 1f4170e12db06fdde5279d665a7e6e2976b2b623 (diff) |
Merge branch 'kvm-updates-2.6.27' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates-2.6.27' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
KVM: s390: Fix kvm on IBM System z10
KVM: Advertise synchronized mmu support to userspace
KVM: Synchronize guest physical memory map to host virtual memory map
KVM: Allow browsing memslots with mmu_lock
KVM: Allow reading aliases with mmu_lock
-rw-r--r-- | arch/s390/kvm/priv.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 100 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 24 | ||||
-rw-r--r-- | include/asm-x86/kvm_host.h | 6 | ||||
-rw-r--r-- | include/linux/kvm.h | 1 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 24 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 155 |
8 files changed, 312 insertions, 11 deletions
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 2e2d2ffb6a07..d1faf5c54405 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c | |||
@@ -158,6 +158,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu) | |||
158 | 158 | ||
159 | vcpu->stat.instruction_stfl++; | 159 | vcpu->stat.instruction_stfl++; |
160 | facility_list &= ~(1UL<<24); /* no stfle */ | 160 | facility_list &= ~(1UL<<24); /* no stfle */ |
161 | facility_list &= ~(1UL<<23); /* no large pages */ | ||
161 | 162 | ||
162 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), | 163 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), |
163 | &facility_list, sizeof(facility_list)); | 164 | &facility_list, sizeof(facility_list)); |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fa231923cf7..0bfe2bd305eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
653 | account_shadowed(kvm, gfn); | 653 | account_shadowed(kvm, gfn); |
654 | } | 654 | } |
655 | 655 | ||
656 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | ||
657 | { | ||
658 | u64 *spte; | ||
659 | int need_tlb_flush = 0; | ||
660 | |||
661 | while ((spte = rmap_next(kvm, rmapp, NULL))) { | ||
662 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
663 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | ||
664 | rmap_remove(kvm, spte); | ||
665 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
666 | need_tlb_flush = 1; | ||
667 | } | ||
668 | return need_tlb_flush; | ||
669 | } | ||
670 | |||
671 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | ||
672 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) | ||
673 | { | ||
674 | int i; | ||
675 | int retval = 0; | ||
676 | |||
677 | /* | ||
678 | * If mmap_sem isn't taken, we can look the memslots with only | ||
679 | * the mmu_lock by skipping over the slots with userspace_addr == 0. | ||
680 | */ | ||
681 | for (i = 0; i < kvm->nmemslots; i++) { | ||
682 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
683 | unsigned long start = memslot->userspace_addr; | ||
684 | unsigned long end; | ||
685 | |||
686 | /* mmu_lock protects userspace_addr */ | ||
687 | if (!start) | ||
688 | continue; | ||
689 | |||
690 | end = start + (memslot->npages << PAGE_SHIFT); | ||
691 | if (hva >= start && hva < end) { | ||
692 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | ||
693 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); | ||
694 | retval |= handler(kvm, | ||
695 | &memslot->lpage_info[ | ||
696 | gfn_offset / | ||
697 | KVM_PAGES_PER_HPAGE].rmap_pde); | ||
698 | } | ||
699 | } | ||
700 | |||
701 | return retval; | ||
702 | } | ||
703 | |||
704 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) | ||
705 | { | ||
706 | return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); | ||
707 | } | ||
708 | |||
709 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | ||
710 | { | ||
711 | u64 *spte; | ||
712 | int young = 0; | ||
713 | |||
714 | spte = rmap_next(kvm, rmapp, NULL); | ||
715 | while (spte) { | ||
716 | int _young; | ||
717 | u64 _spte = *spte; | ||
718 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | ||
719 | _young = _spte & PT_ACCESSED_MASK; | ||
720 | if (_young) { | ||
721 | young = 1; | ||
722 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); | ||
723 | } | ||
724 | spte = rmap_next(kvm, rmapp, spte); | ||
725 | } | ||
726 | return young; | ||
727 | } | ||
728 | |||
729 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) | ||
730 | { | ||
731 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); | ||
732 | } | ||
733 | |||
656 | #ifdef MMU_DEBUG | 734 | #ifdef MMU_DEBUG |
657 | static int is_empty_shadow_page(u64 *spt) | 735 | static int is_empty_shadow_page(u64 *spt) |
658 | { | 736 | { |
@@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1203 | int r; | 1281 | int r; |
1204 | int largepage = 0; | 1282 | int largepage = 0; |
1205 | pfn_t pfn; | 1283 | pfn_t pfn; |
1284 | unsigned long mmu_seq; | ||
1206 | 1285 | ||
1207 | down_read(¤t->mm->mmap_sem); | 1286 | down_read(¤t->mm->mmap_sem); |
1208 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1287 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
@@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1210 | largepage = 1; | 1289 | largepage = 1; |
1211 | } | 1290 | } |
1212 | 1291 | ||
1292 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
1293 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
1213 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1294 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1214 | up_read(¤t->mm->mmap_sem); | 1295 | up_read(¤t->mm->mmap_sem); |
1215 | 1296 | ||
@@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1220 | } | 1301 | } |
1221 | 1302 | ||
1222 | spin_lock(&vcpu->kvm->mmu_lock); | 1303 | spin_lock(&vcpu->kvm->mmu_lock); |
1304 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
1305 | goto out_unlock; | ||
1223 | kvm_mmu_free_some_pages(vcpu); | 1306 | kvm_mmu_free_some_pages(vcpu); |
1224 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, | 1307 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, |
1225 | PT32E_ROOT_LEVEL); | 1308 | PT32E_ROOT_LEVEL); |
@@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1227 | 1310 | ||
1228 | 1311 | ||
1229 | return r; | 1312 | return r; |
1313 | |||
1314 | out_unlock: | ||
1315 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1316 | kvm_release_pfn_clean(pfn); | ||
1317 | return 0; | ||
1230 | } | 1318 | } |
1231 | 1319 | ||
1232 | 1320 | ||
@@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1345 | int r; | 1433 | int r; |
1346 | int largepage = 0; | 1434 | int largepage = 0; |
1347 | gfn_t gfn = gpa >> PAGE_SHIFT; | 1435 | gfn_t gfn = gpa >> PAGE_SHIFT; |
1436 | unsigned long mmu_seq; | ||
1348 | 1437 | ||
1349 | ASSERT(vcpu); | 1438 | ASSERT(vcpu); |
1350 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 1439 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1358 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1447 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1359 | largepage = 1; | 1448 | largepage = 1; |
1360 | } | 1449 | } |
1450 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
1451 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
1361 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1452 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1362 | up_read(¤t->mm->mmap_sem); | 1453 | up_read(¤t->mm->mmap_sem); |
1363 | if (is_error_pfn(pfn)) { | 1454 | if (is_error_pfn(pfn)) { |
@@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1365 | return 1; | 1456 | return 1; |
1366 | } | 1457 | } |
1367 | spin_lock(&vcpu->kvm->mmu_lock); | 1458 | spin_lock(&vcpu->kvm->mmu_lock); |
1459 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
1460 | goto out_unlock; | ||
1368 | kvm_mmu_free_some_pages(vcpu); | 1461 | kvm_mmu_free_some_pages(vcpu); |
1369 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 1462 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
1370 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); | 1463 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); |
1371 | spin_unlock(&vcpu->kvm->mmu_lock); | 1464 | spin_unlock(&vcpu->kvm->mmu_lock); |
1372 | 1465 | ||
1373 | return r; | 1466 | return r; |
1467 | |||
1468 | out_unlock: | ||
1469 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1470 | kvm_release_pfn_clean(pfn); | ||
1471 | return 0; | ||
1374 | } | 1472 | } |
1375 | 1473 | ||
1376 | static void nonpaging_free(struct kvm_vcpu *vcpu) | 1474 | static void nonpaging_free(struct kvm_vcpu *vcpu) |
@@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1670 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1768 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1671 | vcpu->arch.update_pte.largepage = 1; | 1769 | vcpu->arch.update_pte.largepage = 1; |
1672 | } | 1770 | } |
1771 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
1772 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
1673 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1773 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1674 | up_read(¤t->mm->mmap_sem); | 1774 | up_read(¤t->mm->mmap_sem); |
1675 | 1775 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4d918220baeb..f72ac1fa35f0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
263 | pfn = vcpu->arch.update_pte.pfn; | 263 | pfn = vcpu->arch.update_pte.pfn; |
264 | if (is_error_pfn(pfn)) | 264 | if (is_error_pfn(pfn)) |
265 | return; | 265 | return; |
266 | if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) | ||
267 | return; | ||
266 | kvm_get_pfn(pfn); | 268 | kvm_get_pfn(pfn); |
267 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
268 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), | 270 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), |
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
380 | int r; | 382 | int r; |
381 | pfn_t pfn; | 383 | pfn_t pfn; |
382 | int largepage = 0; | 384 | int largepage = 0; |
385 | unsigned long mmu_seq; | ||
383 | 386 | ||
384 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 387 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
385 | kvm_mmu_audit(vcpu, "pre page fault"); | 388 | kvm_mmu_audit(vcpu, "pre page fault"); |
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
413 | largepage = 1; | 416 | largepage = 1; |
414 | } | 417 | } |
415 | } | 418 | } |
419 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
420 | /* implicit mb(), we'll read before PT lock is unlocked */ | ||
416 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 421 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
417 | up_read(¤t->mm->mmap_sem); | 422 | up_read(¤t->mm->mmap_sem); |
418 | 423 | ||
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
424 | } | 429 | } |
425 | 430 | ||
426 | spin_lock(&vcpu->kvm->mmu_lock); | 431 | spin_lock(&vcpu->kvm->mmu_lock); |
432 | if (mmu_notifier_retry(vcpu, mmu_seq)) | ||
433 | goto out_unlock; | ||
427 | kvm_mmu_free_some_pages(vcpu); | 434 | kvm_mmu_free_some_pages(vcpu); |
428 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 435 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
429 | largepage, &write_pt, pfn); | 436 | largepage, &write_pt, pfn); |
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
439 | spin_unlock(&vcpu->kvm->mmu_lock); | 446 | spin_unlock(&vcpu->kvm->mmu_lock); |
440 | 447 | ||
441 | return write_pt; | 448 | return write_pt; |
449 | |||
450 | out_unlock: | ||
451 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
452 | kvm_release_pfn_clean(pfn); | ||
453 | return 0; | ||
442 | } | 454 | } |
443 | 455 | ||
444 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 456 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5916191420c7..0d682fc6aeb3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
883 | case KVM_CAP_PIT: | 883 | case KVM_CAP_PIT: |
884 | case KVM_CAP_NOP_IO_DELAY: | 884 | case KVM_CAP_NOP_IO_DELAY: |
885 | case KVM_CAP_MP_STATE: | 885 | case KVM_CAP_MP_STATE: |
886 | case KVM_CAP_SYNC_MMU: | ||
886 | r = 1; | 887 | r = 1; |
887 | break; | 888 | break; |
888 | case KVM_CAP_COALESCED_MMIO: | 889 | case KVM_CAP_COALESCED_MMIO: |
@@ -1495,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | |||
1495 | goto out; | 1496 | goto out; |
1496 | 1497 | ||
1497 | down_write(&kvm->slots_lock); | 1498 | down_write(&kvm->slots_lock); |
1499 | spin_lock(&kvm->mmu_lock); | ||
1498 | 1500 | ||
1499 | p = &kvm->arch.aliases[alias->slot]; | 1501 | p = &kvm->arch.aliases[alias->slot]; |
1500 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | 1502 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; |
@@ -1506,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | |||
1506 | break; | 1508 | break; |
1507 | kvm->arch.naliases = n; | 1509 | kvm->arch.naliases = n; |
1508 | 1510 | ||
1511 | spin_unlock(&kvm->mmu_lock); | ||
1509 | kvm_mmu_zap_all(kvm); | 1512 | kvm_mmu_zap_all(kvm); |
1510 | 1513 | ||
1511 | up_write(&kvm->slots_lock); | 1514 | up_write(&kvm->slots_lock); |
@@ -3972,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
3972 | */ | 3975 | */ |
3973 | if (!user_alloc) { | 3976 | if (!user_alloc) { |
3974 | if (npages && !old.rmap) { | 3977 | if (npages && !old.rmap) { |
3978 | unsigned long userspace_addr; | ||
3979 | |||
3975 | down_write(¤t->mm->mmap_sem); | 3980 | down_write(¤t->mm->mmap_sem); |
3976 | memslot->userspace_addr = do_mmap(NULL, 0, | 3981 | userspace_addr = do_mmap(NULL, 0, |
3977 | npages * PAGE_SIZE, | 3982 | npages * PAGE_SIZE, |
3978 | PROT_READ | PROT_WRITE, | 3983 | PROT_READ | PROT_WRITE, |
3979 | MAP_SHARED | MAP_ANONYMOUS, | 3984 | MAP_SHARED | MAP_ANONYMOUS, |
3980 | 0); | 3985 | 0); |
3981 | up_write(¤t->mm->mmap_sem); | 3986 | up_write(¤t->mm->mmap_sem); |
3982 | 3987 | ||
3983 | if (IS_ERR((void *)memslot->userspace_addr)) | 3988 | if (IS_ERR((void *)userspace_addr)) |
3984 | return PTR_ERR((void *)memslot->userspace_addr); | 3989 | return PTR_ERR((void *)userspace_addr); |
3990 | |||
3991 | /* set userspace_addr atomically for kvm_hva_to_rmapp */ | ||
3992 | spin_lock(&kvm->mmu_lock); | ||
3993 | memslot->userspace_addr = userspace_addr; | ||
3994 | spin_unlock(&kvm->mmu_lock); | ||
3985 | } else { | 3995 | } else { |
3986 | if (!old.user_alloc && old.rmap) { | 3996 | if (!old.user_alloc && old.rmap) { |
3987 | int ret; | 3997 | int ret; |
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index bc34dc21f178..0f3c53114614 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h | |||
@@ -13,6 +13,7 @@ | |||
13 | 13 | ||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/mmu_notifier.h> | ||
16 | 17 | ||
17 | #include <linux/kvm.h> | 18 | #include <linux/kvm.h> |
18 | #include <linux/kvm_para.h> | 19 | #include <linux/kvm_para.h> |
@@ -251,6 +252,7 @@ struct kvm_vcpu_arch { | |||
251 | gfn_t gfn; /* presumed gfn during guest pte update */ | 252 | gfn_t gfn; /* presumed gfn during guest pte update */ |
252 | pfn_t pfn; /* pfn corresponding to that gfn */ | 253 | pfn_t pfn; /* pfn corresponding to that gfn */ |
253 | int largepage; | 254 | int largepage; |
255 | unsigned long mmu_seq; | ||
254 | } update_pte; | 256 | } update_pte; |
255 | 257 | ||
256 | struct i387_fxsave_struct host_fx_image; | 258 | struct i387_fxsave_struct host_fx_image; |
@@ -729,4 +731,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); | |||
729 | KVM_EX_ENTRY " 666b, 667b \n\t" \ | 731 | KVM_EX_ENTRY " 666b, 667b \n\t" \ |
730 | ".popsection" | 732 | ".popsection" |
731 | 733 | ||
734 | #define KVM_ARCH_WANT_MMU_NOTIFIER | ||
735 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | ||
736 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | ||
737 | |||
732 | #endif | 738 | #endif |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0ea064cbfbc8..69511f74f912 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
@@ -371,6 +371,7 @@ struct kvm_trace_rec { | |||
371 | #define KVM_CAP_PV_MMU 13 | 371 | #define KVM_CAP_PV_MMU 13 |
372 | #define KVM_CAP_MP_STATE 14 | 372 | #define KVM_CAP_MP_STATE 14 |
373 | #define KVM_CAP_COALESCED_MMIO 15 | 373 | #define KVM_CAP_COALESCED_MMIO 15 |
374 | #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ | ||
374 | 375 | ||
375 | /* | 376 | /* |
376 | * ioctls for VM fds | 377 | * ioctls for VM fds |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 07d68a8ae8e9..8525afc53107 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -121,6 +121,12 @@ struct kvm { | |||
121 | struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; | 121 | struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; |
122 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; | 122 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; |
123 | #endif | 123 | #endif |
124 | |||
125 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | ||
126 | struct mmu_notifier mmu_notifier; | ||
127 | unsigned long mmu_notifier_seq; | ||
128 | long mmu_notifier_count; | ||
129 | #endif | ||
124 | }; | 130 | }; |
125 | 131 | ||
126 | /* The guest did something we don't support. */ | 132 | /* The guest did something we don't support. */ |
@@ -332,4 +338,22 @@ int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) | |||
332 | #define kvm_trace_cleanup() ((void)0) | 338 | #define kvm_trace_cleanup() ((void)0) |
333 | #endif | 339 | #endif |
334 | 340 | ||
341 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | ||
342 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) | ||
343 | { | ||
344 | if (unlikely(vcpu->kvm->mmu_notifier_count)) | ||
345 | return 1; | ||
346 | /* | ||
347 | * Both reads happen under the mmu_lock and both values are | ||
348 | * modified under mmu_lock, so there's no need of smb_rmb() | ||
349 | * here in between, otherwise mmu_notifier_count should be | ||
350 | * read before mmu_notifier_seq, see | ||
351 | * mmu_notifier_invalidate_range_end write side. | ||
352 | */ | ||
353 | if (vcpu->kvm->mmu_notifier_seq != mmu_seq) | ||
354 | return 1; | ||
355 | return 0; | ||
356 | } | ||
357 | #endif | ||
358 | |||
335 | #endif | 359 | #endif |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a845890b6800..7dd9b0b85e4e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -192,6 +192,123 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
192 | } | 192 | } |
193 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | 193 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); |
194 | 194 | ||
195 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
196 | static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) | ||
197 | { | ||
198 | return container_of(mn, struct kvm, mmu_notifier); | ||
199 | } | ||
200 | |||
201 | static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | ||
202 | struct mm_struct *mm, | ||
203 | unsigned long address) | ||
204 | { | ||
205 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
206 | int need_tlb_flush; | ||
207 | |||
208 | /* | ||
209 | * When ->invalidate_page runs, the linux pte has been zapped | ||
210 | * already but the page is still allocated until | ||
211 | * ->invalidate_page returns. So if we increase the sequence | ||
212 | * here the kvm page fault will notice if the spte can't be | ||
213 | * established because the page is going to be freed. If | ||
214 | * instead the kvm page fault establishes the spte before | ||
215 | * ->invalidate_page runs, kvm_unmap_hva will release it | ||
216 | * before returning. | ||
217 | * | ||
218 | * The sequence increase only need to be seen at spin_unlock | ||
219 | * time, and not at spin_lock time. | ||
220 | * | ||
221 | * Increasing the sequence after the spin_unlock would be | ||
222 | * unsafe because the kvm page fault could then establish the | ||
223 | * pte after kvm_unmap_hva returned, without noticing the page | ||
224 | * is going to be freed. | ||
225 | */ | ||
226 | spin_lock(&kvm->mmu_lock); | ||
227 | kvm->mmu_notifier_seq++; | ||
228 | need_tlb_flush = kvm_unmap_hva(kvm, address); | ||
229 | spin_unlock(&kvm->mmu_lock); | ||
230 | |||
231 | /* we've to flush the tlb before the pages can be freed */ | ||
232 | if (need_tlb_flush) | ||
233 | kvm_flush_remote_tlbs(kvm); | ||
234 | |||
235 | } | ||
236 | |||
237 | static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | ||
238 | struct mm_struct *mm, | ||
239 | unsigned long start, | ||
240 | unsigned long end) | ||
241 | { | ||
242 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
243 | int need_tlb_flush = 0; | ||
244 | |||
245 | spin_lock(&kvm->mmu_lock); | ||
246 | /* | ||
247 | * The count increase must become visible at unlock time as no | ||
248 | * spte can be established without taking the mmu_lock and | ||
249 | * count is also read inside the mmu_lock critical section. | ||
250 | */ | ||
251 | kvm->mmu_notifier_count++; | ||
252 | for (; start < end; start += PAGE_SIZE) | ||
253 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | ||
254 | spin_unlock(&kvm->mmu_lock); | ||
255 | |||
256 | /* we've to flush the tlb before the pages can be freed */ | ||
257 | if (need_tlb_flush) | ||
258 | kvm_flush_remote_tlbs(kvm); | ||
259 | } | ||
260 | |||
261 | static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, | ||
262 | struct mm_struct *mm, | ||
263 | unsigned long start, | ||
264 | unsigned long end) | ||
265 | { | ||
266 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
267 | |||
268 | spin_lock(&kvm->mmu_lock); | ||
269 | /* | ||
270 | * This sequence increase will notify the kvm page fault that | ||
271 | * the page that is going to be mapped in the spte could have | ||
272 | * been freed. | ||
273 | */ | ||
274 | kvm->mmu_notifier_seq++; | ||
275 | /* | ||
276 | * The above sequence increase must be visible before the | ||
277 | * below count decrease but both values are read by the kvm | ||
278 | * page fault under mmu_lock spinlock so we don't need to add | ||
279 | * a smb_wmb() here in between the two. | ||
280 | */ | ||
281 | kvm->mmu_notifier_count--; | ||
282 | spin_unlock(&kvm->mmu_lock); | ||
283 | |||
284 | BUG_ON(kvm->mmu_notifier_count < 0); | ||
285 | } | ||
286 | |||
287 | static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, | ||
288 | struct mm_struct *mm, | ||
289 | unsigned long address) | ||
290 | { | ||
291 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
292 | int young; | ||
293 | |||
294 | spin_lock(&kvm->mmu_lock); | ||
295 | young = kvm_age_hva(kvm, address); | ||
296 | spin_unlock(&kvm->mmu_lock); | ||
297 | |||
298 | if (young) | ||
299 | kvm_flush_remote_tlbs(kvm); | ||
300 | |||
301 | return young; | ||
302 | } | ||
303 | |||
304 | static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { | ||
305 | .invalidate_page = kvm_mmu_notifier_invalidate_page, | ||
306 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, | ||
307 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, | ||
308 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, | ||
309 | }; | ||
310 | #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ | ||
311 | |||
195 | static struct kvm *kvm_create_vm(void) | 312 | static struct kvm *kvm_create_vm(void) |
196 | { | 313 | { |
197 | struct kvm *kvm = kvm_arch_create_vm(); | 314 | struct kvm *kvm = kvm_arch_create_vm(); |
@@ -212,6 +329,21 @@ static struct kvm *kvm_create_vm(void) | |||
212 | (struct kvm_coalesced_mmio_ring *)page_address(page); | 329 | (struct kvm_coalesced_mmio_ring *)page_address(page); |
213 | #endif | 330 | #endif |
214 | 331 | ||
332 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
333 | { | ||
334 | int err; | ||
335 | kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; | ||
336 | err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); | ||
337 | if (err) { | ||
338 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET | ||
339 | put_page(page); | ||
340 | #endif | ||
341 | kfree(kvm); | ||
342 | return ERR_PTR(err); | ||
343 | } | ||
344 | } | ||
345 | #endif | ||
346 | |||
215 | kvm->mm = current->mm; | 347 | kvm->mm = current->mm; |
216 | atomic_inc(&kvm->mm->mm_count); | 348 | atomic_inc(&kvm->mm->mm_count); |
217 | spin_lock_init(&kvm->mmu_lock); | 349 | spin_lock_init(&kvm->mmu_lock); |
@@ -272,6 +404,9 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
272 | if (kvm->coalesced_mmio_ring != NULL) | 404 | if (kvm->coalesced_mmio_ring != NULL) |
273 | free_page((unsigned long)kvm->coalesced_mmio_ring); | 405 | free_page((unsigned long)kvm->coalesced_mmio_ring); |
274 | #endif | 406 | #endif |
407 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
408 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); | ||
409 | #endif | ||
275 | kvm_arch_destroy_vm(kvm); | 410 | kvm_arch_destroy_vm(kvm); |
276 | mmdrop(mm); | 411 | mmdrop(mm); |
277 | } | 412 | } |
@@ -375,7 +510,15 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
375 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | 510 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); |
376 | 511 | ||
377 | new.user_alloc = user_alloc; | 512 | new.user_alloc = user_alloc; |
378 | new.userspace_addr = mem->userspace_addr; | 513 | /* |
514 | * hva_to_rmmap() serialzies with the mmu_lock and to be | ||
515 | * safe it has to ignore memslots with !user_alloc && | ||
516 | * !userspace_addr. | ||
517 | */ | ||
518 | if (user_alloc) | ||
519 | new.userspace_addr = mem->userspace_addr; | ||
520 | else | ||
521 | new.userspace_addr = 0; | ||
379 | } | 522 | } |
380 | if (npages && !new.lpage_info) { | 523 | if (npages && !new.lpage_info) { |
381 | int largepages = npages / KVM_PAGES_PER_HPAGE; | 524 | int largepages = npages / KVM_PAGES_PER_HPAGE; |
@@ -408,17 +551,21 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
408 | } | 551 | } |
409 | #endif /* not defined CONFIG_S390 */ | 552 | #endif /* not defined CONFIG_S390 */ |
410 | 553 | ||
411 | if (mem->slot >= kvm->nmemslots) | ||
412 | kvm->nmemslots = mem->slot + 1; | ||
413 | |||
414 | if (!npages) | 554 | if (!npages) |
415 | kvm_arch_flush_shadow(kvm); | 555 | kvm_arch_flush_shadow(kvm); |
416 | 556 | ||
557 | spin_lock(&kvm->mmu_lock); | ||
558 | if (mem->slot >= kvm->nmemslots) | ||
559 | kvm->nmemslots = mem->slot + 1; | ||
560 | |||
417 | *memslot = new; | 561 | *memslot = new; |
562 | spin_unlock(&kvm->mmu_lock); | ||
418 | 563 | ||
419 | r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); | 564 | r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); |
420 | if (r) { | 565 | if (r) { |
566 | spin_lock(&kvm->mmu_lock); | ||
421 | *memslot = old; | 567 | *memslot = old; |
568 | spin_unlock(&kvm->mmu_lock); | ||
422 | goto out_free; | 569 | goto out_free; |
423 | } | 570 | } |
424 | 571 | ||