-rw-r--r--   arch/s390/kvm/priv.c        |   1
-rw-r--r--   arch/x86/kvm/mmu.c          | 100
-rw-r--r--   arch/x86/kvm/paging_tmpl.h  |  12
-rw-r--r--   arch/x86/kvm/x86.c          |  24
-rw-r--r--   include/asm-x86/kvm_host.h  |   6
-rw-r--r--   include/linux/kvm.h         |   1
-rw-r--r--   include/linux/kvm_host.h    |  24
-rw-r--r--   virt/kvm/kvm_main.c         | 155
8 files changed, 312 insertions(+), 11 deletions(-)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 2e2d2ffb6a07..d1faf5c54405 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -158,6 +158,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 
         vcpu->stat.instruction_stfl++;
         facility_list &= ~(1UL<<24); /* no stfle */
+        facility_list &= ~(1UL<<23); /* no large pages */
 
         rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
                            &facility_list, sizeof(facility_list));
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2fa231923cf7..0bfe2bd305eb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
         account_shadowed(kvm, gfn);
 }
 
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+        u64 *spte;
+        int need_tlb_flush = 0;
+
+        while ((spte = rmap_next(kvm, rmapp, NULL))) {
+                BUG_ON(!(*spte & PT_PRESENT_MASK));
+                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+                rmap_remove(kvm, spte);
+                set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                need_tlb_flush = 1;
+        }
+        return need_tlb_flush;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+                          int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+{
+        int i;
+        int retval = 0;
+
+        /*
+         * If mmap_sem isn't taken, we can look at the memslots with only
+         * the mmu_lock by skipping over the slots with userspace_addr == 0.
+         */
+        for (i = 0; i < kvm->nmemslots; i++) {
+                struct kvm_memory_slot *memslot = &kvm->memslots[i];
+                unsigned long start = memslot->userspace_addr;
+                unsigned long end;
+
+                /* mmu_lock protects userspace_addr */
+                if (!start)
+                        continue;
+
+                end = start + (memslot->npages << PAGE_SHIFT);
+                if (hva >= start && hva < end) {
+                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                        retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+                        retval |= handler(kvm,
+                                          &memslot->lpage_info[
+                                                  gfn_offset /
+                                                  KVM_PAGES_PER_HPAGE].rmap_pde);
+                }
+        }
+
+        return retval;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+        return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+        u64 *spte;
+        int young = 0;
+
+        spte = rmap_next(kvm, rmapp, NULL);
+        while (spte) {
+                int _young;
+                u64 _spte = *spte;
+                BUG_ON(!(_spte & PT_PRESENT_MASK));
+                _young = _spte & PT_ACCESSED_MASK;
+                if (_young) {
+                        young = 1;
+                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+                }
+                spte = rmap_next(kvm, rmapp, spte);
+        }
+        return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
@@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
         int r;
         int largepage = 0;
         pfn_t pfn;
+        unsigned long mmu_seq;
 
         down_read(&current->mm->mmap_sem);
         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
                 largepage = 1;
         }
 
+        mmu_seq = vcpu->kvm->mmu_notifier_seq;
+        /* implicit mb(), we'll read before PT lock is unlocked */
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
 
@@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
         }
 
         spin_lock(&vcpu->kvm->mmu_lock);
+        if (mmu_notifier_retry(vcpu, mmu_seq))
+                goto out_unlock;
         kvm_mmu_free_some_pages(vcpu);
         r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
                          PT32E_ROOT_LEVEL);
@@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 
         return r;
+
+out_unlock:
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        kvm_release_pfn_clean(pfn);
+        return 0;
 }
 
 
@@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
         int r;
         int largepage = 0;
         gfn_t gfn = gpa >> PAGE_SHIFT;
+        unsigned long mmu_seq;
 
         ASSERT(vcpu);
         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                 largepage = 1;
         }
+        mmu_seq = vcpu->kvm->mmu_notifier_seq;
+        /* implicit mb(), we'll read before PT lock is unlocked */
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
         if (is_error_pfn(pfn)) {
@@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                 return 1;
         }
         spin_lock(&vcpu->kvm->mmu_lock);
+        if (mmu_notifier_retry(vcpu, mmu_seq))
+                goto out_unlock;
         kvm_mmu_free_some_pages(vcpu);
         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
                          largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
         spin_unlock(&vcpu->kvm->mmu_lock);
 
         return r;
+
+out_unlock:
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        kvm_release_pfn_clean(pfn);
+        return 0;
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                 vcpu->arch.update_pte.largepage = 1;
         }
+        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
+        /* implicit mb(), we'll read before PT lock is unlocked */
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d918220baeb..f72ac1fa35f0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
         pfn = vcpu->arch.update_pte.pfn;
         if (is_error_pfn(pfn))
                 return;
+        if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+                return;
         kvm_get_pfn(pfn);
         mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
                      gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         int r;
         pfn_t pfn;
         int largepage = 0;
+        unsigned long mmu_seq;
 
         pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
         kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                         largepage = 1;
                 }
         }
+        mmu_seq = vcpu->kvm->mmu_notifier_seq;
+        /* implicit mb(), we'll read before PT lock is unlocked */
         pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
         up_read(&current->mm->mmap_sem);
 
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         }
 
         spin_lock(&vcpu->kvm->mmu_lock);
+        if (mmu_notifier_retry(vcpu, mmu_seq))
+                goto out_unlock;
         kvm_mmu_free_some_pages(vcpu);
         shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                                   largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         spin_unlock(&vcpu->kvm->mmu_lock);
 
         return write_pt;
+
+out_unlock:
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        kvm_release_pfn_clean(pfn);
+        return 0;
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5916191420c7..0d682fc6aeb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_PIT:
         case KVM_CAP_NOP_IO_DELAY:
         case KVM_CAP_MP_STATE:
+        case KVM_CAP_SYNC_MMU:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -1495,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
                 goto out;
 
         down_write(&kvm->slots_lock);
+        spin_lock(&kvm->mmu_lock);
 
         p = &kvm->arch.aliases[alias->slot];
         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1506,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
                         break;
         kvm->arch.naliases = n;
 
+        spin_unlock(&kvm->mmu_lock);
         kvm_mmu_zap_all(kvm);
 
         up_write(&kvm->slots_lock);
@@ -3972,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
          */
         if (!user_alloc) {
                 if (npages && !old.rmap) {
+                        unsigned long userspace_addr;
+
                         down_write(&current->mm->mmap_sem);
-                        memslot->userspace_addr = do_mmap(NULL, 0,
-                                                          npages * PAGE_SIZE,
-                                                          PROT_READ | PROT_WRITE,
-                                                          MAP_SHARED | MAP_ANONYMOUS,
-                                                          0);
+                        userspace_addr = do_mmap(NULL, 0,
+                                                 npages * PAGE_SIZE,
+                                                 PROT_READ | PROT_WRITE,
+                                                 MAP_SHARED | MAP_ANONYMOUS,
+                                                 0);
                         up_write(&current->mm->mmap_sem);
 
-                        if (IS_ERR((void *)memslot->userspace_addr))
-                                return PTR_ERR((void *)memslot->userspace_addr);
+                        if (IS_ERR((void *)userspace_addr))
+                                return PTR_ERR((void *)userspace_addr);
+
+                        /* set userspace_addr atomically for kvm_hva_to_rmapp */
+                        spin_lock(&kvm->mmu_lock);
+                        memslot->userspace_addr = userspace_addr;
+                        spin_unlock(&kvm->mmu_lock);
                 } else {
                         if (!old.user_alloc && old.rmap) {
                                 int ret;
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index bc34dc21f178..0f3c53114614 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -251,6 +252,7 @@ struct kvm_vcpu_arch {
                 gfn_t gfn;      /* presumed gfn during guest pte update */
                 pfn_t pfn;      /* pfn corresponding to that gfn */
                 int largepage;
+                unsigned long mmu_seq;
         } update_pte;
 
         struct i387_fxsave_struct host_fx_image;
@@ -729,4 +731,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
         KVM_EX_ENTRY " 666b, 667b \n\t" \
         ".popsection"
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0ea064cbfbc8..69511f74f912 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -371,6 +371,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_PV_MMU 13
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
 
 /*
  * ioctls for VM fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 07d68a8ae8e9..8525afc53107 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -121,6 +121,12 @@ struct kvm {
         struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
         struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 #endif
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+        struct mmu_notifier mmu_notifier;
+        unsigned long mmu_notifier_seq;
+        long mmu_notifier_count;
+#endif
 };
 
 /* The guest did something we don't support. */
@@ -332,4 +338,22 @@ int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
 #define kvm_trace_cleanup() ((void)0)
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
+{
+        if (unlikely(vcpu->kvm->mmu_notifier_count))
+                return 1;
+        /*
+         * Both reads happen under the mmu_lock and both values are
+         * modified under mmu_lock, so there is no need for an smp_rmb()
+         * here in between; otherwise mmu_notifier_count would have to be
+         * read before mmu_notifier_seq, see the
+         * mmu_notifier_invalidate_range_end() write side.
+         */
+        if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+                return 1;
+        return 0;
+}
+#endif
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a845890b6800..7dd9b0b85e4e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -192,6 +192,123 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+        return container_of(mn, struct kvm, mmu_notifier);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+                                             struct mm_struct *mm,
+                                             unsigned long address)
+{
+        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+        int need_tlb_flush;
+
+        /*
+         * When ->invalidate_page runs, the linux pte has been zapped
+         * already but the page is still allocated until
+         * ->invalidate_page returns.  So if we increase the sequence
+         * here, the kvm page fault will notice that the spte can't be
+         * established because the page is going to be freed.  If
+         * instead the kvm page fault establishes the spte before
+         * ->invalidate_page runs, kvm_unmap_hva will release it
+         * before returning.
+         *
+         * The sequence increase only needs to be seen at spin_unlock
+         * time, and not at spin_lock time.
+         *
+         * Increasing the sequence after the spin_unlock would be
+         * unsafe because the kvm page fault could then establish the
+         * pte after kvm_unmap_hva returned, without noticing the page
+         * is going to be freed.
+         */
+        spin_lock(&kvm->mmu_lock);
+        kvm->mmu_notifier_seq++;
+        need_tlb_flush = kvm_unmap_hva(kvm, address);
+        spin_unlock(&kvm->mmu_lock);
+
+        /* we have to flush the tlb before the pages can be freed */
+        if (need_tlb_flush)
+                kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+                                                    struct mm_struct *mm,
+                                                    unsigned long start,
+                                                    unsigned long end)
+{
+        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+        int need_tlb_flush = 0;
+
+        spin_lock(&kvm->mmu_lock);
+        /*
+         * The count increase must become visible at unlock time, as no
+         * spte can be established without taking the mmu_lock and the
+         * count is also read inside the mmu_lock critical section.
+         */
+        kvm->mmu_notifier_count++;
+        for (; start < end; start += PAGE_SIZE)
+                need_tlb_flush |= kvm_unmap_hva(kvm, start);
+        spin_unlock(&kvm->mmu_lock);
+
+        /* we have to flush the tlb before the pages can be freed */
+        if (need_tlb_flush)
+                kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                                  struct mm_struct *mm,
+                                                  unsigned long start,
+                                                  unsigned long end)
+{
+        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+        spin_lock(&kvm->mmu_lock);
+        /*
+         * This sequence increase will notify the kvm page fault that
+         * the page that is going to be mapped in the spte could have
+         * been freed.
+         */
+        kvm->mmu_notifier_seq++;
+        /*
+         * The above sequence increase must be visible before the
+         * below count decrease, but both values are read by the kvm
+         * page fault under the mmu_lock spinlock, so we don't need to
+         * add an smp_wmb() here in between the two.
+         */
+        kvm->mmu_notifier_count--;
+        spin_unlock(&kvm->mmu_lock);
+
+        BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+                                              struct mm_struct *mm,
+                                              unsigned long address)
+{
+        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+        int young;
+
+        spin_lock(&kvm->mmu_lock);
+        young = kvm_age_hva(kvm, address);
+        spin_unlock(&kvm->mmu_lock);
+
+        if (young)
+                kvm_flush_remote_tlbs(kvm);
+
+        return young;
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+        .invalidate_page        = kvm_mmu_notifier_invalidate_page,
+        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
+        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+};
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+
 static struct kvm *kvm_create_vm(void)
 {
         struct kvm *kvm = kvm_arch_create_vm();
@@ -212,6 +329,21 @@ static struct kvm *kvm_create_vm(void)
                 (struct kvm_coalesced_mmio_ring *)page_address(page);
 #endif
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+        {
+                int err;
+                kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+                err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+                if (err) {
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+                        put_page(page);
+#endif
+                        kfree(kvm);
+                        return ERR_PTR(err);
+                }
+        }
+#endif
+
 kvm->mm = current->mm;
 	kvm->mm = current->mm;
         atomic_inc(&kvm->mm->mm_count);
         spin_lock_init(&kvm->mmu_lock);
@@ -272,6 +404,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
         if (kvm->coalesced_mmio_ring != NULL)
                 free_page((unsigned long)kvm->coalesced_mmio_ring);
 #endif
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#endif
         kvm_arch_destroy_vm(kvm);
         mmdrop(mm);
 }
@@ -375,7 +510,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
                 memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
                 new.user_alloc = user_alloc;
-                new.userspace_addr = mem->userspace_addr;
+                /*
+                 * hva_to_rmmap() serializes with the mmu_lock and, to be
+                 * safe, it has to ignore memslots with !user_alloc &&
+                 * !userspace_addr.
+                 */
+                if (user_alloc)
+                        new.userspace_addr = mem->userspace_addr;
+                else
+                        new.userspace_addr = 0;
         }
         if (npages && !new.lpage_info) {
                 int largepages = npages / KVM_PAGES_PER_HPAGE;
@@ -408,17 +551,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
         }
 #endif /* not defined CONFIG_S390 */
 
-        if (mem->slot >= kvm->nmemslots)
-                kvm->nmemslots = mem->slot + 1;
-
         if (!npages)
                 kvm_arch_flush_shadow(kvm);
 
+        spin_lock(&kvm->mmu_lock);
+        if (mem->slot >= kvm->nmemslots)
+                kvm->nmemslots = mem->slot + 1;
+
         *memslot = new;
+        spin_unlock(&kvm->mmu_lock);
 
         r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
         if (r) {
+                spin_lock(&kvm->mmu_lock);
                 *memslot = old;
+                spin_unlock(&kvm->mmu_lock);
                 goto out_free;
         }
 
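
The page-fault side of this patch follows one pattern in nonpaging_map(), tdp_page_fault() and FNAME(page_fault): snapshot mmu_notifier_seq before the sleepable gfn_to_pfn() call, then re-check it together with mmu_notifier_count under mmu_lock via mmu_notifier_retry() before installing the spte. The program below is an illustrative sketch only, not part of the patch: a self-contained userspace C model of that seq/count protocol. fake_kvm, fault_path() and invalidate_range() are hypothetical stand-ins for struct kvm, the KVM page fault and the invalidate_range_start/end callbacks; a pthread mutex stands in for mmu_lock, and the real spte zapping, TLB flushing and memory barriers are elided.

#include <pthread.h>
#include <stdio.h>

struct fake_kvm {
        pthread_mutex_t mmu_lock;
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
};

/* Mirrors mmu_notifier_retry(): both fields are meant to be read under mmu_lock. */
static int retry_needed(struct fake_kvm *kvm, unsigned long mmu_seq)
{
        if (kvm->mmu_notifier_count)            /* an invalidate is in flight */
                return 1;
        if (kvm->mmu_notifier_seq != mmu_seq)   /* an invalidate completed since the snapshot */
                return 1;
        return 0;
}

/* Models invalidate_range_start()/end(): raise count, zap, then bump seq and drop count. */
static void invalidate_range(struct fake_kvm *kvm)
{
        pthread_mutex_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_count++;
        /* ... kvm_unmap_hva() would zap the sptes for the range here ... */
        pthread_mutex_unlock(&kvm->mmu_lock);

        /* the caller may free the pages at this point */

        pthread_mutex_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_seq++;
        kvm->mmu_notifier_count--;
        pthread_mutex_unlock(&kvm->mmu_lock);
}

/* Models the fault path: snapshot seq, resolve the pfn outside the lock,
 * and only install the mapping if no invalidate raced with us. */
static int fault_path(struct fake_kvm *kvm)
{
        unsigned long mmu_seq = kvm->mmu_notifier_seq;

        /* gfn_to_pfn() would run here, outside mmu_lock, possibly sleeping */

        pthread_mutex_lock(&kvm->mmu_lock);
        if (retry_needed(kvm, mmu_seq)) {
                pthread_mutex_unlock(&kvm->mmu_lock);
                return -1;      /* drop the pfn and let the guest refault */
        }
        /* ... __direct_map()/FNAME(fetch) would establish the spte here ... */
        pthread_mutex_unlock(&kvm->mmu_lock);
        return 0;
}

int main(void)
{
        static struct fake_kvm kvm = { .mmu_lock = PTHREAD_MUTEX_INITIALIZER };
        unsigned long stale_seq;

        printf("clean fault: %s\n", fault_path(&kvm) ? "retried" : "mapped");

        /* a fault that snapshots seq and then loses a race with an invalidate */
        stale_seq = kvm.mmu_notifier_seq;
        invalidate_range(&kvm);
        pthread_mutex_lock(&kvm.mmu_lock);
        printf("racing fault: %s\n",
               retry_needed(&kvm, stale_seq) ? "retried" : "mapped");
        pthread_mutex_unlock(&kvm.mmu_lock);
        return 0;
}

Built with something like "cc -pthread retry_model.c" (file name hypothetical), the second fault reports "retried" because its seq snapshot predates the invalidate, which is exactly why the out_unlock paths above release the pfn instead of mapping it.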