Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--	virt/kvm/kvm_main.c	235
1 file changed, 163 insertions(+), 72 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f5283438ee05..cc6a25d95fbf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,6 +66,9 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+unsigned int halt_poll_ns = 0;
+module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
+
 /*
  * Ordering of locks:
  *
@@ -89,7 +92,7 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
 #endif
@@ -176,6 +179,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
	return called;
 }
 
+#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
	long dirty_count = kvm->tlbs_dirty;
@@ -186,6 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
+#endif
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
@@ -466,7 +471,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
	r = -ENOMEM;
-	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+	kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots));
	if (!kvm->memslots)
		goto out_err_no_srcu;
 
@@ -517,7 +522,7 @@ out_err_no_srcu:
 out_err_no_disable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
-	kfree(kvm->memslots);
+	kvfree(kvm->memslots);
	kvm_arch_free_vm(kvm);
	return ERR_PTR(r);
 }
@@ -573,7 +578,7 @@ static void kvm_free_physmem(struct kvm *kvm)
	kvm_for_each_memslot(memslot, slots)
		kvm_free_physmem_slot(kvm, memslot, NULL);
 
-	kfree(kvm->memslots);
+	kvfree(kvm->memslots);
 }
 
 static void kvm_destroy_devices(struct kvm *kvm)
@@ -671,7 +676,9 @@ static void update_memslots(struct kvm_memslots *slots,
 
	WARN_ON(mslots[i].id != id);
	if (!new->npages) {
+		WARN_ON(!mslots[i].npages);
		new->base_gfn = 0;
+		new->flags = 0;
		if (mslots[i].npages)
			slots->used_slots--;
	} else {
@@ -687,12 +694,25 @@ static void update_memslots(struct kvm_memslots *slots,
		slots->id_to_index[mslots[i].id] = i;
		i++;
	}
-	while (i > 0 &&
-	       new->base_gfn > mslots[i - 1].base_gfn) {
-		mslots[i] = mslots[i - 1];
-		slots->id_to_index[mslots[i].id] = i;
-		i--;
-	}
+
+	/*
+	 * The ">=" is needed when creating a slot with base_gfn == 0,
+	 * so that it moves before all those with base_gfn == npages == 0.
+	 *
+	 * On the other hand, if new->npages is zero, the above loop has
+	 * already left i pointing to the beginning of the empty part of
+	 * mslots, and the ">=" would move the hole backwards in this
+	 * case---which is wrong.  So skip the loop when deleting a slot.
+	 */
+	if (new->npages) {
+		while (i > 0 &&
+		       new->base_gfn >= mslots[i - 1].base_gfn) {
+			mslots[i] = mslots[i - 1];
+			slots->id_to_index[mslots[i].id] = i;
+			i--;
+		}
+	} else
+		WARN_ON_ONCE(i != slots->used_slots);
 
	mslots[i] = *new;
	slots->id_to_index[mslots[i].id] = i;
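
The comment added above describes the ordering rule for the memslot array: live slots are kept sorted by base_gfn in descending order, unused entries (npages == 0) sit at the tail, and the comparison must be ">=" so that a newly created slot at base_gfn == 0 still moves ahead of the all-zero unused entries. The stand-alone sketch below (hypothetical struct and function names, no id_to_index bookkeeping, plain user-space C) illustrates that reordering and the ">=" corner case; it is only an illustration under those assumptions, not the kernel's code.

#include <stdio.h>

#define NSLOTS 4

struct slot {
	int id;
	unsigned long base_gfn;   /* first guest frame number */
	unsigned long npages;     /* 0 means "unused slot" */
};

/*
 * Bubble the just-updated entry at index i into place so the array stays
 * sorted by base_gfn, largest first, with unused entries at the tail.
 * Mirrors the shape of update_memslots(), not its exact code.
 */
static void reorder(struct slot *s, int i)
{
	struct slot tmp = s[i];

	/* Slide later live slots over this one if they must come first. */
	while (i < NSLOTS - 1 && s[i + 1].npages &&
	       (!tmp.npages || s[i + 1].base_gfn > tmp.base_gfn)) {
		s[i] = s[i + 1];
		i++;
	}
	/*
	 * Live slot: move it up past smaller-or-equal base_gfns.  The ">="
	 * lets a slot with base_gfn == 0 pass the all-zero unused entries;
	 * it is skipped for a deletion so the hole is not dragged backwards.
	 */
	if (tmp.npages) {
		while (i > 0 && tmp.base_gfn >= s[i - 1].base_gfn) {
			s[i] = s[i - 1];
			i--;
		}
	}
	s[i] = tmp;
}

int main(void)
{
	/* sorted: live slots by descending base_gfn, unused ones at the end */
	struct slot s[NSLOTS] = {
		{ .id = 0, .base_gfn = 0x100, .npages = 16 },
		{ .id = 1, .base_gfn = 0,     .npages = 0  },  /* unused */
		{ .id = 2, .base_gfn = 0,     .npages = 0  },  /* unused */
		{ .id = 3, .base_gfn = 0,     .npages = 0  },  /* unused */
	};

	/* create slot 2 (currently parked at index 2) at base_gfn 0 */
	s[2] = (struct slot){ .id = 2, .base_gfn = 0, .npages = 8 };
	reorder(s, 2);

	for (int i = 0; i < NSLOTS; i++)
		printf("slot %d: id=%d base_gfn=%#lx npages=%lu\n",
		       i, s[i].id, s[i].base_gfn, s[i].npages);
	return 0;
}

With ">=" the new base_gfn == 0 slot ends up at index 1, ahead of the unused entries; with ">" it would stay behind them, which is the situation the kernel comment is guarding against.
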
@@ -851,10 +871,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
		goto out_free;
	}
 
-	slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
-			GFP_KERNEL);
+	slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
	if (!slots)
		goto out_free;
+	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 
	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
		slot = id_to_memslot(slots, mem->slot);
@@ -897,7 +917,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
	kvm_arch_commit_memory_region(kvm, mem, &old, change);
 
	kvm_free_physmem_slot(kvm, &old, &new);
-	kfree(old_memslots);
+	kvfree(old_memslots);
 
	/*
	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
@@ -916,7 +936,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
	return 0;
 
 out_slots:
-	kfree(slots);
+	kvfree(slots);
 out_free:
	kvm_free_physmem_slot(kvm, &new, &old);
 out:
@@ -979,6 +999,86 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ *	are dirty write protect them for next write.
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address to which we copy the log
+ * @is_dirty:	flag set if any page is dirty
+ *
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing track of dirty pages we keep the
+ * following order:
+ *
+ *    1. Take a snapshot of the bit and clear it if needed.
+ *    2. Write protect the corresponding page.
+ *    3. Copy the snapshot to the userspace.
+ *    4. Upon return caller flushes TLB's if needed.
+ *
+ * Between 2 and 4, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page is reported dirty using
+ * the snapshot taken before and step 4 ensures that writes done after
+ * exiting to userspace will be logged for the next call.
+ *
+ */
+int kvm_get_dirty_log_protect(struct kvm *kvm,
+			struct kvm_dirty_log *log, bool *is_dirty)
+{
+	struct kvm_memory_slot *memslot;
+	int r, i;
+	unsigned long n;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	r = -EINVAL;
+	if (log->slot >= KVM_USER_MEM_SLOTS)
+		goto out;
+
+	memslot = id_to_memslot(kvm->memslots, log->slot);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	r = -ENOENT;
+	if (!dirty_bitmap)
+		goto out;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+
+	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	memset(dirty_bitmap_buffer, 0, n);
+
+	spin_lock(&kvm->mmu_lock);
+	*is_dirty = false;
+	for (i = 0; i < n / sizeof(long); i++) {
+		unsigned long mask;
+		gfn_t offset;
+
+		if (!dirty_bitmap[i])
+			continue;
+
+		*is_dirty = true;
+
+		mask = xchg(&dirty_bitmap[i], 0);
+		dirty_bitmap_buffer[i] = mask;
+
+		offset = i * BITS_PER_LONG;
+		kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
+							mask);
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		goto out;
+
+	r = 0;
+out:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+#endif
+
 bool kvm_largepages_enabled(void)
 {
	return largepages_enabled;
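
The kerneldoc above spells out the snapshot ordering that keeps the dirty log lossless while vCPU threads keep setting bits. Below is a minimal user-space sketch of step 1 and step 3 of that ordering, using C11 atomics in place of the kernel's xchg(); the bitmap size and helper names are invented for the example, and the write-protect step is only noted in a comment.

#include <stdatomic.h>
#include <stdio.h>

#define BITMAP_LONGS 4

static _Atomic unsigned long dirty_bitmap[BITMAP_LONGS];
static unsigned long snapshot[BITMAP_LONGS];

/* A "vCPU" marking guest frame number gfn dirty. */
static void mark_page_dirty(unsigned long gfn)
{
	atomic_fetch_or(&dirty_bitmap[gfn / (8 * sizeof(unsigned long))],
			1UL << (gfn % (8 * sizeof(unsigned long))));
}

/* Userspace asking for the dirty log: snapshot and clear, then report. */
static int get_dirty_log(void)
{
	int any = 0;

	for (int i = 0; i < BITMAP_LONGS; i++) {
		/* step 1: take a snapshot of the word and clear it atomically */
		unsigned long mask = atomic_exchange(&dirty_bitmap[i], 0);

		snapshot[i] = mask;
		if (mask)
			any = 1;
		/* step 2 (write-protecting the reported pages) is omitted here */
	}
	/* step 3: the snapshot[] array is what would be copied to userspace */
	return any;
}

int main(void)
{
	mark_page_dirty(3);
	mark_page_dirty(65);

	if (get_dirty_log())
		for (int i = 0; i < BITMAP_LONGS; i++)
			printf("word %d: %#lx\n", i, snapshot[i]);
	return 0;
}

Because each word is fetched and cleared in a single atomic operation, a bit set concurrently by a vCPU either lands in this snapshot or stays in the live bitmap for the next call; that is the property steps 1-4 of the comment rely on.
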
@@ -1114,43 +1214,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
-int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
-			 unsigned long addr, bool write_fault,
-			 struct page **pagep)
-{
-	int npages;
-	int locked = 1;
-	int flags = FOLL_TOUCH | FOLL_HWPOISON |
-		    (pagep ? FOLL_GET : 0) |
-		    (write_fault ? FOLL_WRITE : 0);
-
-	/*
-	 * If retrying the fault, we get here *not* having allowed the filemap
-	 * to wait on the page lock. We should now allow waiting on the IO with
-	 * the mmap semaphore released.
-	 */
-	down_read(&mm->mmap_sem);
-	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
-				  &locked);
-	if (!locked) {
-		VM_BUG_ON(npages);
-
-		if (!pagep)
-			return 0;
-
-		/*
-		 * The previous call has now waited on the IO. Now we can
-		 * retry and complete. Pass TRIED to ensure we do not re
-		 * schedule async IO (see e.g. filemap_fault).
-		 */
-		down_read(&mm->mmap_sem);
-		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
-					  pagep, NULL, NULL);
-	}
-	up_read(&mm->mmap_sem);
-	return npages;
-}
-
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1213,15 +1276,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
		npages = get_user_page_nowait(current, current->mm,
					      addr, write_fault, page);
		up_read(&current->mm->mmap_sem);
-	} else {
-		/*
-		 * By now we have tried gup_fast, and possibly async_pf, and we
-		 * are certainly not atomic.  Time to retry the gup, allowing
-		 * mmap semaphore to be relinquished in the case of IO.
-		 */
-		npages = kvm_get_user_page_io(current, current->mm, addr,
-					      write_fault, page);
-	}
+	} else
+		npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
+						   write_fault, 0, page,
+						   FOLL_TOUCH|FOLL_HWPOISON);
	if (npages != 1)
		return npages;
 
@@ -1579,6 +1637,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
	}
	return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest);
 
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
@@ -1715,29 +1774,60 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
+static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
+{
+	if (kvm_arch_vcpu_runnable(vcpu)) {
+		kvm_make_request(KVM_REQ_UNHALT, vcpu);
+		return -EINTR;
+	}
+	if (kvm_cpu_has_pending_timer(vcpu))
+		return -EINTR;
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
+	ktime_t start, cur;
	DEFINE_WAIT(wait);
+	bool waited = false;
+
+	start = cur = ktime_get();
+	if (halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+		do {
+			/*
+			 * This sets KVM_REQ_UNHALT if an interrupt
+			 * arrives.
+			 */
+			if (kvm_vcpu_check_block(vcpu) < 0) {
+				++vcpu->stat.halt_successful_poll;
+				goto out;
+			}
+			cur = ktime_get();
+		} while (single_task_running() && ktime_before(cur, stop));
+	}
 
	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-		if (kvm_arch_vcpu_runnable(vcpu)) {
-			kvm_make_request(KVM_REQ_UNHALT, vcpu);
-			break;
-		}
-		if (kvm_cpu_has_pending_timer(vcpu))
-			break;
-		if (signal_pending(current))
+		if (kvm_vcpu_check_block(vcpu) < 0)
			break;
 
+		waited = true;
		schedule();
	}
 
	finish_wait(&vcpu->wq, &wait);
+	cur = ktime_get();
+
+out:
+	trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
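The reworked kvm_vcpu_block() first busy-polls the wake-up condition for up to halt_poll_ns nanoseconds (counting a hit in halt_successful_poll) and only then falls back to the usual wait-queue sleep. Below is a stand-alone user-space sketch of that poll-then-block shape; the producer thread, the flag, and the poll_ns value are made up for the example, and the "block" phase is a plain sleep loop rather than prepare_to_wait()/schedule().

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static atomic_int work_pending;        /* stands in for "vCPU is runnable" */
static unsigned long poll_ns = 500000; /* plays the role of halt_poll_ns */

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void *producer(void *arg)
{
	(void)arg;
	usleep(100);                   /* wake-up arrives during the poll window */
	atomic_store(&work_pending, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;
	long long start;

	pthread_create(&t, NULL, producer, NULL);

	start = now_ns();
	if (poll_ns) {
		long long stop = now_ns() + poll_ns;

		/* poll phase: spin on the condition for a bounded time */
		do {
			if (atomic_load(&work_pending)) {
				printf("poll succeeded after %lld ns\n",
				       now_ns() - start);
				goto out;
			}
		} while (now_ns() < stop);
	}

	/* block phase: a real implementation would sleep on a wait queue here */
	while (!atomic_load(&work_pending))
		usleep(1000);
	printf("blocked and woke up after %lld ns\n", now_ns() - start);
out:
	pthread_join(t, NULL);
	return 0;
}

On a lightly loaded host this turns a short halt into a brief spin instead of a full schedule()/wake-up round trip, which is the latency the kernel change targets; the real code additionally bails out of the spin via single_task_running() when another task wants the CPU.
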
@@ -1920,7 +2010,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
	.compat_ioctl   = kvm_vcpu_compat_ioctl,
 #endif
	.mmap           = kvm_vcpu_mmap,
@@ -2210,7 +2300,7 @@ out:
	return r;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
 static long kvm_vcpu_compat_ioctl(struct file *filp,
				  unsigned int ioctl, unsigned long arg)
 {
@@ -2302,7 +2392,7 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
 
 static const struct file_operations kvm_device_fops = {
	.unlocked_ioctl = kvm_device_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
	.compat_ioctl = kvm_device_ioctl,
 #endif
	.release = kvm_device_release,
@@ -2402,6 +2492,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
	case KVM_CAP_SIGNAL_MSI:
 #endif
 #ifdef CONFIG_HAVE_KVM_IRQFD
+	case KVM_CAP_IRQFD:
	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
	case KVM_CAP_CHECK_EXTENSION_VM:
@@ -2589,7 +2680,7 @@ out:
	return r;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
 struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
@@ -2636,7 +2727,7 @@ out:
 static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
	.llseek		= noop_llseek,