path: root/virt/kvm/kvm_main.c
author    Linus Torvalds <torvalds@linux-foundation.org>    2014-10-08 05:27:39 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-10-08 05:27:39 -0400
commit    e4e65676f272adb63655a2ca95207e8212d282f1 (patch)
tree      3679a3e6897d698ee949642660281e7f74e2852b /virt/kvm/kvm_main.c
parent    f89f4a06a59f30dec64b2afc4111426fc01e9e12 (diff)
parent    f439ed27f8b8b90d243ae15acb193d37f96eebe0 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "Fixes and features for 3.18. Apart from the usual cleanups, here is the
  summary of new features:

   - s390 moves closer towards host large page support

   - PowerPC has improved support for debugging (both inside the guest and
     via gdbstub) and support for e6500 processors

   - ARM/ARM64 support read-only memory (which is necessary to put firmware
     in emulated NOR flash)

   - x86 has the usual emulator fixes and nested virtualization improvements
     (including improved Windows support on Intel and Jailhouse hypervisor
     support on AMD), adaptive PLE which helps overcommitting of huge guests.
     Also included are some patches that make KVM more friendly to memory
     hot-unplug, and fixes for rare caching bugs.

  Two patches have trivial mm/ parts that were acked by Rik and Andrew.

  Note: I will soon switch to a subkey for signing purposes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (157 commits)
  kvm: do not handle APIC access page if in-kernel irqchip is not in use
  KVM: s390: count vcpu wakeups in stat.halt_wakeup
  KVM: s390/facilities: allow TOD-CLOCK steering facility bit
  KVM: PPC: BOOK3S: HV: CMA: Reserve cma region only in hypervisor mode
  arm/arm64: KVM: Report correct FSC for unsupported fault types
  arm/arm64: KVM: Fix VTTBR_BADDR_MASK and pgd alloc
  kvm: Fix kvm_get_page_retry_io __gup retval check
  arm/arm64: KVM: Fix set_clear_sgi_pend_reg offset
  kvm: x86: Unpin and remove kvm_arch->apic_access_page
  kvm: vmx: Implement set_apic_access_page_addr
  kvm: x86: Add request bit to reload APIC access page address
  kvm: Add arch specific mmu notifier for page invalidation
  kvm: Rename make_all_cpus_request() to kvm_make_all_cpus_request() and make it non-static
  kvm: Fix page ageing bugs
  kvm/x86/mmu: Pass gfn and level to rmapp callback.
  x86: kvm: use alternatives for VMCALL vs. VMMCALL if kernel text is read-only
  kvm: x86: use macros to compute bank MSRs
  KVM: x86: Remove debug assertion of non-PAE reserved bits
  kvm: don't take vcpu mutex for obviously invalid vcpu ioctls
  kvm: Faults which trigger IO release the mmap_sem
  ...
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--    virt/kvm/kvm_main.c    192
1 file changed, 133 insertions(+), 59 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 95519bc959ed..384eaa7b02fa 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -52,11 +52,13 @@
 
 #include <asm/processor.h>
 #include <asm/io.h>
+#include <asm/ioctl.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "vfio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -95,8 +97,6 @@ static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-static void update_memslots(struct kvm_memslots *slots,
-                            struct kvm_memory_slot *new, u64 last_generation);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu)
 		struct pid *oldpid = vcpu->pid;
 		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
 		rcu_assign_pointer(vcpu->pid, newpid);
-		synchronize_rcu();
+		if (oldpid)
+			synchronize_rcu();
 		put_pid(oldpid);
 	}
 	cpu = get_cpu();
@@ -152,7 +153,7 @@ static void ack_flush(void *_completed)
 {
 }
 
-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
 	int i, cpu, me;
 	cpumask_var_t cpus;
@@ -189,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	long dirty_count = kvm->tlbs_dirty;
 
 	smp_mb();
-	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+	if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
@@ -197,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 }
 
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -295,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	kvm_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
+
+	kvm_arch_mmu_notifier_invalidate_page(kvm, address);
+
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -368,7 +372,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 					      struct mm_struct *mm,
-					      unsigned long address)
+					      unsigned long start,
+					      unsigned long end)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int young, idx;
@@ -376,7 +381,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 
-	young = kvm_age_hva(kvm, address);
+	young = kvm_age_hva(kvm, start, end);
 	if (young)
 		kvm_flush_remote_tlbs(kvm);
 
@@ -476,6 +481,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!kvm->memslots)
 		goto out_err_no_srcu;
+
+	/*
+	 * Init kvm generation close to the maximum to easily test the
+	 * code of handling generation number wrap-around.
+	 */
+	kvm->memslots->generation = -150;
+
 	kvm_init_memslots_id(kvm);
 	if (init_srcu_struct(&kvm->srcu))
 		goto out_err_no_srcu;
@@ -687,8 +699,7 @@ static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new,
-			    u64 last_generation)
+			    struct kvm_memory_slot *new)
 {
 	if (new) {
 		int id = new->id;
@@ -699,15 +710,13 @@ static void update_memslots(struct kvm_memslots *slots,
 		if (new->npages != npages)
 			sort_memslots(slots);
 	}
-
-	slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef KVM_CAP_READONLY_MEM
+#ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
 
@@ -722,10 +731,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
 	struct kvm_memslots *old_memslots = kvm->memslots;
 
-	update_memslots(slots, new, kvm->memslots->generation);
+	/*
+	 * Set the low bit in the generation, which disables SPTE caching
+	 * until the end of synchronize_srcu_expedited.
+	 */
+	WARN_ON(old_memslots->generation & 1);
+	slots->generation = old_memslots->generation + 1;
+
+	update_memslots(slots, new);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
 
+	/*
+	 * Increment the new memslot generation a second time. This prevents
+	 * vm exits that race with memslot updates from caching a memslot
+	 * generation that will (potentially) be valid forever.
+	 */
+	slots->generation++;
+
 	kvm_arch_memslots_updated(kvm);
 
 	return old_memslots;
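
The two generation bumps above are why any odd generation value means "memslot update in flight". As a rough illustration only (not code from this patch; kvm_memslots() and the srcu locking are existing KVM infrastructure), a consumer that tags cached data with the generation could validate it like this:

/*
 * Illustrative sketch, not part of this diff: data cached under a memslot
 * generation is stale once the generation moves on, and must never be
 * cached while the low bit is set, because install_new_memslots() is then
 * between its two increments and the published slots are transient.
 */
static bool cached_generation_is_current(struct kvm *kvm, u64 cached_gen)
{
	u64 gen = kvm_memslots(kvm)->generation;	/* caller holds kvm->srcu */

	if (gen & 1)	/* update in progress: do not trust or cache */
		return false;
	return gen == cached_gen;
}
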
@@ -776,7 +799,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 	npages = mem->memory_size >> PAGE_SHIFT;
 
-	r = -EINVAL;
 	if (npages > KVM_MEM_MAX_NR_PAGES)
 		goto out;
 
@@ -790,7 +812,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	new.npages = npages;
 	new.flags = mem->flags;
 
-	r = -EINVAL;
 	if (npages) {
 		if (!old.npages)
 			change = KVM_MR_CREATE;
@@ -846,7 +867,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	}
 
 	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-		r = -ENOMEM;
 		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 				GFP_KERNEL);
 		if (!slots)
@@ -1075,9 +1095,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
  */
-unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+				      gfn_t gfn, bool *writable)
 {
-	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
 	if (!kvm_is_error_hva(hva) && writable)
@@ -1086,6 +1106,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
 	return hva;
 }
 
+unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
 static int kvm_read_hva(void *data, void __user *hva, int len)
 {
 	return __copy_from_user(data, hva, len);
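
Splitting gfn_to_hva_prot() this way lets callers that already hold a memslot skip the gfn_to_memslot() lookup. A hedged usage sketch (the helper name is made up for illustration; gfn_to_hva_memslot_prot(), kvm_is_error_hva() and __copy_from_user() are the existing interfaces):

/* Hypothetical helper, illustration only: read guest memory through a
 * memslot the caller has already resolved, honouring read-only slots. */
static int read_guest_via_slot(struct kvm_memory_slot *slot, gfn_t gfn,
			       void *data, int offset, int len)
{
	bool writable;
	unsigned long hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);

	if (kvm_is_error_hva(hva))
		return -EFAULT;
	/* writable == false means the hva may only be read, which is all we do */
	return __copy_from_user(data, (void __user *)(hva + offset), len) ?
	       -EFAULT : 0;
}
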
@@ -1107,6 +1134,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+			 unsigned long addr, bool write_fault,
+			 struct page **pagep)
+{
+	int npages;
+	int locked = 1;
+	int flags = FOLL_TOUCH | FOLL_HWPOISON |
+		    (pagep ? FOLL_GET : 0) |
+		    (write_fault ? FOLL_WRITE : 0);
+
+	/*
+	 * If retrying the fault, we get here *not* having allowed the filemap
+	 * to wait on the page lock. We should now allow waiting on the IO with
+	 * the mmap semaphore released.
+	 */
+	down_read(&mm->mmap_sem);
+	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+				  &locked);
+	if (!locked) {
+		VM_BUG_ON(npages);
+
+		if (!pagep)
+			return 0;
+
+		/*
+		 * The previous call has now waited on the IO. Now we can
+		 * retry and complete. Pass TRIED to ensure we do not re
+		 * schedule async IO (see e.g. filemap_fault).
+		 */
+		down_read(&mm->mmap_sem);
+		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+					  pagep, NULL, NULL);
+	}
+	up_read(&mm->mmap_sem);
+	return npages;
+}
+
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1169,9 +1233,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		npages = get_user_page_nowait(current, current->mm,
 					      addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
-	} else
-		npages = get_user_pages_fast(addr, 1, write_fault,
-					     page);
+	} else {
+		/*
+		 * By now we have tried gup_fast, and possibly async_pf, and we
+		 * are certainly not atomic. Time to retry the gup, allowing
+		 * mmap semaphore to be relinquished in the case of IO.
+		 */
+		npages = kvm_get_user_page_io(current, current->mm, addr,
+					      write_fault, page);
+	}
 	if (npages != 1)
 		return npages;
 
@@ -1768,8 +1838,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 	bool eligible;
 
 	eligible = !vcpu->spin_loop.in_spin_loop ||
-		    (vcpu->spin_loop.in_spin_loop &&
-		     vcpu->spin_loop.dy_eligible);
+		    vcpu->spin_loop.dy_eligible;
 
 	if (vcpu->spin_loop.in_spin_loop)
 		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -1975,6 +2044,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
 
+	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+		return -EINVAL;
+
 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
 	/*
 	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
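
For reference, every KVM ioctl encodes the KVMIO magic in its type byte, which is what makes this cheap fast-fail possible before the vcpu mutex is taken. A small illustration (KVMIO and KVM_RUN are quoted from include/uapi/linux/kvm.h; the helper itself is hypothetical):

/* #define KVMIO 0xAE and, e.g., #define KVM_RUN _IO(KVMIO, 0x80), so the
 * type byte of a genuine KVM ioctl is always KVMIO; a stray TCGETS (type 'T')
 * aimed at a vcpu fd now fails without serializing on vcpu->mutex. */
static inline bool ioctl_is_for_kvm(unsigned int ioctl)
{
	return _IOC_TYPE(ioctl) == KVMIO;
}
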
@@ -2259,6 +2331,29 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
 	return filp->private_data;
 }
 
+static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+#ifdef CONFIG_KVM_MPIC
+	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
+	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
+#endif
+
+#ifdef CONFIG_KVM_XICS
+	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
+#endif
+};
+
+int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+{
+	if (type >= ARRAY_SIZE(kvm_device_ops_table))
+		return -ENOSPC;
+
+	if (kvm_device_ops_table[type] != NULL)
+		return -EEXIST;
+
+	kvm_device_ops_table[type] = ops;
+	return 0;
+}
+
 static int kvm_ioctl_create_device(struct kvm *kvm,
 				   struct kvm_create_device *cd)
 {
@@ -2267,36 +2362,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
 	int ret;
 
-	switch (cd->type) {
-#ifdef CONFIG_KVM_MPIC
-	case KVM_DEV_TYPE_FSL_MPIC_20:
-	case KVM_DEV_TYPE_FSL_MPIC_42:
-		ops = &kvm_mpic_ops;
-		break;
-#endif
-#ifdef CONFIG_KVM_XICS
-	case KVM_DEV_TYPE_XICS:
-		ops = &kvm_xics_ops;
-		break;
-#endif
-#ifdef CONFIG_KVM_VFIO
-	case KVM_DEV_TYPE_VFIO:
-		ops = &kvm_vfio_ops;
-		break;
-#endif
-#ifdef CONFIG_KVM_ARM_VGIC
-	case KVM_DEV_TYPE_ARM_VGIC_V2:
-		ops = &kvm_arm_vgic_v2_ops;
-		break;
-#endif
-#ifdef CONFIG_S390
-	case KVM_DEV_TYPE_FLIC:
-		ops = &kvm_flic_ops;
-		break;
-#endif
-	default:
+	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
+		return -ENODEV;
+
+	ops = kvm_device_ops_table[cd->type];
+	if (ops == NULL)
 		return -ENODEV;
-	}
 
 	if (test)
 		return 0;
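
With the table-driven lookup above, a device backend registers itself once instead of adding another case to the old switch. A sketch of such a registration (kvm_arm_vgic_v2_ops and KVM_DEV_TYPE_ARM_VGIC_V2 already exist; the wrapper function is only illustrative, not the actual vgic init path):

/* Illustration only: publish the ARM vgic-v2 backend through the new table
 * rather than hard-coding it in kvm_ioctl_create_device(). */
static int example_register_vgic_device(void)
{
	/* 0 on success; -EEXIST if the slot is taken, -ENOSPC if out of range */
	return kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
				       KVM_DEV_TYPE_ARM_VGIC_V2);
}
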
@@ -2611,7 +2682,6 @@ static long kvm_dev_ioctl(struct file *filp,
 
 	switch (ioctl) {
 	case KVM_GET_API_VERSION:
-		r = -EINVAL;
 		if (arg)
 			goto out;
 		r = KVM_API_VERSION;
@@ -2623,7 +2693,6 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
 		break;
 	case KVM_GET_VCPU_MMAP_SIZE:
-		r = -EINVAL;
 		if (arg)
 			goto out;
 		r = PAGE_SIZE;     /* struct kvm_run */
@@ -2668,7 +2737,7 @@ static void hardware_enable_nolock(void *junk)
 
 	cpumask_set_cpu(cpu, cpus_hardware_enabled);
 
-	r = kvm_arch_hardware_enable(NULL);
+	r = kvm_arch_hardware_enable();
 
 	if (r) {
 		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
@@ -2693,7 +2762,7 @@ static void hardware_disable_nolock(void *junk)
 	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
 		return;
 	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-	kvm_arch_hardware_disable(NULL);
+	kvm_arch_hardware_disable();
 }
 
 static void hardware_disable(void)
@@ -3123,6 +3192,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 	if (vcpu->preempted)
 		vcpu->preempted = false;
 
+	kvm_arch_sched_in(vcpu, cpu);
+
 	kvm_arch_vcpu_load(vcpu, cpu);
 }
 
@@ -3214,6 +3285,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_undebugfs;
 	}
 
+	r = kvm_vfio_ops_init();
+	WARN_ON(r);
+
 	return 0;
 
 out_undebugfs: