/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <kvm/iodev.h>
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/ioctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static unsigned int halt_poll_ns;
module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
/*
* Ordering of locks:
*
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
DEFINE_SPINLOCK(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
static __read_mostly struct preempt_ops kvm_preempt_ops;
struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static void kvm_release_pfn_dirty(pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm *kvm,
struct kvm_memory_slot *memslot, gfn_t gfn);
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
bool kvm_is_reserved_pfn(pfn_t pfn)
{
if (pfn_valid(pfn))
return PageReserved(pfn_to_page(pfn));
return true;
}
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
if (mutex_lock_killable(&vcpu->mutex))
return -EINTR;
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
{
preempt_disable();
kvm_arch_vcpu_put(vcpu);
preempt_notifier_unregister(&vcpu->preempt_notifier);
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
static void ack_flush(void *_completed)
{
}
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
int i, cpu, me;
cpumask_var_t cpus;
bool called = true;
struct kvm_vcpu *vcpu;
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(req, vcpu);
cpu = vcpu->cpu;
/* Set ->requests bit before we read ->mode */
smp_mb();
if (cpus != NULL && cpu != -1 && cpu != me &&
kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
cpumask_set_cpu(cpu, cpus);
}
if (unlikely(cpus == NULL))
smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
else if (!cpumask_empty(cpus))
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
put_cpu();
free_cpumask_var(cpus);
return called;
}
#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
long dirty_count = kvm->tlbs_dirty;
smp_mb();
if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif
void kvm_reload_remote_mmus(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}
void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
struct page *page;
int r;
mutex_init(&vcpu->mutex);
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
vcpu->pid = NULL;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto fail;
}
vcpu->run = page_address(page);
kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false;
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
return 0;
fail_free_run:
free_page((unsigned long)vcpu->run);
fail:
return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
return container_of(mn, struct kvm, mmu_notifier);
}
static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush, idx;
/*
* When ->invalidate_page runs, the linux pte has been zapped
* already but the page is still allocated until
* ->invalidate_page returns. So if we increase the sequence
* here the kvm page fault will notice if the spte can't be
* established because the page is going to be freed. If
* instead the kvm page fault establishes the spte before
* ->invalidate_page runs, kvm_unmap_hva will release it
* before returning.
*
* The sequence increase only need to be seen at spin_unlock
* time, and not at spin_lock time.
*
* Increasing the sequence after the spin_unlock would be
* unsafe because the kvm page fault could then establish the
* pte after kvm_unmap_hva returned, without noticing the page
* is going to be freed.
*/
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
kvm_arch_mmu_notifier_invalidate_page(kvm, address);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
pte_t pte)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
kvm_set_spte_hva(kvm, address, pte);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
/*
* The count increase must become visible at unlock time as no
* spte can be established without taking the mmu_lock and
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
spin_lock(&kvm->mmu_lock);
/*
* This sequence increase will notify the kvm page fault that
* the page that is going to be mapped in the spte could have
* been freed.
*/
kvm->mmu_notifier_seq++;
smp_wmb();
/*
* The above sequence increase must be visible before the
* below count decrease, which is ensured by the smp_wmb above
* in conjunction with the smp_rmb in mmu_notifier_retry().
*/
kvm->mmu_notifier_count--;
spin_unlock(&kvm->mmu_lock);
BUG_ON(kvm->mmu_notifier_count < 0);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
young = kvm_age_hva(kvm, start, end);
if (young)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return young;
}
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
young = kvm_test_age_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return young;
}
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
idx = srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_page = kvm_mmu_notifier_invalidate_page,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}
#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
return 0;
}
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
static void kvm_init_memslots_id(struct kvm *kvm)
{
int i;
struct kvm_memslots *slots = kvm->memslots;
for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
slots->id_to_index[i] = slots->memslots[i].id = i;
}
static struct kvm *kvm_create_vm(unsigned long type)
{
int r, i;
struct kvm *kvm = kvm_arch_alloc_vm();
if (!kvm)
return ERR_PTR(-ENOMEM);
r = kvm_arch_init_vm(kvm, type);
if (r)
goto out_err_no_disable;
r = hardware_enable_all();
if (r)
goto out_err_no_disable;
#ifdef CONFIG_HAVE_KVM_IRQFD
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
r = -ENOMEM;
kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots));
if (!kvm->memslots)
goto out_err_no_srcu;
/*
* Init kvm generation close to the maximum to easily test the
* code of handling generation number wrap-around.
*/
kvm->memslots->generation = -150;
kvm_init_memslots_id(kvm);
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
if (!kvm->buses[i])
goto out_err;
}
spin_lock_init(&kvm->mmu_lock);
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
INIT_LIST_HEAD(&kvm->devices);
r = kvm_init_mmu_notifier(kvm);
if (r)
goto out_err;
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
return kvm;
out_err:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
hardware_disable_all();
out_err_no_disable:
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm->buses[i]);
kvfree(kvm->memslots);
kvm_arch_free_vm(kvm);
return ERR_PTR(r);
}
/*
* Avoid using vmalloc for a small buffer.
* Should not be used when the size is statically known.
*/
void *kvm_kvzalloc(unsigned long size)
{
if (size > PAGE_SIZE)
|