/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include "kvm.h"
#include "x86_emulate.h"
#include "segment_descriptor.h"
#include "irq.h"
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static DEFINE_SPINLOCK(kvm_lock);
static LIST_HEAD(vm_list);
static cpumask_t cpus_hardware_enabled;
struct kvm_x86_ops *kvm_x86_ops;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
static __read_mostly struct preempt_ops kvm_preempt_ops;
#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
static struct kvm_stats_debugfs_item {
const char *name;
int offset;
struct dentry *dentry;
} debugfs_entries[] = {
{ "pf_fixed", STAT_OFFSET(pf_fixed) },
{ "pf_guest", STAT_OFFSET(pf_guest) },
{ "tlb_flush", STAT_OFFSET(tlb_flush) },
{ "invlpg", STAT_OFFSET(invlpg) },
{ "exits", STAT_OFFSET(exits) },
{ "io_exits", STAT_OFFSET(io_exits) },
{ "mmio_exits", STAT_OFFSET(mmio_exits) },
{ "signal_exits", STAT_OFFSET(signal_exits) },
{ "irq_window", STAT_OFFSET(irq_window_exits) },
{ "halt_exits", STAT_OFFSET(halt_exits) },
{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
{ "request_irq", STAT_OFFSET(request_irq_exits) },
{ "irq_exits", STAT_OFFSET(irq_exits) },
{ "light_exits", STAT_OFFSET(light_exits) },
{ "efer_reload", STAT_OFFSET(efer_reload) },
{ NULL }
};
static struct dentry *debugfs_dir;
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe
#ifdef CONFIG_X86_64
// LDT or TSS descriptor in the GDT. 16 bytes.
struct segment_descriptor_64 {
struct segment_descriptor s;
u32 base_higher;
u32 pad_zero;
};
#endif
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
struct segment_descriptor *d;
unsigned long table_base;
typedef unsigned long ul;
unsigned long v;
if (selector == 0)
return 0;
asm ("sgdt %0" : "=m"(gdt));
table_base = gdt.base;
if (selector & 4) { /* from ldt */
u16 ldt_selector;
asm ("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
d = (struct segment_descriptor *)(table_base + (selector & ~7));
v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
#ifdef CONFIG_X86_64
if (d->system == 0
&& (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
return v;
}
EXPORT_SYMBOL_GPL(segment_base);
static inline int valid_vcpu(int n)
{
return likely(n >= 0 && n < KVM_MAX_VCPUS);
}
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 1;
fx_save(&vcpu->host_fx_image);
fx_restore(&vcpu->guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
fx_save(&vcpu->guest_fx_image);
fx_restore(&vcpu->host_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
static void vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
mutex_lock(&vcpu->mutex);
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_x86_ops->vcpu_load(vcpu, cpu);
put_cpu();
}
static void vcpu_put(struct kvm_vcpu *vcpu)
{
preempt_disable();
kvm_x86_ops->vcpu_put(vcpu);
preempt_notifier_unregister(&vcpu->preempt_notifier);
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
static void ack_flush(void *_completed)
{
atomic_t *completed = _completed;
atomic_inc(completed);
}
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
int i, cpu, needed;
cpumask_t cpus;
struct kvm_vcpu *vcpu;
atomic_t completed;
atomic_set(&completed, 0);
cpus_clear(cpus);
needed = 0;
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
continue;
cpu = vcpu->cpu;
if (cpu != -1 && cpu != raw_smp_processor_id())
if (!cpu_isset(cpu, cpus)) {
cpu_set(cpu, cpus);
++needed;
}
}
/*
* We really want smp_call_function_mask() here. But that's not
* available, so ipi all cpus in parallel and wait for them
* to complete.
*/
for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
while (atomic_read(&completed) != needed) {
cpu_relax();
barrier();
}
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
struct page *page;
int r;
mutex_init(&vcpu->mutex);
vcpu->cpu = -1;
vcpu->mmu.root_hpa = INVALID_PAGE;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
if (!irqchip_in_kernel(kvm) || id == 0)
vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
else
vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
init_waitqueue_head(&vcpu->wq);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto fail;
}
vcpu->run = page_address(page);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto fail_free_run;
}
vcpu->pio_data = page_address(page);
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
return 0;
fail_free_pio_data:
free_page((unsigned long)vcpu->pio_data);
fail_free_run:
free_page((unsigned long)vcpu->run);
fail:
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
kvm_mmu_destroy(vcpu);
if (vcpu->apic)
hrtimer_cancel(&vcpu->apic->timer.dev);
kvm_free_apic(vcpu->apic);
free_page((unsigned long)vcpu->pio_data);
free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
static struct kvm *kvm_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
if (!kvm)
return ERR_PTR(-ENOMEM);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
INIT_LIST_HEAD(&kvm->active_mmu_pages);
kvm_io_bus_init(&kvm->mmio_bus);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
return kvm;
}
/*
* Free any memory in @free but not in @dont.
*/
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
int i;
if (!dont || free->phys_mem != dont->phys_mem)
if (free->phys_mem) {
for (i = 0; i < free->npages; ++i)
if (free->phys_mem[i])
__free_page(free->phys_mem[i]);
vfree(free->phys_mem);
}
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
vfree(free->dirty_bitmap);
free->phys_mem = NULL;
free->npages = 0;
free->dirty_bitmap = NULL;
}
static void kvm_free_physmem(struct kvm *kvm)
{
int i;
for (i = 0; i < kvm->nmemslots; ++i)
kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}
static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
{
int i;
for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
if (vcpu->pio.guest_pages[i]) {
__free_page(vcpu->pio.guest_pages[i]);
vcpu->pio.guest_pages[i] = NULL;
}
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
/*
* Unpin any mmu pages first.
*/
for (i = 0; i < KVM_MAX_VCPUS; ++i)
if (kvm->vcpus[i])
kvm_unload_vcpu_mmu(kvm->vcpus[i]);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
if (kvm->vcpus[i]) {
kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
kvm->vcpus[i] = NULL;
}
}
}
static void kvm_destroy_vm(struct kvm *kvm)
{
spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
kvm_io_bus_destroy(&kvm->pio_bus);
kvm_io_bus_destroy(&kvm->mmio_bus);
kfree(kvm->vpic);
kfree(kvm->vioapic);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
kfree(kvm);
}
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;
kvm_destroy_vm(kvm);
return 0;
}
static void inject_gp(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->inject_gp(vcpu, 0);
}
/*
* Load the pae pdptrs. Return true is they are all valid.
*/
static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
u64 *pdpt;
int ret;
struct page *page;
u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
mutex_lock(&vcpu->kvm->lock);
page = gfn_to_page(vcpu->kvm, pdpt_gfn);
if (!page) {
ret = 0;
goto out;
}
pdpt = kmap_atomic(page, KM_USER0);
memcpy(pdpte, pdpt+offset, sizeof(pdpte));
kunmap_atomic(pdpt, KM_USER0);
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
ret = 0;
goto out;
}
}
ret = 1;
memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
out:
mutex_unlock(&vcpu->kvm->lock);
return ret;
}
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
cr0, vcpu->cr0);
inject_gp(vcpu);
return;
}
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
inject_gp(vcpu);
return;
}
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
"and a clear PE flag\n");
inject_gp(vcpu);
return;
}
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->shadow_efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while PAE is disabled\n");
inject_gp(vcpu);
return;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
if (cs_l) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while CS.L == 1\n");
inject_gp(vcpu);
return;
}
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
"reserved bits\n");
inject_gp(vcpu);
return;
}
}
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->cr0 = cr0;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
mutex_unlock(&vcpu->kvm->lock);
return;
}
EXPORT_SYMBOL_GPL(set_cr0);
void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);
void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
inject_gp(vcpu);
return;
}
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE)) {
printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
"in long mode\n");
inject_gp(vcpu);
return;
}
} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
&& !load_pdptrs(vcpu, vcpu->cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
inject_gp(vcpu);
return;
}
if (cr4 & X86_CR4_VMXE) {
printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
inject_gp(vcpu);
return;
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->cr4 = cr4;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
return;
}
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS) {
printk(KERN_DEBUG
"set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
return;
}
if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
"reserved bits\n");
inject_gp(vcpu);
return;
}
} else {
if (cr3 & CR3_NONPAE_RESERVED_BITS) {
printk(KERN_DEBUG
"set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
return;
}
}
}
mutex_lock(&vcpu->kvm->lock);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
* cause trouble later on when we turn on paging anyway.)
*
* A real CPU would silently accept an invalid cr3 and would
* attempt to use it - with largely undefined (and often hard
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
inject_gp(vcpu);
else {
vcpu->cr3 = cr3;
vcpu->mmu.new_cr3(vcpu);
}
mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
|