/* * Kernel-based Virtual Machine driver for Linux * * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. * * Authors: * Avi Kivity * Yaniv Kamay * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. * */ #include "kvm.h" #include "x86.h" #include "segment_descriptor.h" #include "irq.h" #include #include #include #include #include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) unsigned long segment_base(u16 selector) { struct descriptor_table gdt; struct segment_descriptor *d; unsigned long table_base; unsigned long v; if (selector == 0) return 0; asm("sgdt %0" : "=m"(gdt)); table_base = gdt.base; if (selector & 4) { /* from ldt */ u16 ldt_selector; asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct segment_descriptor *)(table_base + (selector & ~7)); v = d->base_low | ((unsigned long)d->base_mid << 16) | ((unsigned long)d->base_high << 24); #ifdef CONFIG_X86_64 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) v |= ((unsigned long) \ ((struct segment_descriptor_64 *)d)->base_higher) << 32; #endif return v; } EXPORT_SYMBOL_GPL(segment_base); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) return vcpu->apic_base; else return vcpu->apic_base; } EXPORT_SYMBOL_GPL(kvm_get_apic_base); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) { /* TODO: reserve bits check */ if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_base(vcpu, data); else vcpu->apic_base = data; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); static void inject_gp(struct kvm_vcpu *vcpu) { kvm_x86_ops->inject_gp(vcpu, 0); } /* * Load the pae pdptrs. Return true is they are all valid. */ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; mutex_lock(&vcpu->kvm->lock); ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { ret = 0; goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; goto out; } } ret = 1; memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); out: mutex_unlock(&vcpu->kvm->lock); return ret; } void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, vcpu->cr0); inject_gp(vcpu); return; } if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); inject_gp(vcpu); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); inject_gp(vcpu); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->shadow_efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while PAE is disabled\n"); inject_gp(vcpu); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while CS.L == 1\n"); inject_gp(vcpu); return; } } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr0: #GP, pdptrs " "reserved bits\n"); inject_gp(vcpu); return; } } kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->cr0 = cr0; mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); inject_gp(vcpu); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); inject_gp(vcpu); return; } } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); inject_gp(vcpu); return; } if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); inject_gp(vcpu); return; } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->cr4 = cr4; mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { printk(KERN_DEBUG "set_cr3: #GP, pdptrs " "reserved bits\n"); inject_gp(vcpu); return; } } /* * We don't check reserved bits in nonpae mode, because * this isn't enforced, and VMware depends on this. */ } mutex_lock(&vcpu->kvm->lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would * cause trouble later on when we turn on paging anyway.) * * A real CPU would silently accept an invalid cr3 and would * attempt to use it - with largely undefined (and often hard * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) inject_gp(vcpu); else { vcpu->cr3 = cr3; vcpu->mmu.new_cr3(vcpu); } mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); inject_gp(vcpu); return; } if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->cr8 = cr8; } EXPORT_SYMBOL_GPL(set_cr8); unsigned long get_cr8(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) return kvm_lapic_get_cr8(vcpu); else return vcpu->cr8; } EXPORT_SYMBOL_GPL(get_cr8); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the * capabilities of the host cpu. */ static u32 msrs_to_save[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, }; static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; /* * Adapt set_msr() to msr_io()'s calling convention */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { return kvm_set_msr(vcpu, index, *data); } /* * Read or write a bunch of msrs. All parameters are kernel addresses. * * @return number of msrs set successfully. */ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { int i; vcpu_load(vcpu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; vcpu_put(vcpu); return i; } /* * Read or write a bunch of msrs. Parameters are user addresses. * * @return number of msrs set successfully. */ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback) { struct kvm_msrs msrs; struct kvm_msr_entry *entries; int r, n; unsigned size; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof msrs)) goto out; r = -E2BIG; if (msrs.nmsrs >= MAX_IO_MSRS) goto out; r = -ENOMEM; size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; entries = vmalloc(size); if (!entries) goto out; r = -EFAULT; if (copy_from_user(entries, user_msrs->entries, size)) goto out_free; r = n = __msr_io(vcpu, &msrs, entries, do_msr); if (r < 0) goto out_free; r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) goto out_free; r = n; out_free: vfree(entries); out: return r; } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; long r; switch (ioctl) { case KVM_GET_MSR_INDEX_LIST: { struct kvm_msr_list __user *user_msr_list = argp; struct kvm_msr_list msr_list; unsigned n; r = -EFAULT; if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; if (n < num_msrs_to_save) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save * sizeof(u32), &emulated_msrs, ARRAY_SIZE(emulated_msrs) * sizeof(u32))) goto out; r = 0; break; } default: r = -EINVAL; } out: return r; } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); } static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) { u64 efer; int i; struct kvm_cpuid_entry *e, *entry; rdmsrl(MSR_EFER, efer); entry = NULL; for (i = 0; i < vcpu->cpuid_nent; ++i) { e = &vcpu->cpuid_entries[i]; if (e->function == 0x80000001) { entry = e; break; } } if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { entry->edx &= ~(1 << 20); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries) { int r; r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) goto out; r = -EFAULT; if (copy_from_user(&vcpu->cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out; vcpu->cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); return 0; out: return r; } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { vcpu_load(vcpu); memcpy(s->regs, vcpu->apic->regs, sizeof *s); vcpu_put(vcpu); return 0; } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { vcpu_load(vcpu); memcpy(vcpu->apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); vcpu_put(vcpu); return 0; } long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; switch (ioctl) { case KVM_GET_LAPIC: { struct kvm_lapic_state lapic; memset(&lapic, 0, sizeof lapic); r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &lapic, sizeof lapic)) goto out; r = 0; break; } case KVM_SET_LAPIC: { struct kvm_lapic_state lapic; r = -EFAULT; if (copy_from_user(&lapic, argp, sizeof lapic)) goto out; r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; if (r) goto out; r = 0; break; } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; break; } case KVM_GET_MSRS: r = msr_io(vcpu, argp, kvm_get_msr, 1); break; case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); break; default: r = -EINVAL; } out: return r; } static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -1; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; mutex_lock(&kvm->lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->n_requested_mmu_pages = kvm_nr_mmu_pages; mutex_unlock(&kvm->lock); return 0; } static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { return kvm->n_alloc_mmu_pages; } /* * Set a new alias region. Aliases map a portion of physical memory into * another portion. This is useful for memory windows, for example the PC * VGA region. */ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, struct kvm_memory_alias *alias) { int r, n; struct kvm_mem_alias *p; r = -EINVAL; /* General sanity checks */ if (alias->memory_size & (PAGE_SIZE - 1)) goto out; if (alias->guest_phys_addr & (PAGE_SIZE - 1)) goto out; if (alias->slot >= KVM_ALIAS_SLOTS) goto out; if (alias->guest_phys_addr + alias->memory_size < alias->guest_phys_addr) goto out; if (alias->target_phys_addr + alias->memory_size < alias->target_phys_addr) goto out; mutex_lock(&kvm->lock); p = &kvm->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; for (n = KVM_ALIAS_SLOTS; n > 0; --n) if (kvm->aliases[n - 1].npages) break; kvm->naliases = n; kvm_mmu_zap_all(kvm); mutex_unlock(&kvm->lock); return 0; out: return r; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm), sizeof(struct kvm_ioapic_state)); break; default: r = -EINVAL; break; } return r; } static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic, sizeof(struct kvm_ioapic_state)); break; default: r = -EINVAL; break; } kvm_pic_update_irq(pic_irqchip(kvm)); return r; } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); if (r < 0) goto out; break; case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; struct kvm_userspace_memory_region kvm_userspace_mem; r = -EFAULT; if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) goto out; kvm_userspace_mem.slot = kvm_mem.slot; kvm_userspace_mem.flags = kvm_mem.flags; kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; kvm_userspace_mem.memory_size = kvm_mem.memory_size; r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_SET_MEMORY_ALIAS: { struct kvm_memory_alias alias; r = -EFAULT; if (copy_from_user(&alias, argp, sizeof alias)) goto out; r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); if (r) goto out; break; } case KVM_CREATE_IRQCHIP: r = -ENOMEM; kvm->vpic = kvm_create_pic(kvm); if (kvm->vpic) { r = kvm_ioapic_init(kvm); if (r) { kfree(kvm->vpic); kvm->vpic = NULL; goto out; } } else goto out; break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { mutex_lock(&kvm->lock); if (irq_event.irq < 16) kvm_pic_set_irq(pic_irqchip(kvm), irq_event.irq, irq_event.level); kvm_ioapic_set_irq(kvm->vioapic, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); r = 0; } break; } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; r = -EFAULT; if (copy_from_user(&chip, argp, sizeof chip)) goto out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto out; r = kvm_vm_ioctl_get_irqchip(kvm, &chip); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &chip, sizeof chip)) goto out; r = 0; break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; r = -EFAULT; if (copy_from_user(&chip, argp, sizeof chip)) goto out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto out; r = kvm_vm_ioctl_set_irqchip(kvm, &chip); if (r) goto out; r = 0; break; } default: ; } out: return r; } static __init void kvm_init_msr_list(void) { u32 dummy[2]; unsigned i, j; for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; } num_msrs_to_save = j; } __init void kvm_arch_init(void) { kvm_init_msr_list(); }