-rw-r--r--  Documentation/kvm/api.txt | 109
-rw-r--r--  arch/Kconfig | 9
-rw-r--r--  arch/ia64/include/asm/kvm.h | 1
-rw-r--r--  arch/ia64/include/asm/kvm_host.h | 1
-rw-r--r--  arch/ia64/kvm/Makefile | 2
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 19
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 5
-rw-r--r--  arch/powerpc/kvm/timing.h | 2
-rw-r--r--  arch/s390/include/asm/kvm.h | 3
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 30
-rw-r--r--  arch/s390/kvm/sigp.c | 6
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/include/asm/kvm.h | 30
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 34
-rw-r--r--  arch/x86/include/asm/svm.h | 3
-rw-r--r--  arch/x86/include/asm/thread_info.h | 7
-rw-r--r--  arch/x86/include/asm/vmx.h | 4
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/signal.c | 3
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 159
-rw-r--r--  arch/x86/kvm/i8254.c | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 44
-rw-r--r--  arch/x86/kvm/irq.h | 7
-rw-r--r--  arch/x86/kvm/lapic.c | 8
-rw-r--r--  arch/x86/kvm/mmu.c | 3
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 1
-rw-r--r--  arch/x86/kvm/svm.c | 331
-rw-r--r--  arch/x86/kvm/trace.h | 165
-rw-r--r--  arch/x86/kvm/vmx.c | 448
-rw-r--r--  arch/x86/kvm/x86.c | 550
-rw-r--r--  include/linux/kvm.h | 272
-rw-r--r--  include/linux/kvm_host.h | 53
-rw-r--r--  include/linux/user-return-notifier.h | 49
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 46
-rw-r--r--  virt/kvm/assigned-dev.c | 818
-rw-r--r--  virt/kvm/eventfd.c | 2
-rw-r--r--  virt/kvm/ioapic.c | 80
-rw-r--r--  virt/kvm/ioapic.h | 5
-rw-r--r--  virt/kvm/irq_comm.c | 231
-rw-r--r--  virt/kvm/kvm_main.c | 961
45 files changed, 2936 insertions(+), 1579 deletions(-)
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 5a4bc8cf6d04..e1a114161027 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -593,6 +593,115 @@ struct kvm_irqchip {
593 } chip; 593 } chip;
594}; 594};
595 595
5964.27 KVM_XEN_HVM_CONFIG
597
598Capability: KVM_CAP_XEN_HVM
599Architectures: x86
600Type: vm ioctl
601Parameters: struct kvm_xen_hvm_config (in)
602Returns: 0 on success, -1 on error
603
604Sets the MSR that the Xen HVM guest uses to initialize its hypercall
605page, and provides the starting address and size of the hypercall
606blobs in userspace. When the guest writes the MSR, kvm copies one
607page of a blob (32- or 64-bit, depending on the vcpu mode) to guest
608memory.
609
610struct kvm_xen_hvm_config {
611 __u32 flags;
612 __u32 msr;
613 __u64 blob_addr_32;
614 __u64 blob_addr_64;
615 __u8 blob_size_32;
616 __u8 blob_size_64;
617 __u8 pad2[30];
618};
619
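A minimal userspace sketch of programming this ioctl (vm_fd, the blob buffers,
and the chosen MSR number are illustrative assumptions, not taken from the patch):

	struct kvm_xen_hvm_config cfg = {
		.msr          = 0x40000000,	/* MSR the guest is expected to write */
		.blob_addr_32 = (__u64)(unsigned long)blob32,	/* 32-bit hypercall blob in userspace */
		.blob_size_32 = blob32_size,
		.blob_addr_64 = (__u64)(unsigned long)blob64,	/* 64-bit hypercall blob in userspace */
		.blob_size_64 = blob64_size,
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) < 0)
		perror("KVM_XEN_HVM_CONFIG");
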
6204.28 KVM_GET_CLOCK
621
622Capability: KVM_CAP_ADJUST_CLOCK
623Architectures: x86
624Type: vm ioctl
625Parameters: struct kvm_clock_data (out)
626Returns: 0 on success, -1 on error
627
628Gets the current timestamp of kvmclock as seen by the current guest. In
629conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity in scenarios
630such as migration.
631
632struct kvm_clock_data {
633 __u64 clock; /* kvmclock current value */
634 __u32 flags;
635 __u32 pad[9];
636};
637
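For example, a userspace save path might read the clock like this (vm_fd is
assumed to be the VM file descriptor):

	struct kvm_clock_data data;

	memset(&data, 0, sizeof(data));
	if (ioctl(vm_fd, KVM_GET_CLOCK, &data) < 0)
		perror("KVM_GET_CLOCK");
	/* data.clock now holds the guest-visible kvmclock value, to be carried
	 * along with the rest of the migration state */
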
6384.29 KVM_SET_CLOCK
639
640Capability: KVM_CAP_ADJUST_CLOCK
641Architectures: x86
642Type: vm ioctl
643Parameters: struct kvm_clock_data (in)
644Returns: 0 on success, -1 on error
645
646Sets the current timestamp of kvmclock to the value specified in its parameter.
647In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity in scenarios
648such as migration.
649
650struct kvm_clock_data {
651 __u64 clock; /* kvmclock current value */
652 __u32 flags;
653 __u32 pad[9];
654};
655
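On the restore side, the value captured with KVM_GET_CLOCK on the source host can
be written back as in the sketch below (saved_clock and vm_fd are assumptions):

	struct kvm_clock_data data = {
		.clock = saved_clock,	/* value read with KVM_GET_CLOCK on the source */
		.flags = 0,
	};

	if (ioctl(vm_fd, KVM_SET_CLOCK, &data) < 0)
		perror("KVM_SET_CLOCK");
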
6564.30 KVM_GET_VCPU_EVENTS
657
658Capability: KVM_CAP_VCPU_EVENTS
659Architectures: x86
660Type: vcpu ioctl
661Parameters: struct kvm_vcpu_events (out)
662Returns: 0 on success, -1 on error
663
664Gets currently pending exceptions, interrupts, and NMIs as well as related
665states of the vcpu.
666
667struct kvm_vcpu_events {
668 struct {
669 __u8 injected;
670 __u8 nr;
671 __u8 has_error_code;
672 __u8 pad;
673 __u32 error_code;
674 } exception;
675 struct {
676 __u8 injected;
677 __u8 nr;
678 __u8 soft;
679 __u8 pad;
680 } interrupt;
681 struct {
682 __u8 injected;
683 __u8 pending;
684 __u8 masked;
685 __u8 pad;
686 } nmi;
687 __u32 sipi_vector;
688 __u32 flags; /* must be zero */
689};
690
6914.31 KVM_SET_VCPU_EVENTS
692
693Capability: KVM_CAP_VCPU_EVENTS
694Architectures: x86
695Type: vcpu ioctl
696Parameters: struct kvm_vcpu_events (in)
697Returns: 0 on success, -1 on error
698
699Sets pending exceptions, interrupts, and NMIs as well as related states of the
700vcpu.
701
702See KVM_GET_VCPU_EVENTS for the data structure.
703
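A sketch of the usual read-modify-write pattern for these two ioctls (vcpu_fd is
assumed to be a vcpu file descriptor; the NMI tweak is only an example):

	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
		perror("KVM_GET_VCPU_EVENTS");

	/* example: drop a pending NMI before resuming the vcpu */
	events.nmi.pending = 0;
	events.flags = 0;		/* must be zero */

	if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
		perror("KVM_SET_VCPU_EVENTS");
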
704
5965. The kvm_run structure 7055. The kvm_run structure
597 706
598Application code obtains a pointer to the kvm_run structure by 707Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/Kconfig b/arch/Kconfig
index eef3bbb97075..d82875820a15 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -83,6 +83,13 @@ config KRETPROBES
83 def_bool y 83 def_bool y
84 depends on KPROBES && HAVE_KRETPROBES 84 depends on KPROBES && HAVE_KRETPROBES
85 85
86config USER_RETURN_NOTIFIER
87 bool
88 depends on HAVE_USER_RETURN_NOTIFIER
89 help
90 Provide a kernel-internal notification when a cpu is about to
91 switch to user mode.
92
86config HAVE_IOREMAP_PROT 93config HAVE_IOREMAP_PROT
87 bool 94 bool
88 95
@@ -132,5 +139,7 @@ config HAVE_HW_BREAKPOINT
132 select ANON_INODES 139 select ANON_INODES
133 select PERF_EVENTS 140 select PERF_EVENTS
134 141
142config HAVE_USER_RETURN_NOTIFIER
143 bool
135 144
136source "kernel/gcov/Kconfig" 145source "kernel/gcov/Kconfig"
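As a rough sketch of how a subsystem might consume the USER_RETURN_NOTIFIER
facility described in the help text above: the struct layout and the
register/unregister helpers are assumed from include/linux/user-return-notifier.h,
which this series adds but which is not shown in this section.

	#include <linux/user-return-notifier.h>

	static struct user_return_notifier example_urn;

	static void example_on_user_return(struct user_return_notifier *urn)
	{
		/* Runs on the registering cpu just before it returns to user
		 * mode (fired via TIF_USER_RETURN_NOTIFY from the exit path).
		 * Restore whatever host state was temporarily borrowed, then
		 * drop the notifier so it behaves as a one-shot. */
		user_return_notifier_unregister(urn);
	}

	static void example_borrow_cpu_state(void)
	{
		/* assumed to be called with preemption disabled, on the cpu
		 * whose return to user mode we want to hook */
		example_urn.on_user_return = example_on_user_return;
		user_return_notifier_register(&example_urn);
	}
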
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index 18a7e49abbc5..bc90c75adf67 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -60,6 +60,7 @@ struct kvm_ioapic_state {
60#define KVM_IRQCHIP_PIC_MASTER 0 60#define KVM_IRQCHIP_PIC_MASTER 0
61#define KVM_IRQCHIP_PIC_SLAVE 1 61#define KVM_IRQCHIP_PIC_SLAVE 1
62#define KVM_IRQCHIP_IOAPIC 2 62#define KVM_IRQCHIP_IOAPIC 2
63#define KVM_NR_IRQCHIPS 3
63 64
64#define KVM_CONTEXT_SIZE 8*1024 65#define KVM_CONTEXT_SIZE 8*1024
65 66
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index d9b6325a9328..a362e67e0ca6 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -475,7 +475,6 @@ struct kvm_arch {
475 struct list_head assigned_dev_head; 475 struct list_head assigned_dev_head;
476 struct iommu_domain *iommu_domain; 476 struct iommu_domain *iommu_domain;
477 int iommu_flags; 477 int iommu_flags;
478 struct hlist_head irq_ack_notifier_list;
479 478
480 unsigned long irq_sources_bitmap; 479 unsigned long irq_sources_bitmap;
481 unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; 480 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 0bb99b732908..1089b3e918ac 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -49,7 +49,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
49EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ 49EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
50 50
51common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 51common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
52 coalesced_mmio.o irq_comm.o) 52 coalesced_mmio.o irq_comm.o assigned-dev.o)
53 53
54ifeq ($(CONFIG_IOMMU_API),y) 54ifeq ($(CONFIG_IOMMU_API),y)
55common-objs += $(addprefix ../../../virt/kvm/, iommu.o) 55common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0ad09f05efa9..5fdeec5fddcf 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *host_iva, u64 *opt_handler)
124 124
125static DEFINE_SPINLOCK(vp_lock); 125static DEFINE_SPINLOCK(vp_lock);
126 126
127void kvm_arch_hardware_enable(void *garbage) 127int kvm_arch_hardware_enable(void *garbage)
128{ 128{
129 long status; 129 long status;
130 long tmp_base; 130 long tmp_base;
@@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garbage)
137 slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT); 137 slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
138 local_irq_restore(saved_psr); 138 local_irq_restore(saved_psr);
139 if (slot < 0) 139 if (slot < 0)
140 return; 140 return -EINVAL;
141 141
142 spin_lock(&vp_lock); 142 spin_lock(&vp_lock);
143 status = ia64_pal_vp_init_env(kvm_vsa_base ? 143 status = ia64_pal_vp_init_env(kvm_vsa_base ?
@@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garbage)
145 __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base); 145 __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
146 if (status != 0) { 146 if (status != 0) {
147 printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n"); 147 printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
148 return ; 148 return -EINVAL;
149 } 149 }
150 150
151 if (!kvm_vsa_base) { 151 if (!kvm_vsa_base) {
@@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garbage)
154 } 154 }
155 spin_unlock(&vp_lock); 155 spin_unlock(&vp_lock);
156 ia64_ptr_entry(0x3, slot); 156 ia64_ptr_entry(0x3, slot);
157
158 return 0;
157} 159}
158 160
159void kvm_arch_hardware_disable(void *garbage) 161void kvm_arch_hardware_disable(void *garbage)
@@ -851,8 +853,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
851 r = 0; 853 r = 0;
852 switch (chip->chip_id) { 854 switch (chip->chip_id) {
853 case KVM_IRQCHIP_IOAPIC: 855 case KVM_IRQCHIP_IOAPIC:
854 memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm), 856 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
855 sizeof(struct kvm_ioapic_state));
856 break; 857 break;
857 default: 858 default:
858 r = -EINVAL; 859 r = -EINVAL;
@@ -868,9 +869,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
868 r = 0; 869 r = 0;
869 switch (chip->chip_id) { 870 switch (chip->chip_id) {
870 case KVM_IRQCHIP_IOAPIC: 871 case KVM_IRQCHIP_IOAPIC:
871 memcpy(ioapic_irqchip(kvm), 872 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
872 &chip->chip.ioapic,
873 sizeof(struct kvm_ioapic_state));
874 break; 873 break;
875 default: 874 default:
876 r = -EINVAL; 875 r = -EINVAL;
@@ -944,7 +943,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
944{ 943{
945 struct kvm *kvm = filp->private_data; 944 struct kvm *kvm = filp->private_data;
946 void __user *argp = (void __user *)arg; 945 void __user *argp = (void __user *)arg;
947 int r = -EINVAL; 946 int r = -ENOTTY;
948 947
949 switch (ioctl) { 948 switch (ioctl) {
950 case KVM_SET_MEMORY_REGION: { 949 case KVM_SET_MEMORY_REGION: {
@@ -985,10 +984,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
985 goto out; 984 goto out;
986 if (irqchip_in_kernel(kvm)) { 985 if (irqchip_in_kernel(kvm)) {
987 __s32 status; 986 __s32 status;
988 mutex_lock(&kvm->irq_lock);
989 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 987 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
990 irq_event.irq, irq_event.level); 988 irq_event.irq, irq_event.level);
991 mutex_unlock(&kvm->irq_lock);
992 if (ioctl == KVM_IRQ_LINE_STATUS) { 989 if (ioctl == KVM_IRQ_LINE_STATUS) {
993 irq_event.status = status; 990 irq_event.status = status;
994 if (copy_to_user(argp, &irq_event, 991 if (copy_to_user(argp, &irq_event,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2a4551f78f60..5902bbc2411e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
78 return r; 78 return r;
79} 79}
80 80
81void kvm_arch_hardware_enable(void *garbage) 81int kvm_arch_hardware_enable(void *garbage)
82{ 82{
83 return 0;
83} 84}
84 85
85void kvm_arch_hardware_disable(void *garbage) 86void kvm_arch_hardware_disable(void *garbage)
@@ -421,7 +422,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
421 422
422 switch (ioctl) { 423 switch (ioctl) {
423 default: 424 default:
424 r = -EINVAL; 425 r = -ENOTTY;
425 } 426 }
426 427
427 return r; 428 return r;
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 806ef67868bd..8167d42a776f 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -51,7 +51,7 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
51 51
52 /* The BUILD_BUG_ON below breaks in funny ways, commented out 52 /* The BUILD_BUG_ON below breaks in funny ways, commented out
53 * for now ... -BenH 53 * for now ... -BenH
54 BUILD_BUG_ON(__builtin_constant_p(type)); 54 BUILD_BUG_ON(!__builtin_constant_p(type));
55 */ 55 */
56 switch (type) { 56 switch (type) {
57 case EXT_INTR_EXITS: 57 case EXT_INTR_EXITS:
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 3dfcaeb5d7f4..82b32a100c7d 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -1,6 +1,5 @@
1#ifndef __LINUX_KVM_S390_H 1#ifndef __LINUX_KVM_S390_H
2#define __LINUX_KVM_S390_H 2#define __LINUX_KVM_S390_H
3
4/* 3/*
5 * asm-s390/kvm.h - KVM s390 specific structures and definitions 4 * asm-s390/kvm.h - KVM s390 specific structures and definitions
6 * 5 *
@@ -15,6 +14,8 @@
15 */ 14 */
16#include <linux/types.h> 15#include <linux/types.h>
17 16
17#define __KVM_S390
18
18/* for KVM_GET_REGS and KVM_SET_REGS */ 19/* for KVM_GET_REGS and KVM_SET_REGS */
19struct kvm_regs { 20struct kvm_regs {
20 /* general purpose regs for s390 */ 21 /* general purpose regs for s390 */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 07ced89740d7..f8bcaefd7d34 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -74,9 +74,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
74static unsigned long long *facilities; 74static unsigned long long *facilities;
75 75
76/* Section: not file related */ 76/* Section: not file related */
77void kvm_arch_hardware_enable(void *garbage) 77int kvm_arch_hardware_enable(void *garbage)
78{ 78{
79 /* every s390 is virtualization enabled ;-) */ 79 /* every s390 is virtualization enabled ;-) */
80 return 0;
80} 81}
81 82
82void kvm_arch_hardware_disable(void *garbage) 83void kvm_arch_hardware_disable(void *garbage)
@@ -116,10 +117,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
116 117
117int kvm_dev_ioctl_check_extension(long ext) 118int kvm_dev_ioctl_check_extension(long ext)
118{ 119{
120 int r;
121
119 switch (ext) { 122 switch (ext) {
123 case KVM_CAP_S390_PSW:
124 r = 1;
125 break;
120 default: 126 default:
121 return 0; 127 r = 0;
122 } 128 }
129 return r;
123} 130}
124 131
125/* Section: vm related */ 132/* Section: vm related */
@@ -150,7 +157,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
150 break; 157 break;
151 } 158 }
152 default: 159 default:
153 r = -EINVAL; 160 r = -ENOTTY;
154 } 161 }
155 162
156 return r; 163 return r;
@@ -419,8 +426,10 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
419 vcpu_load(vcpu); 426 vcpu_load(vcpu);
420 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) 427 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
421 rc = -EBUSY; 428 rc = -EBUSY;
422 else 429 else {
423 vcpu->arch.sie_block->gpsw = psw; 430 vcpu->run->psw_mask = psw.mask;
431 vcpu->run->psw_addr = psw.addr;
432 }
424 vcpu_put(vcpu); 433 vcpu_put(vcpu);
425 return rc; 434 return rc;
426} 435}
@@ -508,9 +517,6 @@ rerun_vcpu:
508 517
509 switch (kvm_run->exit_reason) { 518 switch (kvm_run->exit_reason) {
510 case KVM_EXIT_S390_SIEIC: 519 case KVM_EXIT_S390_SIEIC:
511 vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
512 vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
513 break;
514 case KVM_EXIT_UNKNOWN: 520 case KVM_EXIT_UNKNOWN:
515 case KVM_EXIT_INTR: 521 case KVM_EXIT_INTR:
516 case KVM_EXIT_S390_RESET: 522 case KVM_EXIT_S390_RESET:
@@ -519,6 +525,9 @@ rerun_vcpu:
519 BUG(); 525 BUG();
520 } 526 }
521 527
528 vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
529 vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
530
522 might_fault(); 531 might_fault();
523 532
524 do { 533 do {
@@ -538,8 +547,6 @@ rerun_vcpu:
538 /* intercept cannot be handled in-kernel, prepare kvm-run */ 547 /* intercept cannot be handled in-kernel, prepare kvm-run */
539 kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; 548 kvm_run->exit_reason = KVM_EXIT_S390_SIEIC;
540 kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode; 549 kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
541 kvm_run->s390_sieic.mask = vcpu->arch.sie_block->gpsw.mask;
542 kvm_run->s390_sieic.addr = vcpu->arch.sie_block->gpsw.addr;
543 kvm_run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; 550 kvm_run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
544 kvm_run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; 551 kvm_run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
545 rc = 0; 552 rc = 0;
@@ -551,6 +558,9 @@ rerun_vcpu:
551 rc = 0; 558 rc = 0;
552 } 559 }
553 560
561 kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
562 kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
563
554 if (vcpu->sigset_active) 564 if (vcpu->sigset_active)
555 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 565 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
556 566
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 40c8c6748cfe..15ee1111de58 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -188,9 +188,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
188 188
189 /* make sure that the new value is valid memory */ 189 /* make sure that the new value is valid memory */
190 address = address & 0x7fffe000u; 190 address = address & 0x7fffe000u;
191 if ((copy_from_guest(vcpu, &tmp, 191 if ((copy_from_user(&tmp, (void __user *)
192 (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || 192 (address + vcpu->arch.sie_block->gmsor) , 1)) ||
193 (copy_from_guest(vcpu, &tmp, (u64) (address + 193 (copy_from_user(&tmp, (void __user *)(address +
194 vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { 194 vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) {
195 *reg |= SIGP_STAT_INVALID_PARAMETER; 195 *reg |= SIGP_STAT_INVALID_PARAMETER;
196 return 1; /* invalid parameter */ 196 return 1; /* invalid parameter */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 178084b4377c..1b2182b4d5c8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
51 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_HW_BREAKPOINT 52 select HAVE_HW_BREAKPOINT
53 select HAVE_ARCH_KMEMCHECK 53 select HAVE_ARCH_KMEMCHECK
54 select HAVE_USER_RETURN_NOTIFIER
54 55
55config OUTPUT_FORMAT 56config OUTPUT_FORMAT
56 string 57 string
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..950df434763f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,31 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* for KVM_GET/SET_VCPU_EVENTS */
258struct kvm_vcpu_events {
259 struct {
260 __u8 injected;
261 __u8 nr;
262 __u8 has_error_code;
263 __u8 pad;
264 __u32 error_code;
265 } exception;
266 struct {
267 __u8 injected;
268 __u8 nr;
269 __u8 soft;
270 __u8 pad;
271 } interrupt;
272 struct {
273 __u8 injected;
274 __u8 pending;
275 __u8 masked;
276 __u8 pad;
277 } nmi;
278 __u32 sipi_vector;
279 __u32 flags;
280 __u32 reserved[10];
281};
282
253#endif /* _ASM_X86_KVM_H */ 283#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
129 u8 seg_override; 129 u8 seg_override;
130 unsigned int d; 130 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 131 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 132 unsigned long eip, eip_orig;
133 /* modrm */ 133 /* modrm */
134 u8 modrm; 134 u8 modrm;
135 u8 modrm_mod; 135 u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 354 unsigned int time_offset;
355 struct page *time_page; 355 struct page *time_page;
356 356
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 357 bool nmi_pending;
359 bool nmi_injected; 358 bool nmi_injected;
360 359
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 370 u64 mcg_status;
372 u64 mcg_ctl; 371 u64 mcg_ctl;
373 u64 *mce_banks; 372 u64 *mce_banks;
373
374 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs;
376 unsigned long singlestep_rip;
374}; 377};
375 378
376struct kvm_mem_alias { 379struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
415}; 419};
416 420
417struct kvm_vm_stat { 421struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
461struct kvm_x86_ops { 465struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 466 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 467 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 468 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 469 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 470 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 471 int (*hardware_setup)(void); /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 481 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 482 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 483
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 484 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 485 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 486 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 487 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 488 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
506 510
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 511 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 512
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 513 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 514 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 515 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 516 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 517 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 523 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 524 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 525 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
526 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
527 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 528 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 529 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 530 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 574#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 575#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 576#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 577int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 578 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 591
586struct x86_emulate_ctxt; 592struct x86_emulate_ctxt;
587 593
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 594int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 595 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 596int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 597 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 598 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 599void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 622int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 623int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 624
625unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
626void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
627
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 628void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 629void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 811int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
803int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 812int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
804 813
814void kvm_define_shared_msr(unsigned index, u32 msr);
815void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
816
805#endif /* _ASM_X86_KVM_HOST_H */ 817#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 108#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 109#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 112#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 113#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 114#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
142 144
143/* Only used for 64 bit */ 145/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 146#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 147 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
148 _TIF_USER_RETURN_NOTIFY)
146 149
147/* flags to check in __switch_to() */ 150/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 151#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 152 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 153
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 154#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 155#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 156
154#define PREEMPT_ACTIVE 0x10000000 157#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
59#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 60
60 61
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 62#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 145 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 146 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 147 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
148 PLE_GAP = 0x00004020,
149 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 150 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 151 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 152 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 251#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 252#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 253#define EXIT_REASON_MWAIT_INSTRUCTION 36
254#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 255#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 257#define EXIT_REASON_APIC_ACCESS 44
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 744508e7cfdd..5e2ba634ea15 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
12#include <trace/events/power.h> 13#include <trace/events/power.h>
13#include <linux/hw_breakpoint.h> 14#include <linux/hw_breakpoint.h>
14#include <asm/system.h> 15#include <asm/system.h>
@@ -209,6 +210,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
209 */ 210 */
210 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 211 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
211 } 212 }
213 propagate_user_return_notify(prev_p, next_p);
212} 214}
213 215
214int sys_fork(struct pt_regs *regs) 216int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index fbf3b07c8567..74fe6d86dc5d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -863,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
863 if (current->replacement_session_keyring) 864 if (current->replacement_session_keyring)
864 key_replace_session_keyring(); 865 key_replace_session_keyring();
865 } 866 }
867 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
868 fire_user_return_notifiers();
866 869
867#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
868 clear_thread_flag(TIF_IRET); 871 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..fab7440c9bb2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -688,10 +688,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 688 struct kvm_vcpu *vcpu;
689 int i; 689 int i;
690 690
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 691 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 693
696 /* 694 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 695 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
42 * We are dropping lock while calling ack notifiers since ack
43 * notifier callbacks for assigned devices call into PIC recursively.
44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
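The smp_rmb() added to irqchip_in_kernel() only has an effect if the path that creates the in-kernel PIC publishes the pointer returned by pic_irqchip() behind a matching write barrier. That creation path is not part of the hunks shown here, so the following is only a minimal sketch of the assumed pairing; the field name kvm->arch.vpic and the error handling are illustrative.

/* Assumed publisher side pairing with the smp_rmb() in irqchip_in_kernel();
 * not part of this patch, field names are illustrative. */
int example_create_irqchip(struct kvm *kvm)
{
	struct kvm_pic *vpic = kvm_create_pic(kvm);

	if (!vpic)
		return -ENOMEM;

	/* All PIC state is initialized before the pointer becomes visible. */
	smp_wmb();
	kvm->arch.vpic = vpic;	/* readers of pic_irqchip() now see a fully set up PIC */
	return 0;
}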
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..cd60c0bd1b32 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 470 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 471 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 472 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 473 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 475}
480 476
481static void apic_send_ipi(struct kvm_lapic *apic) 477static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 500 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 501 irq.vector);
506 502
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 503 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 504}
511 505
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 506static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2789 if (r)
2790 goto out; 2790 goto out;
2791 2791
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2792 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2793
2794 switch (er) { 2794 switch (er) {
2795 case EMULATE_DONE: 2795 case EMULATE_DONE:
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2800 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2803 vcpu->run->internal.ndata = 0;
2803 return 0; 2804 return 0;
2804 default: 2805 default:
2805 BUG(); 2806 BUG();
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..a6017132fba8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..3de0b37ec038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *svm_data; 319 struct svm_cpu_data *svm_data;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 svm_data = per_cpu(svm_data, me); 334 svm_data = per_cpu(svm_data, me);
333 335
334 if (!svm_data) { 336 if (!svm_data) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 svm_data->asid_generation = 1; 342 svm_data->asid_generation = 1;
@@ -345,11 +347,12 @@ static void svm_hardware_enable(void *garbage)
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA,
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 353 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
354
355 return 0;
353} 356}
354 357
355static void svm_cpu_uninit(int cpu) 358static void svm_cpu_uninit(int cpu)
@@ -476,7 +479,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 479 kvm_enable_efer_bits(EFER_SVME);
477 } 480 }
478 481
479 for_each_online_cpu(cpu) { 482 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 483 r = svm_cpu_init(cpu);
481 if (r) 484 if (r)
482 goto err; 485 goto err;
@@ -510,7 +513,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 513{
511 int cpu; 514 int cpu;
512 515
513 for_each_online_cpu(cpu) 516 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 517 svm_cpu_uninit(cpu);
515 518
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 519 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +628,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 628 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 629 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 630
628 /* 631 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 632 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 633 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 634 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
635 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
636
633 save->cr4 = X86_CR4_PAE; 637 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 638 /* rdx = ?? */
635 639
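The new comment states that the guest-visible cr0 keeps NW, CD and ET while svm_set_cr0() forces PG and WP on and clears NW and CD in the VMCB copy. svm_set_cr0() itself is not shown in this diff, so the sketch below only illustrates that described split and leaves out NPT, TS and the other details the real function handles.

/* Illustration of the cr0 split described in the comment above; not the
 * actual svm_set_cr0(). */
static void example_sync_cr0(struct vcpu_svm *svm, unsigned long cr0)
{
	svm->vcpu.arch.cr0 = cr0;		/* value the guest reads back */

	cr0 |= X86_CR0_PG | X86_CR0_WP;		/* shadow paging stays enabled */
	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);	/* keep caching on for the host */

	svm->vmcb->save.cr0 = cr0;		/* value the hardware actually uses */
}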
@@ -644,8 +648,6 @@ static void init_vmcb(struct vcpu_svm *svm)
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| 648 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK); 649 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 650 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 651 save->cr3 = 0;
650 save->cr4 = 0; 652 save->cr4 = 0;
651 } 653 }
@@ -654,6 +656,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 656 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 657 svm->vcpu.arch.hflags = 0;
656 658
659 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
660 control->pause_filter_count = 3000;
661 control->intercept |= (1ULL << INTERCEPT_PAUSE);
662 }
663
657 enable_gif(svm); 664 enable_gif(svm);
658} 665}
659 666
@@ -758,14 +765,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 765 int i;
759 766
760 if (unlikely(cpu != vcpu->cpu)) { 767 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 768 u64 delta;
762 769
763 /* 770 /*
764 * Make sure that the guest sees a monotonically 771 * Make sure that the guest sees a monotonically
765 * increasing TSC. 772 * increasing TSC.
766 */ 773 */
767 rdtscll(tsc_this); 774 delta = vcpu->arch.host_tsc - native_read_tsc();
768 delta = vcpu->arch.host_tsc - tsc_this;
769 svm->vmcb->control.tsc_offset += delta; 775 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 776 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 777 svm->nested.hsave->control.tsc_offset += delta;
@@ -787,7 +793,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 793 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 794 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 795
790 rdtscll(vcpu->arch.host_tsc); 796 vcpu->arch.host_tsc = native_read_tsc();
791} 797}
792 798
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 799static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1051,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1051 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1052 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1053
1048 if (vcpu->arch.singlestep) 1054 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1055 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1056
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1057 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1066,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1066 vcpu->guest_debug = 0;
1061} 1067}
1062 1068
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1069static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1070{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1071 struct vcpu_svm *svm = to_svm(vcpu);
1067 1072
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1073 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1074 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1075 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1076 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1077
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1078 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1079}
1084 1080
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1081static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1180,7 +1176,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1180 } 1176 }
1181} 1177}
1182 1178
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1179static int pf_interception(struct vcpu_svm *svm)
1184{ 1180{
1185 u64 fault_address; 1181 u64 fault_address;
1186 u32 error_code; 1182 u32 error_code;
@@ -1194,17 +1190,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1190 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1191}
1196 1192
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1193static int db_interception(struct vcpu_svm *svm)
1198{ 1194{
1195 struct kvm_run *kvm_run = svm->vcpu.run;
1196
1199 if (!(svm->vcpu.guest_debug & 1197 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1198 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1199 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1200 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1201 return 1;
1204 } 1202 }
1205 1203
1206 if (svm->vcpu.arch.singlestep) { 1204 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1205 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1206 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1207 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1208 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1221,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1221 return 1;
1224} 1222}
1225 1223
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1224static int bp_interception(struct vcpu_svm *svm)
1227{ 1225{
1226 struct kvm_run *kvm_run = svm->vcpu.run;
1227
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1228 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1230 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1231 return 0;
1232} 1232}
1233 1233
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1234static int ud_interception(struct vcpu_svm *svm)
1235{ 1235{
1236 int er; 1236 int er;
1237 1237
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1238 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1239 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1241 return 1;
1242} 1242}
1243 1243
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1244static int nm_interception(struct vcpu_svm *svm)
1245{ 1245{
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) 1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1251,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1251 return 1; 1251 return 1;
1252} 1252}
1253 1253
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1254static int mc_interception(struct vcpu_svm *svm)
1255{ 1255{
1256 /* 1256 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1257 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1264,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1264 return 1;
1265} 1265}
1266 1266
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1267static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1268{
1269 struct kvm_run *kvm_run = svm->vcpu.run;
1270
1269 /* 1271 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1272 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1273 * so reinitialize it.
@@ -1277,7 +1279,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1279 return 0;
1278} 1280}
1279 1281
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1282static int io_interception(struct vcpu_svm *svm)
1281{ 1283{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1284 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1285 int size, in, string;
@@ -1291,7 +1293,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1293
1292 if (string) { 1294 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1295 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1296 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1297 return 0;
1296 return 1; 1298 return 1;
1297 } 1299 }
@@ -1301,33 +1303,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1303 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1304
1303 skip_emulated_instruction(&svm->vcpu); 1305 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1306 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1307}
1306 1308
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1309static int nmi_interception(struct vcpu_svm *svm)
1308{ 1310{
1309 return 1; 1311 return 1;
1310} 1312}
1311 1313
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1314static int intr_interception(struct vcpu_svm *svm)
1313{ 1315{
1314 ++svm->vcpu.stat.irq_exits; 1316 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1317 return 1;
1316} 1318}
1317 1319
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1320static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1321{
1320 return 1; 1322 return 1;
1321} 1323}
1322 1324
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1325static int halt_interception(struct vcpu_svm *svm)
1324{ 1326{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1327 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1328 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1329 return kvm_emulate_halt(&svm->vcpu);
1328} 1330}
1329 1331
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1332static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1333{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1334 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1335 skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1380,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1380
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1381 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1382
1381 if (nested_svm_exit_handled(svm)) { 1383 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1384 /*
1385 * The #vmexit can't be emulated here directly because this
1386	 * code path runs with irqs and preemption disabled. A
1387	 * #vmexit emulation might sleep. Only signal a request for
1388 * the #vmexit here.
1389 */
1390 svm->nested.exit_required = true;
1391 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1392 return 1;
1384 } 1393 }
1385 1394
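The comment just above explains why the nested #vmexit cannot be emulated on this atomic path; the request is consumed later in handle_exit() and checked once more in svm_vcpu_run(), both further down in this file. A condensed sketch of that handshake, with hypothetical function names standing in for the scattered call sites:

/* Condensed view of the exit_required handshake; the real request and
 * consume sides live in nested_svm_intr() and handle_exit(). */
static int example_request_nested_vmexit(struct vcpu_svm *svm)
{
	svm->nested.exit_required = true;	/* atomic context: only record the request */
	return 1;
}

static int example_consume_nested_vmexit(struct vcpu_svm *svm)
{
	if (unlikely(svm->nested.exit_required)) {
		nested_svm_vmexit(svm);		/* may sleep, safe outside the atomic path */
		svm->nested.exit_required = false;
		return 1;
	}
	return 0;
}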
@@ -1390,10 +1399,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1399{
1391 struct page *page; 1400 struct page *page;
1392 1401
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1402 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1403 if (is_error_page(page))
1398 goto error; 1404 goto error;
1399 1405
@@ -1532,14 +1538,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1538 }
1533 default: { 1539 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1540 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1541 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1542 vmexit = NESTED_EXIT_DONE;
1538 } 1543 }
1539 } 1544 }
1540 1545
1541 if (vmexit == NESTED_EXIT_DONE) { 1546 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1547 nested_svm_vmexit(svm);
1544 } 1548 }
1545 1549
@@ -1584,6 +1588,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1588 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1589 struct vmcb *vmcb = svm->vmcb;
1586 1590
1591 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1592 vmcb->control.exit_info_1,
1593 vmcb->control.exit_info_2,
1594 vmcb->control.exit_int_info,
1595 vmcb->control.exit_int_info_err);
1596
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1597 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1598 if (!nested_vmcb)
1589 return 1; 1599 return 1;
@@ -1617,6 +1627,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1627 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1628 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1629 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1630
1631 /*
1632 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1633 * to make sure that we do not lose injected events. So check event_inj
1634 * here and copy it to exit_int_info if it is valid.
1635	 * Exit_int_info and event_inj can't both be valid because the case
1636 * below only happens on a VMRUN instruction intercept which has
1637 * no valid exit_int_info set.
1638 */
1639 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1640 struct vmcb_control_area *nc = &nested_vmcb->control;
1641
1642 nc->exit_int_info = vmcb->control.event_inj;
1643 nc->exit_int_info_err = vmcb->control.event_inj_err;
1644 }
1645
1620 nested_vmcb->control.tlb_ctl = 0; 1646 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1647 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1648 nested_vmcb->control.event_inj_err = 0;
@@ -1628,10 +1654,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1654 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1655 copy_vmcb_control_area(vmcb, hsave);
1630 1656
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1657 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1658 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1659
@@ -1702,6 +1724,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1724 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1725 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1726
1727 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1728 nested_vmcb->save.rip,
1729 nested_vmcb->control.int_ctl,
1730 nested_vmcb->control.event_inj,
1731 nested_vmcb->control.nested_ctl);
1732
1705 /* Clear internal status */ 1733 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1734 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1735 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1817,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1817 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1818
1791 force_new_asid(&svm->vcpu); 1819 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1820 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1821 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1822 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1823 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1824 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1825
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1826 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1827 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1828 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1829 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1830 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1831
@@ -1837,7 +1852,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1852 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1853}
1839 1854
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1855static int vmload_interception(struct vcpu_svm *svm)
1841{ 1856{
1842 struct vmcb *nested_vmcb; 1857 struct vmcb *nested_vmcb;
1843 1858
@@ -1857,7 +1872,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1872 return 1;
1858} 1873}
1859 1874
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1875static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1876{
1862 struct vmcb *nested_vmcb; 1877 struct vmcb *nested_vmcb;
1863 1878
@@ -1877,10 +1892,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1892 return 1;
1878} 1893}
1879 1894
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1895static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1896{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1897 if (nested_svm_check_permissions(svm))
1885 return 1; 1898 return 1;
1886 1899
@@ -1907,7 +1920,7 @@ failed:
1907 return 1; 1920 return 1;
1908} 1921}
1909 1922
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1923static int stgi_interception(struct vcpu_svm *svm)
1911{ 1924{
1912 if (nested_svm_check_permissions(svm)) 1925 if (nested_svm_check_permissions(svm))
1913 return 1; 1926 return 1;
@@ -1920,7 +1933,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1933 return 1;
1921} 1934}
1922 1935
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int clgi_interception(struct vcpu_svm *svm)
1924{ 1937{
1925 if (nested_svm_check_permissions(svm)) 1938 if (nested_svm_check_permissions(svm))
1926 return 1; 1939 return 1;
@@ -1937,10 +1950,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1950 return 1;
1938} 1951}
1939 1952
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1953static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1954{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1955 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1956
1957 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1958 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1959
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1960 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1961 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1965,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1965 return 1;
1951} 1966}
1952 1967
1953static int invalid_op_interception(struct vcpu_svm *svm, 1968static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run)
1955{ 1969{
1970 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1971
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1972 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 1973 return 1;
1958} 1974}
1959 1975
1960static int task_switch_interception(struct vcpu_svm *svm, 1976static int invalid_op_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run) 1977{
1978 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1979 return 1;
1980}
1981
1982static int task_switch_interception(struct vcpu_svm *svm)
1962{ 1983{
1963 u16 tss_selector; 1984 u16 tss_selector;
1964 int reason; 1985 int reason;
@@ -2008,14 +2029,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2029 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2030}
2010 2031
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2032static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2033{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2034 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2035 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2036 return 1;
2016} 2037}
2017 2038
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2039static int iret_interception(struct vcpu_svm *svm)
2019{ 2040{
2020 ++svm->vcpu.stat.nmi_window_exits; 2041 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2042 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2044,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2023 return 1; 2044 return 1;
2024} 2045}
2025 2046
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2047static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2048{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2049 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2050 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2051 return 1;
2031} 2052}
2032 2053
2033static int emulate_on_interception(struct vcpu_svm *svm, 2054static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2055{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2056 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2057 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2058 return 1;
2039} 2059}
2040 2060
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2061static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2062{
2063 struct kvm_run *kvm_run = svm->vcpu.run;
2064
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2065 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2066 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2067 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2068 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2069 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2070 return 1;
@@ -2128,7 +2150,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2150 return 0;
2129} 2151}
2130 2152
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2153static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2154{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2155 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2156 u64 data;
@@ -2221,7 +2243,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2243 return 0;
2222} 2244}
2223 2245
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2246static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2247{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2248 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2249 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2259,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2237 return 1; 2259 return 1;
2238} 2260}
2239 2261
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2262static int msr_interception(struct vcpu_svm *svm)
2241{ 2263{
2242 if (svm->vmcb->control.exit_info_1) 2264 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2265 return wrmsr_interception(svm);
2244 else 2266 else
2245 return rdmsr_interception(svm, kvm_run); 2267 return rdmsr_interception(svm);
2246} 2268}
2247 2269
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2270static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2271{
2272 struct kvm_run *kvm_run = svm->vcpu.run;
2273
2251 svm_clear_vintr(svm); 2274 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2275 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2276 /*
@@ -2265,8 +2288,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2288 return 1;
2266} 2289}
2267 2290
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2291static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2292{
2293 kvm_vcpu_on_spin(&(svm->vcpu));
2294 return 1;
2295}
2296
2297static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2298 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2299 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2300 [SVM_EXIT_READ_CR4] = emulate_on_interception,
@@ -2301,6 +2329,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2329 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2330 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2331 [SVM_EXIT_INVD] = emulate_on_interception,
2332 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2333 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2334 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2335 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2343,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2343 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2344 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2345 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2346 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2347 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2348 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2349 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2350 [SVM_EXIT_NPF] = pf_interception,
2322}; 2351};
2323 2352
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2353static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2354{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2355 struct vcpu_svm *svm = to_svm(vcpu);
2356 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2357 u32 exit_code = svm->vmcb->control.exit_code;
2328 2358
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2359 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2360
2361 if (unlikely(svm->nested.exit_required)) {
2362 nested_svm_vmexit(svm);
2363 svm->nested.exit_required = false;
2364
2365 return 1;
2366 }
2367
2331 if (is_nested(svm)) { 2368 if (is_nested(svm)) {
2332 int vmexit; 2369 int vmexit;
2333 2370
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2371 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2372 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2373 svm->vmcb->control.exit_info_2,
2374 svm->vmcb->control.exit_int_info,
2375 svm->vmcb->control.exit_int_info_err);
2337 2376
2338 vmexit = nested_svm_exit_special(svm); 2377 vmexit = nested_svm_exit_special(svm);
2339 2378
@@ -2383,7 +2422,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2422 return 0;
2384 } 2423 }
2385 2424
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2425 return svm_exit_handlers[exit_code](svm);
2387} 2426}
2388 2427
2389static void reload_tss(struct kvm_vcpu *vcpu) 2428static void reload_tss(struct kvm_vcpu *vcpu)
@@ -2460,20 +2499,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2499 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2500}
2462 2501
2502static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2503{
2504 struct vcpu_svm *svm = to_svm(vcpu);
2505
2506 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2507}
2508
2509static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2510{
2511 struct vcpu_svm *svm = to_svm(vcpu);
2512
2513 if (masked) {
2514 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2515 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2516 } else {
2517 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2518 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
2519 }
2520}
2521
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2522static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2523{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2524 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2525 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2526 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2527
2469 gif_set(svm) && 2528 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2529 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2530 return 0;
2531
2532 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2533
2534 if (is_nested(svm))
2535 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2536
2537 return ret;
2471} 2538}
2472 2539
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2540static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2541{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2542 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2543
2478 nested_svm_intr(svm); 2544 nested_svm_intr(svm);
2479 2545
@@ -2498,7 +2564,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2498	 /* Something prevents NMI from being injected. Single step over 2564	 /* Something prevents NMI from being injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2565 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2566 shadow) */
2501 vcpu->arch.singlestep = true; 2567 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2568 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2569 update_db_intercept(vcpu);
2504} 2570}
@@ -2588,13 +2654,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2654#define R "e"
2589#endif 2655#endif
2590 2656
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2657static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2658{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2659 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2660 u16 fs_selector;
2595 u16 gs_selector; 2661 u16 gs_selector;
2596 u16 ldt_selector; 2662 u16 ldt_selector;
2597 2663
2664 /*
2665 * A vmexit emulation is required before the vcpu can be executed
2666 * again.
2667 */
2668 if (unlikely(svm->nested.exit_required))
2669 return;
2670
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2671 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2672 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2673 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 2966 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 2967 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 2968 .nmi_allowed = svm_nmi_allowed,
2969 .get_nmi_mask = svm_get_nmi_mask,
2970 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 2971 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 2972 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 2973 .update_cr8_intercept = update_cr8_intercept,
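Every entry in svm_exit_handlers[] now takes only the vcpu_svm pointer and fetches struct kvm_run from svm->vcpu.run on demand, as bp_interception() and cr8_write_interception() above do. A hypothetical handler written against the updated table signature, purely for illustration:

/* Hypothetical handler following the new svm_exit_handlers[] signature;
 * the exit reason is only an example. */
static int example_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;	/* fetched on demand now */

	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
	return 0;	/* 0 = drop back to userspace, 1 = handled in kernel */
}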
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 349 __entry->coalesced ? " (coalesced)" : "")
350); 350);
351 351
352/*
353 * Tracepoint for nested VMRUN
354 */
355TRACE_EVENT(kvm_nested_vmrun,
356 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
357 __u32 event_inj, bool npt),
358 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
359
360 TP_STRUCT__entry(
361 __field( __u64, rip )
362 __field( __u64, vmcb )
363 __field( __u64, nested_rip )
364 __field( __u32, int_ctl )
365 __field( __u32, event_inj )
366 __field( bool, npt )
367 ),
368
369 TP_fast_assign(
370 __entry->rip = rip;
371 __entry->vmcb = vmcb;
372 __entry->nested_rip = nested_rip;
373 __entry->int_ctl = int_ctl;
374 __entry->event_inj = event_inj;
375 __entry->npt = npt;
376 ),
377
378 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
379 "event_inj: 0x%08x npt: %s\n",
380 __entry->rip, __entry->vmcb, __entry->nested_rip,
381 __entry->int_ctl, __entry->event_inj,
382 __entry->npt ? "on" : "off")
383);
384
385/*
386 * Tracepoint for #VMEXIT while nested
387 */
388TRACE_EVENT(kvm_nested_vmexit,
389 TP_PROTO(__u64 rip, __u32 exit_code,
390 __u64 exit_info1, __u64 exit_info2,
391 __u32 exit_int_info, __u32 exit_int_info_err),
392 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
393 exit_int_info, exit_int_info_err),
394
395 TP_STRUCT__entry(
396 __field( __u64, rip )
397 __field( __u32, exit_code )
398 __field( __u64, exit_info1 )
399 __field( __u64, exit_info2 )
400 __field( __u32, exit_int_info )
401 __field( __u32, exit_int_info_err )
402 ),
403
404 TP_fast_assign(
405 __entry->rip = rip;
406 __entry->exit_code = exit_code;
407 __entry->exit_info1 = exit_info1;
408 __entry->exit_info2 = exit_info2;
409 __entry->exit_int_info = exit_int_info;
410 __entry->exit_int_info_err = exit_int_info_err;
411 ),
412 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
413 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
414 __entry->rip,
415 ftrace_print_symbols_seq(p, __entry->exit_code,
416 kvm_x86_ops->exit_reasons_str),
417 __entry->exit_info1, __entry->exit_info2,
418 __entry->exit_int_info, __entry->exit_int_info_err)
419);
420
421/*
422 * Tracepoint for #VMEXIT reinjected to the guest
423 */
424TRACE_EVENT(kvm_nested_vmexit_inject,
425 TP_PROTO(__u32 exit_code,
426 __u64 exit_info1, __u64 exit_info2,
427 __u32 exit_int_info, __u32 exit_int_info_err),
428 TP_ARGS(exit_code, exit_info1, exit_info2,
429 exit_int_info, exit_int_info_err),
430
431 TP_STRUCT__entry(
432 __field( __u32, exit_code )
433 __field( __u64, exit_info1 )
434 __field( __u64, exit_info2 )
435 __field( __u32, exit_int_info )
436 __field( __u32, exit_int_info_err )
437 ),
438
439 TP_fast_assign(
440 __entry->exit_code = exit_code;
441 __entry->exit_info1 = exit_info1;
442 __entry->exit_info2 = exit_info2;
443 __entry->exit_int_info = exit_int_info;
444 __entry->exit_int_info_err = exit_int_info_err;
445 ),
446
447 TP_printk("reason: %s ext_inf1: 0x%016llx "
448 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
449 ftrace_print_symbols_seq(p, __entry->exit_code,
450 kvm_x86_ops->exit_reasons_str),
451 __entry->exit_info1, __entry->exit_info2,
452 __entry->exit_int_info, __entry->exit_int_info_err)
453);
454
455/*
456 * Tracepoint for nested #vmexit because of interrupt pending
457 */
458TRACE_EVENT(kvm_nested_intr_vmexit,
459 TP_PROTO(__u64 rip),
460 TP_ARGS(rip),
461
462 TP_STRUCT__entry(
463 __field( __u64, rip )
464 ),
465
466 TP_fast_assign(
467 __entry->rip = rip
468 ),
469
470 TP_printk("rip: 0x%016llx\n", __entry->rip)
471);
472
473/*
474 * Tracepoint for the INVLPGA instruction
475 */
476TRACE_EVENT(kvm_invlpga,
477 TP_PROTO(__u64 rip, int asid, u64 address),
478 TP_ARGS(rip, asid, address),
479
480 TP_STRUCT__entry(
481 __field( __u64, rip )
482 __field( int, asid )
483 __field( __u64, address )
484 ),
485
486 TP_fast_assign(
487 __entry->rip = rip;
488 __entry->asid = asid;
489 __entry->address = address;
490 ),
491
492 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
493 __entry->rip, __entry->asid, __entry->address)
494);
495
496/*
497 * Tracepoint for the SKINIT instruction
498 */
499TRACE_EVENT(kvm_skinit,
500 TP_PROTO(__u64 rip, __u32 slb),
501 TP_ARGS(rip, slb),
502
503 TP_STRUCT__entry(
504 __field( __u64, rip )
505 __field( __u32, slb )
506 ),
507
508 TP_fast_assign(
509 __entry->rip = rip;
510 __entry->slb = slb;
511 ),
512
513 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
514 __entry->rip, __entry->slb)
515);
516
352#endif /* _TRACE_KVM_H */ 517#endif /* _TRACE_KVM_H */
353 518
354/* This part must be outside protection */ 519/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive
67 * executions of PAUSE in a loop. Also indicate if ple enabled.
68 * According to test, this time is usually small than 41 cycles.
69 * ple_window: upper bound on the amount of time a guest is allowed to execute
70 * in a PAUSE loop. Tests indicate that most spinlocks are held for
71 * less than 2^12 cycles
72 * Time is measured based on a counter that runs at the same rate as the TSC,
73 * refer SDM volume 3b section 21.6.13 & 22.1.3.
74 */
75#define KVM_VMX_DEFAULT_PLE_GAP 41
76#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
77static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
78module_param(ple_gap, int, S_IRUGO);
79
80static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
81module_param(ple_window, int, S_IRUGO);
82
64struct vmcs { 83struct vmcs {
65 u32 revision_id; 84 u32 revision_id;
66 u32 abort; 85 u32 abort;
67 char data[0]; 86 char data[0];
68}; 87};
69 88
89struct shared_msr_entry {
90 unsigned index;
91 u64 data;
92 u64 mask;
93};
94
70struct vcpu_vmx { 95struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 96 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 97 struct list_head local_vcpus_link;
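The hunk above only introduces the ple_gap and ple_window parameters and their documentation; programming them into the VMCS happens in the vcpu setup path outside the lines shown here. A minimal sketch of the intended use, assuming the field encodings PLE_GAP and PLE_WINDOW are defined elsewhere in this series and that cpu_has_vmx_ple() (added further down) gates the feature:

/* Assumed wiring of the PLE module parameters into the VMCS; the real code
 * lives in the vcpu setup path, which is outside the hunks shown here. */
static void example_setup_ple(void)
{
	if (!cpu_has_vmx_ple())
		ple_gap = 0;			/* ple_gap == 0 means PLE is disabled */

	if (ple_gap) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmcs_write32(PLE_WINDOW, ple_window);
	}
}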
@@ -74,13 +99,12 @@ struct vcpu_vmx {
74 int launched; 99 int launched;
75 u8 fail; 100 u8 fail;
76 u32 idt_vectoring_info; 101 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 102 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 103 int nmsrs;
80 int save_nmsrs; 104 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 106 u64 msr_host_kernel_gs_base;
107 u64 msr_guest_kernel_gs_base;
84#endif 108#endif
85 struct vmcs *vmcs; 109 struct vmcs *vmcs;
86 struct { 110 struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 112 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 113 int gs_ldt_reload_needed;
90 int fs_reload_needed; 114 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 115 } host_state;
93 struct { 116 struct {
94 int vm86_active; 117 int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
107 } rmode; 130 } rmode;
108 int vpid; 131 int vpid;
109 bool emulation_required; 132 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 133
112 /* Support for vnmi-less CPUs */ 134 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 135 int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 198 VMX_SEGMENT_FIELD(LDTR),
177}; 199};
178 200
201static u64 host_efer;
202
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 203static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 204
181/* 205/*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 208 */
185static const u32 vmx_msr_index[] = { 209static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 210#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 212#endif
189 MSR_EFER, MSR_K6_STAR, 213 MSR_EFER, MSR_K6_STAR,
190}; 214};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 216
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 217static inline int is_page_fault(u32 intr_info)
210{ 218{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 219 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 328 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 329}
322 330
331static inline int cpu_has_vmx_ple(void)
332{
333 return vmcs_config.cpu_based_2nd_exec_ctrl &
334 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
335}
336
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 338{
325 return flexpriority_enabled && 339 return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 362 int i;
349 363
350 for (i = 0; i < vmx->nmsrs; ++i) 364 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 365 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 366 return i;
353 return -1; 367 return -1;
354} 368}
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 393 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 394}
381 395
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 396static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 397{
384 int i; 398 int i;
385 399
@@ -570,17 +584,12 @@ static void reload_tss(void)
570 load_TR_desc(); 584 load_TR_desc();
571} 585}
572 586
573static void load_transition_efer(struct vcpu_vmx *vmx) 587static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 588{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 589 u64 guest_efer;
578 u64 ignore_bits; 590 u64 ignore_bits;
579 591
580 if (efer_offset < 0) 592 guest_efer = vmx->vcpu.arch.shadow_efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 593
585 /* 594 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 595
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 602 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 603 ignore_bits &= ~(u64)EFER_SCE;
595#endif 604#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 605 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 606 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 607 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 608 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 609 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 610}
613 611
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 612static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 613{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 614 struct vcpu_vmx *vmx = to_vmx(vcpu);
615 int i;
617 616
618 if (vmx->host_state.loaded) 617 if (vmx->host_state.loaded)
619 return; 618 return;
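
update_transition_efer() no longer keeps a parallel host_msrs copy; it builds the value the shared-MSR code should load plus a mask saying which bits actually matter, so kvm_set_shared_msr() can skip the wrmsr when only ignorable bits differ. A small standalone model of that decision follows; the EFER bit positions are architectural, everything else is an illustrative simplification of the 64-bit path, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

#define EFER_SCE (1ULL << 0)    /* syscall enable        */
#define EFER_LME (1ULL << 8)    /* long mode enable      */
#define EFER_LMA (1ULL << 10)   /* long mode active      */
#define EFER_NX  (1ULL << 11)   /* no-execute (emulated) */

struct msr_entry { uint64_t data, mask; };

/* Build the entry the shared-MSR code would program: bits KVM does not care
 * about are taken from the host so they never trigger a pointless wrmsr. */
static struct msr_entry transition_efer(uint64_t host_efer, uint64_t guest_efer)
{
    uint64_t ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

    if (guest_efer & EFER_LMA)
        ignore_bits &= ~(uint64_t)EFER_SCE;   /* 64-bit guest really uses SCE */

    struct msr_entry e = {
        .data = (guest_efer & ~ignore_bits) | (host_efer & ignore_bits),
        .mask = ~ignore_bits,                 /* bits compared before a wrmsr */
    };
    return e;
}

int main(void)
{
    struct msr_entry e = transition_efer(EFER_SCE | EFER_NX | EFER_LMA,
                                         EFER_LMA | EFER_SCE);
    printf("EFER to load %#llx, compare mask %#llx\n",
           (unsigned long long)e.data, (unsigned long long)e.mask);
    return 0;
}
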
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 649#endif
651 650
652#ifdef CONFIG_X86_64 651#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 652 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 653 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 654 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 655 }
657#endif 656#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 657 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 658 kvm_set_shared_msr(vmx->guest_msrs[i].index,
659 vmx->guest_msrs[i].data,
660 vmx->guest_msrs[i].mask);
660} 661}
661 662
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 663static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 685 local_irq_restore(flags);
685 } 686 }
686 reload_tss(); 687 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 688#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 689 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 690 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
691 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
692 }
693#endif
690} 694}
691 695
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 696static void vmx_load_host_state(struct vcpu_vmx *vmx)
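
MSR_KERNEL_GS_BASE drops out of the shared-MSR list entirely: vmx_save_host_state() swaps the guest value in with an explicit rdmsrl/wrmsrl pair and __vmx_load_host_state() swaps it back, since how this MSR is handled depends on the guest being in long mode (see the vmx_set_efer hunk further down). A userspace model of that eager swap, with a plain variable standing in for the MSR and hypothetical helper names:

#include <stdint.h>
#include <stdio.h>

static uint64_t kernel_gs_base_msr;           /* stands in for the real MSR */
static uint64_t msr_host_kernel_gs_base;
static uint64_t msr_guest_kernel_gs_base;

static void save_host_state_model(void)       /* just before guest entry */
{
    msr_host_kernel_gs_base = kernel_gs_base_msr;    /* rdmsrl(host)  */
    kernel_gs_base_msr = msr_guest_kernel_gs_base;   /* wrmsrl(guest) */
}

static void load_host_state_model(void)       /* back on the host side */
{
    msr_guest_kernel_gs_base = kernel_gs_base_msr;   /* rdmsrl(guest) */
    kernel_gs_base_msr = msr_host_kernel_gs_base;    /* wrmsrl(host)  */
}

int main(void)
{
    kernel_gs_base_msr = 0x1000;          /* pretend host value  */
    msr_guest_kernel_gs_base = 0x2000;    /* guest's saved value */
    save_host_state_model();
    printf("while in guest: %#llx\n", (unsigned long long)kernel_gs_base_msr);
    load_host_state_model();
    printf("back on host:   %#llx\n", (unsigned long long)kernel_gs_base_msr);
    return 0;
}
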
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
877/* 881/*
878 * Swap MSR entry in host/guest MSR entry array. 882 * Swap MSR entry in host/guest MSR entry array.
879 */ 883 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 884static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 885{
883 struct kvm_msr_entry tmp; 886 struct shared_msr_entry tmp;
884 887
885 tmp = vmx->guest_msrs[to]; 888 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 889 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 890 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 891}
892#endif
893 892
894/* 893/*
895 * Set up the vmcs to automatically save and restore system 894 * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 897 */
899static void setup_msrs(struct vcpu_vmx *vmx) 898static void setup_msrs(struct vcpu_vmx *vmx)
900{ 899{
901 int save_nmsrs; 900 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 901 unsigned long *msr_bitmap;
903 902
904 vmx_load_host_state(vmx); 903 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 904 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 905#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 906 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 907 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 908 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 909 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 913 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 914 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 915 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
920 if (index >= 0)
921 move_msr_up(vmx, index, save_nmsrs++);
922 /* 916 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 917 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 918 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
928 move_msr_up(vmx, index, save_nmsrs++); 922 move_msr_up(vmx, index, save_nmsrs++);
929 } 923 }
930#endif 924#endif
931 vmx->save_nmsrs = save_nmsrs; 925 index = __find_msr_index(vmx, MSR_EFER);
926 if (index >= 0 && update_transition_efer(vmx, index))
927 move_msr_up(vmx, index, save_nmsrs++);
932 928
933#ifdef CONFIG_X86_64 929 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 930
939 if (cpu_has_vmx_msr_bitmap()) { 931 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 932 if (is_long_mode(&vmx->vcpu))
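
setup_msrs() keeps the same compaction scheme, but EFER now joins the front of the list only when update_transition_efer() reports that a switch is really needed, and each guest_msrs entry stores an index into vmx_msr_index[] (hence the __find_msr_index() change earlier). The compaction itself is just "swap the matches to the front"; a minimal sketch of that pattern with made-up MSR numbers:

#include <stdio.h>

struct entry { int index; long data; };

static void move_up(struct entry *e, int from, int to)
{
    struct entry tmp = e[to];
    e[to] = e[from];
    e[from] = tmp;
}

static int find_index(const struct entry *e, int n, int index)
{
    for (int i = 0; i < n; ++i)
        if (e[i].index == index)
            return i;
    return -1;
}

int main(void)
{
    struct entry msrs[] = { {10, 0}, {20, 0}, {30, 0}, {40, 0} };
    int wanted[] = { 30, 10 };           /* MSRs that really need switching */
    int save_nmsrs = 0;

    for (int i = 0; i < 2; ++i) {
        int idx = find_index(msrs, 4, wanted[i]);
        if (idx >= 0)
            move_up(msrs, idx, save_nmsrs++);
    }
    printf("active entries: %d, first index: %d\n", save_nmsrs, msrs[0].index);
    return 0;
}

Only the first save_nmsrs slots are then handed to kvm_set_shared_msr() on every host-state save.
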
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 968static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 969{
978 u64 data; 970 u64 data;
979 struct kvm_msr_entry *msr; 971 struct shared_msr_entry *msr;
980 972
981 if (!pdata) { 973 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 974 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 983 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 984 data = vmcs_readl(GUEST_GS_BASE);
993 break; 985 break;
986 case MSR_KERNEL_GS_BASE:
987 vmx_load_host_state(to_vmx(vcpu));
988 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
989 break;
990#endif
994 case MSR_EFER: 991 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 992 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 993 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 994 data = guest_read_tsc();
999 break; 995 break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1003 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1004 break;
1009 default: 1005 default:
1006 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1007 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1008 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1009 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1098 }
1098} 1099}
1099 1100
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1101static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1102{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1104 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1105 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1106 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1107
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1108 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1109}
1125 1110
1126static __init int cpu_has_kvm_support(void) 1111static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1124 /* locked but not enabled */
1140} 1125}
1141 1126
1142static void hardware_enable(void *garbage) 1127static int hardware_enable(void *garbage)
1143{ 1128{
1144 int cpu = raw_smp_processor_id(); 1129 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1130 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1131 u64 old;
1147 1132
1133 if (read_cr4() & X86_CR4_VMXE)
1134 return -EBUSY;
1135
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1136 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1137 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1138 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1147 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1148 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1149 : "memory", "cc");
1150
1151 ept_sync_global();
1152
1153 return 0;
1162} 1154}
1163 1155
1164static void vmclear_local_vcpus(void) 1156static void vmclear_local_vcpus(void)
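
hardware_enable() now reports failure instead of returning void: if CR4.VMXE is already set, some other VMX user owns the CPU and KVM backs off with -EBUSY rather than executing VMXON; ept_sync_global() also moves here from vmx_init(). A toy model of the check, where the CR4 value is just a stand-in and bit 13 is the architectural VMXE bit:

#include <errno.h>
#include <stdio.h>

#define X86_CR4_VMXE (1UL << 13)    /* VMX-enable bit in CR4 */

static int hardware_enable_model(unsigned long cr4)
{
    if (cr4 & X86_CR4_VMXE)
        return -EBUSY;              /* someone else already ran VMXON */
    /* ...set VMXE, execute VMXON, ept_sync_global()... */
    return 0;
}

int main(void)
{
    printf("%d %d\n", hardware_enable_model(0),
           hardware_enable_model(X86_CR4_VMXE));
    return 0;
}
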
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1242 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1243 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1244 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1245 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1254 if (adjust_vmx_controls(min2, opt2, 1247 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1248 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1249 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
1344{ 1337{
1345 int cpu; 1338 int cpu;
1346 1339
1347 for_each_online_cpu(cpu) 1340 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1341 free_vmcs(per_cpu(vmxarea, cpu));
1342 per_cpu(vmxarea, cpu) = NULL;
1343 }
1349} 1344}
1350 1345
1351static __init int alloc_kvm_area(void) 1346static __init int alloc_kvm_area(void)
1352{ 1347{
1353 int cpu; 1348 int cpu;
1354 1349
1355 for_each_online_cpu(cpu) { 1350 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1351 struct vmcs *vmcs;
1357 1352
1358 vmcs = alloc_vmcs_cpu(cpu); 1353 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1389 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1390 kvm_disable_largepages();
1396 1391
1392 if (!cpu_has_vmx_ple())
1393 ple_gap = 0;
1394
1397 return alloc_kvm_area(); 1395 return alloc_kvm_area();
1398} 1396}
1399 1397
@@ -1536,8 +1534,16 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1534static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1535{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1536 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1537 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1538
1539 if (!msr)
1540 return;
1540 1541
1542 /*
1543 * Force kernel_gs_base reloading before EFER changes, as control
1544 * of this msr depends on is_long_mode().
1545 */
1546 vmx_load_host_state(to_vmx(vcpu));
1541 vcpu->arch.shadow_efer = efer; 1547 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1548 if (!msr)
1543 return; 1549 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1733 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1734 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1735 vcpu->kvm->arch.ept_identity_map_addr;
1736 ept_load_pdptrs(vcpu);
1730 } 1737 }
1731 1738
1732 vmx_flush_tlb(vcpu); 1739 vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2309 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2310 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2311 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2312 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2313 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2314 enable_unrestricted_guest = 0;
2315 }
2307 if (!enable_unrestricted_guest) 2316 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2317 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2318 if (!ple_gap)
2319 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2320 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2321 }
2311 2322
2323 if (ple_gap) {
2324 vmcs_write32(PLE_GAP, ple_gap);
2325 vmcs_write32(PLE_WINDOW, ple_window);
2326 }
2327
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2328 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2329 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2330 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
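
PAUSE-loop exiting lets the CPU detect a vcpu spinning on a lock: if consecutive PAUSEs arrive closer together than PLE_GAP TSC cycles and the whole run of them exceeds PLE_WINDOW cycles, the guest exits with EXIT_REASON_PAUSE_INSTRUCTION and handle_pause() (added further down) yields the CPU via kvm_vcpu_on_spin(). A userspace model of that heuristic, with made-up cycle counts rather than the real ple_gap/ple_window module parameters programmed above:

#include <stdint.h>
#include <stdio.h>

/* 'gap' bounds the distance between PAUSEs that still counts as the same
 * spin loop, 'window' is how long the loop may run before forcing an exit. */
static int pause_loop_exit(const uint64_t *tsc, int n,
                           uint64_t gap, uint64_t window)
{
    uint64_t first = tsc[0];

    for (int i = 1; i < n; ++i) {
        if (tsc[i] - tsc[i - 1] > gap)
            first = tsc[i];          /* loop broken, start a new window */
        else if (tsc[i] - first > window)
            return 1;                /* spinning too long: VM exit      */
    }
    return 0;
}

int main(void)
{
    uint64_t spin[] = { 0, 30, 60, 90, 120, 150, 180, 210 };
    printf("exit = %d\n", pause_loop_exit(spin, 8, 50, 200));
    return 0;
}
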
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2392 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2393 continue;
2378 data = data_low | ((u64)data_high << 32); 2394 data = data_low | ((u64)data_high << 32);
2379 vmx->host_msrs[j].index = index; 2395 vmx->guest_msrs[j].index = i;
2380 vmx->host_msrs[j].reserved = 0; 2396 vmx->guest_msrs[j].data = 0;
2381 vmx->host_msrs[j].data = data; 2397 vmx->guest_msrs[j].mask = -1ull;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2398 ++vmx->nmsrs;
2384 } 2399 }
2385 2400
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2525 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2527
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2530 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2531 vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2627 GUEST_INTR_STATE_NMI)); 2642 GUEST_INTR_STATE_NMI));
2628} 2643}
2629 2644
2645static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2646{
2647 if (!cpu_has_virtual_nmis())
2648 return to_vmx(vcpu)->soft_vnmi_blocked;
2649 else
2650 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2651 GUEST_INTR_STATE_NMI);
2652}
2653
2654static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2655{
2656 struct vcpu_vmx *vmx = to_vmx(vcpu);
2657
2658 if (!cpu_has_virtual_nmis()) {
2659 if (vmx->soft_vnmi_blocked != masked) {
2660 vmx->soft_vnmi_blocked = masked;
2661 vmx->vnmi_blocked_time = 0;
2662 }
2663 } else {
2664 if (masked)
2665 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2666 GUEST_INTR_STATE_NMI);
2667 else
2668 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2669 GUEST_INTR_STATE_NMI);
2670 }
2671}
2672
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2673static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2631{ 2674{
2632 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 2675 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
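
vmx_get_nmi_mask()/vmx_set_nmi_mask() expose NMI blocking to common code so the new KVM_GET/SET_VCPU_EVENTS ioctls in x86.c can save and restore it; with virtual NMIs the state is a bit in the interruptibility field, otherwise it is the software soft_vnmi_blocked flag. A compact model of the two representations, where the bit value mirrors GUEST_INTR_STATE_NMI and the struct is purely illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GUEST_INTR_STATE_NMI 0x8U    /* blocking-by-NMI bit, as in the VMCS */

struct vcpu_model {
    bool     has_virtual_nmis;
    bool     soft_vnmi_blocked;
    uint32_t interruptibility;
};

static void set_nmi_mask(struct vcpu_model *v, bool masked)
{
    if (!v->has_virtual_nmis)
        v->soft_vnmi_blocked = masked;
    else if (masked)
        v->interruptibility |= GUEST_INTR_STATE_NMI;
    else
        v->interruptibility &= ~GUEST_INTR_STATE_NMI;
}

static bool get_nmi_mask(const struct vcpu_model *v)
{
    if (!v->has_virtual_nmis)
        return v->soft_vnmi_blocked;
    return v->interruptibility & GUEST_INTR_STATE_NMI;
}

int main(void)
{
    struct vcpu_model v = { .has_virtual_nmis = true };
    set_nmi_mask(&v, true);
    printf("masked = %d\n", get_nmi_mask(&v));
    return 0;
}
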
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2702 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2703 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2704 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2705 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2706 return 1;
2664 /* 2707 /*
2665 * Forward all other exceptions that are valid in real mode. 2708 * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
2710#endif 2753#endif
2711} 2754}
2712 2755
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2756static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2757{
2715 /* already handled by vcpu_run */ 2758 /* already handled by vcpu_run */
2716 return 1; 2759 return 1;
2717} 2760}
2718 2761
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2762static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2763{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2764 struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2766 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2767 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2768 u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2772 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2773
2730 if (is_machine_check(intr_info)) 2774 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2775 return handle_machine_check(vcpu);
2732 2776
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2777 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2778 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2779 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2780 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2781 vcpu->run->internal.ndata = 2;
2782 vcpu->run->internal.data[0] = vect_info;
2783 vcpu->run->internal.data[1] = intr_info;
2784 return 0;
2785 }
2737 2786
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2788 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2793 }
2745 2794
2746 if (is_invalid_opcode(intr_info)) { 2795 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2796 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2797 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2798 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2799 return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2852 return 0;
2804} 2853}
2805 2854
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2855static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2856{
2809 ++vcpu->stat.irq_exits; 2857 ++vcpu->stat.irq_exits;
2810 return 1; 2858 return 1;
2811} 2859}
2812 2860
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2861static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2862{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2863 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2864 return 0;
2817} 2865}
2818 2866
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2867static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2868{
2821 unsigned long exit_qualification; 2869 unsigned long exit_qualification;
2822 int size, in, string; 2870 int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2875 string = (exit_qualification & 16) != 0;
2828 2876
2829 if (string) { 2877 if (string) {
2830 if (emulate_instruction(vcpu, 2878 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2879 return 0;
2833 return 1; 2880 return 1;
2834 } 2881 }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2885 port = exit_qualification >> 16;
2839 2886
2840 skip_emulated_instruction(vcpu); 2887 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2888 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2889}
2843 2890
2844static void 2891static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2899 hypercall[2] = 0xc1;
2853} 2900}
2854 2901
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2902static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2903{
2857 unsigned long exit_qualification, val; 2904 unsigned long exit_qualification, val;
2858 int cr; 2905 int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 2934 return 1;
2888 if (cr8_prev <= cr8) 2935 if (cr8_prev <= cr8)
2889 return 1; 2936 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2937 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 2938 return 0;
2892 } 2939 }
2893 }; 2940 };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2922 default: 2969 default:
2923 break; 2970 break;
2924 } 2971 }
2925 kvm_run->exit_reason = 0; 2972 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 2973 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 2974 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 2975 return 0;
2929} 2976}
2930 2977
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2978static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 2979{
2933 unsigned long exit_qualification; 2980 unsigned long exit_qualification;
2934 unsigned long val; 2981 unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 2991 * guest debugging itself.
2945 */ 2992 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 2993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 2994 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 2995 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 2996 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 2997 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 2998 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 2999 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3000 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3001 return 0;
2955 } else { 3002 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3003 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3063 return 1;
3017} 3064}
3018 3065
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3066static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3067{
3021 kvm_emulate_cpuid(vcpu); 3068 kvm_emulate_cpuid(vcpu);
3022 return 1; 3069 return 1;
3023} 3070}
3024 3071
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3072static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3073{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3074 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3075 u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3088 return 1;
3042} 3089}
3043 3090
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3091static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3092{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3093 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058 return 1; 3105 return 1;
3059} 3106}
3060 3107
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3108static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3109{
3064 return 1; 3110 return 1;
3065} 3111}
3066 3112
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3113static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3114{
3070 u32 cpu_based_vm_exec_control; 3115 u32 cpu_based_vm_exec_control;
3071 3116
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3126 * possible
3082 */ 3127 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3128 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3129 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3130 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3131 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3132 return 0;
3088 } 3133 }
3089 return 1; 3134 return 1;
3090} 3135}
3091 3136
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3137static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3138{
3094 skip_emulated_instruction(vcpu); 3139 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3140 return kvm_emulate_halt(vcpu);
3096} 3141}
3097 3142
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3143static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3144{
3100 skip_emulated_instruction(vcpu); 3145 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3146 kvm_emulate_hypercall(vcpu);
3102 return 1; 3147 return 1;
3103} 3148}
3104 3149
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3150static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3151{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3152 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3153 return 1;
3109} 3154}
3110 3155
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3156static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3157{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3158 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3159
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3162 return 1;
3118} 3163}
3119 3164
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3165static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3166{
3122 skip_emulated_instruction(vcpu); 3167 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3168 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3169 return 1;
3125} 3170}
3126 3171
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3172static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3173{
3129 unsigned long exit_qualification; 3174 unsigned long exit_qualification;
3130 enum emulation_result er; 3175 enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3178 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3179 offset = exit_qualification & 0xffful;
3135 3180
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3181 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3182
3138 if (er != EMULATE_DONE) { 3183 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3184 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3189 return 1;
3145} 3190}
3146 3191
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3193{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3194 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3195 unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3243 return 1;
3199} 3244}
3200 3245
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3246static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3247{
3203 unsigned long exit_qualification; 3248 unsigned long exit_qualification;
3204 gpa_t gpa; 3249 gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3264 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3265 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3266 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3267 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3268 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3269 return 0;
3225 } 3270 }
3226 3271
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3335 }
3291} 3336}
3292 3337
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3338static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3339{
3295 u64 sptes[4]; 3340 u64 sptes[4];
3296 int nr_sptes, i; 3341 int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3351 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3352 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3353
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3354 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3355 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3356
3312 return 0; 3357 return 0;
3313} 3358}
3314 3359
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3360static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3361{
3317 u32 cpu_based_vm_exec_control; 3362 u32 cpu_based_vm_exec_control;
3318 3363
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3370 return 1;
3326} 3371}
3327 3372
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3373static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3374{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3375 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3376 enum emulation_result err = EMULATE_DONE;
3333 3377 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3378
3337 while (!guest_state_valid(vcpu)) { 3379 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3380 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3381
3340 if (err == EMULATE_DO_MMIO) 3382 if (err == EMULATE_DO_MMIO) {
3341 break; 3383 ret = 0;
3384 goto out;
3385 }
3342 3386
3343 if (err != EMULATE_DONE) { 3387 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3345 break; 3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0;
3392 ret = 0;
3393 goto out;
3346 } 3394 }
3347 3395
3348 if (signal_pending(current)) 3396 if (signal_pending(current))
3349 break; 3397 goto out;
3350 if (need_resched()) 3398 if (need_resched())
3351 schedule(); 3399 schedule();
3352 } 3400 }
3353 3401
3354 preempt_disable(); 3402 vmx->emulation_required = 0;
3355 local_irq_disable(); 3403out:
3404 return ret;
3405}
3356 3406
3357 vmx->invalid_state_emulation_result = err; 3407/*
3408 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
3409 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
3410 */
3411static int handle_pause(struct kvm_vcpu *vcpu)
3412{
3413 skip_emulated_instruction(vcpu);
3414 kvm_vcpu_on_spin(vcpu);
3415
3416 return 1;
3358} 3417}
3359 3418
3360/* 3419/*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3422 * to be done to userspace and return 0.
3364 */ 3423 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3424static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3425 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3426 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3427 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3452 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3397}; 3456};
3398 3457
3399static const int kvm_vmx_max_exit_handlers = 3458static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3462 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3463 * assistance.
3405 */ 3464 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3465static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3466{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3468 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3470
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3471 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3472
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3473 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3474 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3475 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3476
3422 /* Access CR3 don't cause VMExit in paging mode, so we need 3477 /* Access CR3 don't cause VMExit in paging mode, so we need
3423 * to sync with guest real CR3. */ 3478 * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3480 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3481
3427 if (unlikely(vmx->fail)) { 3482 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3483 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3484 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3485 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3486 return 0;
3432 } 3487 }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3514
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3515 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3516 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3517 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3518 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3519 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3520 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3521 }
3467 return 0; 3522 return 0;
3468} 3523}
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3655#define Q "l"
3601#endif 3656#endif
3602 3657
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3658static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3659{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3660 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3661
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3662 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3663 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3664 vmx->entry_time = ktime_get();
3614 3665
3615 /* Handle invalid guest state instead of entering VMX */ 3666 /* Don't enter VMX if guest state is invalid, let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3667 start emulation until we arrive back to a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3668 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3669 return;
3619 }
3620 3670
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3671 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3672 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3825 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3826 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3827 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3828 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3829 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3830 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3851 goto uninit_vcpu;
3803 } 3852 }
3804 3853
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3854 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3855 if (!vmx->vmcs)
3811 goto free_msrs; 3856 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3881free_vmcs:
3837 free_vmcs(vmx->vmcs); 3882 free_vmcs(vmx->vmcs);
3838free_msrs: 3883free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3884 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3885uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3886 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4016 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4017 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4018 .nmi_allowed = vmx_nmi_allowed,
4019 .get_nmi_mask = vmx_get_nmi_mask,
4020 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4021 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4022 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4023 .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
3987 4032
3988static int __init vmx_init(void) 4033static int __init vmx_init(void)
3989{ 4034{
3990 int r; 4035 int r, i;
4036
4037 rdmsrl_safe(MSR_EFER, &host_efer);
4038
4039 for (i = 0; i < NR_VMX_MSR; ++i)
4040 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4041
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4042 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4043 if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4099 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4100 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4101
4052 ept_sync_global();
4053
4054 return 0; 4102 return 0;
4055 4103
4056out3: 4104out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fc80174191c..9d068966fb2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,6 +37,7 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
40#include <trace/events/kvm.h> 41#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 42#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
@@ -88,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
88int ignore_msrs = 0; 89int ignore_msrs = 0;
89module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 90module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
90 91
92#define KVM_NR_SHARED_MSRS 16
93
94struct kvm_shared_msrs_global {
95 int nr;
96 struct kvm_shared_msr {
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100};
101
102struct kvm_shared_msrs {
103 struct user_return_notifier urn;
104 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS];
106};
107
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
109static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
110
91struct kvm_stats_debugfs_item debugfs_entries[] = { 111struct kvm_stats_debugfs_item debugfs_entries[] = {
92 { "pf_fixed", VCPU_STAT(pf_fixed) }, 112 { "pf_fixed", VCPU_STAT(pf_fixed) },
93 { "pf_guest", VCPU_STAT(pf_guest) }, 113 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -124,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
124 { NULL } 144 { NULL }
125}; 145};
126 146
147static void kvm_on_user_return(struct user_return_notifier *urn)
148{
149 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn);
153
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot];
156 if (global->value != locals->current_value[slot]) {
157 wrmsrl(global->msr, global->value);
158 locals->current_value[slot] = global->value;
159 }
160 }
161 locals->registered = false;
162 user_return_notifier_unregister(urn);
163}
164
165void kvm_define_shared_msr(unsigned slot, u32 msr)
166{
167 int cpu;
168 u64 value;
169
170 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr;
173 rdmsrl_safe(msr, &value);
174 shared_msrs_global.msrs[slot].value = value;
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179
180static void kvm_shared_msr_cpu_online(void)
181{
182 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184
185 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value;
187}
188
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192
193 if (((value ^ smsr->current_value[slot]) & mask) == 0)
194 return;
195 smsr->current_value[slot] = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value);
197 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn);
200 smsr->registered = true;
201 }
202}
203EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
204
205static void drop_user_return_notifiers(void *ignore)
206{
207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
208
209 if (smsr->registered)
210 kvm_on_user_return(&smsr->urn);
211}
212
127unsigned long segment_base(u16 selector) 213unsigned long segment_base(u16 selector)
128{ 214{
129 struct descriptor_table gdt; 215 struct descriptor_table gdt;
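
This is the machinery the VMX changes above plug into: kvm_set_shared_msr() writes the guest value immediately, but the host value is only restored by a user-return notifier when the thread heads back to userspace, so back-to-back guest entries on the same CPU skip the MSR writes entirely. A standalone model of that "cache, write on change, restore lazily" pattern; the names and the notifier hook are stand-ins for the kernel APIs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SHARED 2

static uint64_t hw_msr[NR_SHARED];                      /* fake MSRs */
static uint64_t host_value[NR_SHARED]    = { 0x10, 0x20 };
static uint64_t current_value[NR_SHARED] = { 0x10, 0x20 };
static bool notifier_registered;

static void set_shared_msr(unsigned slot, uint64_t value)
{
    if (value == current_value[slot])
        return;                       /* already loaded, skip the wrmsr */
    current_value[slot] = value;
    hw_msr[slot] = value;             /* wrmsrl()                       */
    notifier_registered = true;       /* arm the user-return notifier   */
}

static void on_user_return(void)      /* runs before returning to userspace */
{
    for (unsigned s = 0; s < NR_SHARED; ++s)
        if (current_value[s] != host_value[s]) {
            hw_msr[s] = host_value[s];
            current_value[s] = host_value[s];
        }
    notifier_registered = false;
}

int main(void)
{
    set_shared_msr(0, 0xabc);         /* first guest entry: one write   */
    set_shared_msr(0, 0xabc);         /* next entries: no write at all  */
    if (notifier_registered)
        on_user_return();
    printf("MSR0 restored to %#llx\n", (unsigned long long)hw_msr[0]);
    return 0;
}
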
@@ -485,16 +571,19 @@ static inline u32 bit(int bitno)
485 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 571 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
486 * 572 *
487 * This list is modified at module load time to reflect the 573 * This list is modified at module load time to reflect the
488 * capabilities of the host cpu. 574 * capabilities of the host cpu. This capabilities test skips MSRs that are
575 * kvm-specific. Those are put in the beginning of the list.
489 */ 576 */
577
578#define KVM_SAVE_MSRS_BEGIN 2
490static u32 msrs_to_save[] = { 579static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
491 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
492 MSR_K6_STAR, 582 MSR_K6_STAR,
493#ifdef CONFIG_X86_64 583#ifdef CONFIG_X86_64
494 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 584 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
495#endif 585#endif
496 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 586 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
498}; 587};
499 588
500static unsigned num_msrs_to_save; 589static unsigned num_msrs_to_save;
@@ -678,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
678 /* With all the info we got, fill in the values */ 767 /* With all the info we got, fill in the values */
679 768
680 vcpu->hv_clock.system_time = ts.tv_nsec + 769 vcpu->hv_clock.system_time = ts.tv_nsec +
681 (NSEC_PER_SEC * (u64)ts.tv_sec); 770 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
771
682 /* 772 /*
683 * The interface expects us to write an even number signaling that the 773 * The interface expects us to write an even number signaling that the
684 * update is finished. Since the guest won't see the intermediate 774 * update is finished. Since the guest won't see the intermediate
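
kvm_write_guest_time() now folds a per-VM kvmclock_offset into the published system_time, which is what the new KVM_GET_CLOCK/KVM_SET_CLOCK vm ioctls at the end of this file adjust: setting the clock presumably records the difference between the requested value and the host's current time, and reading it adds the offset back. A small sketch of that arithmetic, under that assumption:

#include <stdint.h>
#include <stdio.h>

static int64_t kvmclock_offset;       /* per-VM, starts at 0 */

static void set_clock(uint64_t requested_ns, uint64_t host_ns)
{
    kvmclock_offset = (int64_t)(requested_ns - host_ns);
}

static uint64_t get_clock(uint64_t host_ns)
{
    return host_ns + kvmclock_offset; /* same sum kvm_write_guest_time uses */
}

int main(void)
{
    /* e.g. restoring a migrated guest whose clock lags the new host */
    set_clock(5000000000ULL, 7000000000ULL);
    printf("guest clock: %llu\n",
           (unsigned long long)get_clock(7000000123ULL));
    return 0;
}
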
@@ -836,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
836 return 0; 926 return 0;
837} 927}
838 928
929static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
930{
931 struct kvm *kvm = vcpu->kvm;
932 int lm = is_long_mode(vcpu);
933 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
936 : kvm->arch.xen_hvm_config.blob_size_32;
937 u32 page_num = data & ~PAGE_MASK;
938 u64 page_addr = data & PAGE_MASK;
939 u8 *page;
940 int r;
941
942 r = -E2BIG;
943 if (page_num >= blob_size)
944 goto out;
945 r = -ENOMEM;
946 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
947 if (!page)
948 goto out;
949 r = -EFAULT;
950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
951 goto out_free;
952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
953 goto out_free;
954 r = 0;
955out_free:
956 kfree(page);
957out:
958 return r;
959}
960
839int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
840{ 962{
841 switch (msr) { 963 switch (msr) {
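
xen_hvm_config() is the guest-facing half: when the guest writes the configured MSR, the low bits select a page of the userspace-supplied hypercall blob and the page-aligned part gives the guest address it is copied to. Userspace arms this with the new KVM_XEN_HVM_CONFIG vm ioctl handled below; a hedged usage sketch in which vm_fd, the MSR number and the blob contents are all the VMM's choice:

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static char hypercall_blob32[4096];   /* filled with the 32-bit hypercall page */
static char hypercall_blob64[4096];   /* filled with the 64-bit hypercall page */

static int enable_xen_hvm(int vm_fd)
{
    struct kvm_xen_hvm_config cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.msr          = 0x40000000;    /* example MSR number, VMM's choice */
    cfg.blob_addr_32 = (uintptr_t)hypercall_blob32;
    cfg.blob_addr_64 = (uintptr_t)hypercall_blob64;
    cfg.blob_size_32 = 1;             /* sizes are counted in pages */
    cfg.blob_size_64 = 1;

    if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) < 0) {
        perror("KVM_XEN_HVM_CONFIG");
        return -1;
    }
    return 0;
}
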
@@ -951,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
951 "0x%x data 0x%llx\n", msr, data); 1073 "0x%x data 0x%llx\n", msr, data);
952 break; 1074 break;
953 default: 1075 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data);
954 if (!ignore_msrs) { 1078 if (!ignore_msrs) {
955 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
956 msr, data); 1080 msr, data);
@@ -1225,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1225 case KVM_CAP_PIT2: 1349 case KVM_CAP_PIT2:
1226 case KVM_CAP_PIT_STATE2: 1350 case KVM_CAP_PIT_STATE2:
1227 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1352 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS:
1228 r = 1; 1355 r = 1;
1229 break; 1356 break;
1230 case KVM_CAP_COALESCED_MMIO: 1357 case KVM_CAP_COALESCED_MMIO:
@@ -1239,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1239 case KVM_CAP_NR_MEMSLOTS: 1366 case KVM_CAP_NR_MEMSLOTS:
1240 r = KVM_MEMORY_SLOTS; 1367 r = KVM_MEMORY_SLOTS;
1241 break; 1368 break;
1242 case KVM_CAP_PV_MMU: 1369 case KVM_CAP_PV_MMU: /* obsolete */
1243 r = !tdp_enabled; 1370 r = 0;
1244 break; 1371 break;
1245 case KVM_CAP_IOMMU: 1372 case KVM_CAP_IOMMU:
1246 r = iommu_found(); 1373 r = iommu_found();
@@ -1327,6 +1454,12 @@ out:
1327void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1454void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1328{ 1455{
1329 kvm_x86_ops->vcpu_load(vcpu, cpu); 1456 kvm_x86_ops->vcpu_load(vcpu, cpu);
1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1458 unsigned long khz = cpufreq_quick_get(cpu);
1459 if (!khz)
1460 khz = tsc_khz;
1461 per_cpu(cpu_tsc_khz, cpu) = khz;
1462 }
1330 kvm_request_guest_time_update(vcpu); 1463 kvm_request_guest_time_update(vcpu);
1331} 1464}
1332 1465
@@ -1760,6 +1893,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1760 return 0; 1893 return 0;
1761} 1894}
1762 1895
1896static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1897 struct kvm_vcpu_events *events)
1898{
1899 vcpu_load(vcpu);
1900
1901 events->exception.injected = vcpu->arch.exception.pending;
1902 events->exception.nr = vcpu->arch.exception.nr;
1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1904 events->exception.error_code = vcpu->arch.exception.error_code;
1905
1906 events->interrupt.injected = vcpu->arch.interrupt.pending;
1907 events->interrupt.nr = vcpu->arch.interrupt.nr;
1908 events->interrupt.soft = vcpu->arch.interrupt.soft;
1909
1910 events->nmi.injected = vcpu->arch.nmi_injected;
1911 events->nmi.pending = vcpu->arch.nmi_pending;
1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1913
1914 events->sipi_vector = vcpu->arch.sipi_vector;
1915
1916 events->flags = 0;
1917
1918 vcpu_put(vcpu);
1919}
1920
1921static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1922 struct kvm_vcpu_events *events)
1923{
1924 if (events->flags)
1925 return -EINVAL;
1926
1927 vcpu_load(vcpu);
1928
1929 vcpu->arch.exception.pending = events->exception.injected;
1930 vcpu->arch.exception.nr = events->exception.nr;
1931 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1932 vcpu->arch.exception.error_code = events->exception.error_code;
1933
1934 vcpu->arch.interrupt.pending = events->interrupt.injected;
1935 vcpu->arch.interrupt.nr = events->interrupt.nr;
1936 vcpu->arch.interrupt.soft = events->interrupt.soft;
1937 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1938 kvm_pic_clear_isr_ack(vcpu->kvm);
1939
1940 vcpu->arch.nmi_injected = events->nmi.injected;
1941 vcpu->arch.nmi_pending = events->nmi.pending;
1942 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1943
1944 vcpu->arch.sipi_vector = events->sipi_vector;
1945
1946 vcpu_put(vcpu);
1947
1948 return 0;
1949}
1950
1763long kvm_arch_vcpu_ioctl(struct file *filp, 1951long kvm_arch_vcpu_ioctl(struct file *filp,
1764 unsigned int ioctl, unsigned long arg) 1952 unsigned int ioctl, unsigned long arg)
1765{ 1953{
@@ -1770,6 +1958,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1770 1958
1771 switch (ioctl) { 1959 switch (ioctl) {
1772 case KVM_GET_LAPIC: { 1960 case KVM_GET_LAPIC: {
1961 r = -EINVAL;
1962 if (!vcpu->arch.apic)
1963 goto out;
1773 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1964 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1774 1965
1775 r = -ENOMEM; 1966 r = -ENOMEM;
@@ -1785,6 +1976,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1785 break; 1976 break;
1786 } 1977 }
1787 case KVM_SET_LAPIC: { 1978 case KVM_SET_LAPIC: {
1979 r = -EINVAL;
1980 if (!vcpu->arch.apic)
1981 goto out;
1788 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1982 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1789 r = -ENOMEM; 1983 r = -ENOMEM;
1790 if (!lapic) 1984 if (!lapic)
@@ -1911,6 +2105,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1911 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2105 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1912 break; 2106 break;
1913 } 2107 }
2108 case KVM_GET_VCPU_EVENTS: {
2109 struct kvm_vcpu_events events;
2110
2111 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2112
2113 r = -EFAULT;
2114 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2115 break;
2116 r = 0;
2117 break;
2118 }
2119 case KVM_SET_VCPU_EVENTS: {
2120 struct kvm_vcpu_events events;
2121
2122 r = -EFAULT;
2123 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2124 break;
2125
2126 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2127 break;
2128 }
1914 default: 2129 default:
1915 r = -EINVAL; 2130 r = -EINVAL;
1916 } 2131 }
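
KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS give userspace the pending exception, interrupt and NMI state (including the NMI mask wired through get_nmi_mask/set_nmi_mask in the vmx patch above), so a migration does not lose an injected-but-undelivered event. A sketch of round-tripping that state between two vcpu fds, which are assumed to exist:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int copy_vcpu_events(int src_vcpu_fd, int dst_vcpu_fd)
{
    struct kvm_vcpu_events events;

    if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0) {
        perror("KVM_GET_VCPU_EVENTS");
        return -1;
    }
    /* the kernel fills events.flags with 0 and rejects a nonzero value
     * on the set side, so the structure can be passed back untouched */
    if (ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0) {
        perror("KVM_SET_VCPU_EVENTS");
        return -1;
    }
    printf("nmi masked: %d, exception injected: %d\n",
           events.nmi.masked, events.exception.injected);
    return 0;
}
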
@@ -2039,9 +2254,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2039 sizeof(struct kvm_pic_state)); 2254 sizeof(struct kvm_pic_state));
2040 break; 2255 break;
2041 case KVM_IRQCHIP_IOAPIC: 2256 case KVM_IRQCHIP_IOAPIC:
2042 memcpy(&chip->chip.ioapic, 2257 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2043 ioapic_irqchip(kvm),
2044 sizeof(struct kvm_ioapic_state));
2045 break; 2258 break;
2046 default: 2259 default:
2047 r = -EINVAL; 2260 r = -EINVAL;
@@ -2071,11 +2284,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2071 spin_unlock(&pic_irqchip(kvm)->lock); 2284 spin_unlock(&pic_irqchip(kvm)->lock);
2072 break; 2285 break;
2073 case KVM_IRQCHIP_IOAPIC: 2286 case KVM_IRQCHIP_IOAPIC:
2074 mutex_lock(&kvm->irq_lock); 2287 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2075 memcpy(ioapic_irqchip(kvm),
2076 &chip->chip.ioapic,
2077 sizeof(struct kvm_ioapic_state));
2078 mutex_unlock(&kvm->irq_lock);
2079 break; 2288 break;
2080 default: 2289 default:
2081 r = -EINVAL; 2290 r = -EINVAL;
@@ -2183,7 +2392,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2183{ 2392{
2184 struct kvm *kvm = filp->private_data; 2393 struct kvm *kvm = filp->private_data;
2185 void __user *argp = (void __user *)arg; 2394 void __user *argp = (void __user *)arg;
2186 int r = -EINVAL; 2395 int r = -ENOTTY;
2187 /* 2396 /*
2188 * This union makes it completely explicit to gcc-3.x 2397 * This union makes it completely explicit to gcc-3.x
2189 * that these two variables' stack usage should be 2398 * that these two variables' stack usage should be
@@ -2245,25 +2454,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2245 if (r) 2454 if (r)
2246 goto out; 2455 goto out;
2247 break; 2456 break;
2248 case KVM_CREATE_IRQCHIP: 2457 case KVM_CREATE_IRQCHIP: {
2458 struct kvm_pic *vpic;
2459
2460 mutex_lock(&kvm->lock);
2461 r = -EEXIST;
2462 if (kvm->arch.vpic)
2463 goto create_irqchip_unlock;
2249 r = -ENOMEM; 2464 r = -ENOMEM;
2250 kvm->arch.vpic = kvm_create_pic(kvm); 2465 vpic = kvm_create_pic(kvm);
2251 if (kvm->arch.vpic) { 2466 if (vpic) {
2252 r = kvm_ioapic_init(kvm); 2467 r = kvm_ioapic_init(kvm);
2253 if (r) { 2468 if (r) {
2254 kfree(kvm->arch.vpic); 2469 kfree(vpic);
2255 kvm->arch.vpic = NULL; 2470 goto create_irqchip_unlock;
2256 goto out;
2257 } 2471 }
2258 } else 2472 } else
2259 goto out; 2473 goto create_irqchip_unlock;
2474 smp_wmb();
2475 kvm->arch.vpic = vpic;
2476 smp_wmb();
2260 r = kvm_setup_default_irq_routing(kvm); 2477 r = kvm_setup_default_irq_routing(kvm);
2261 if (r) { 2478 if (r) {
2479 mutex_lock(&kvm->irq_lock);
2262 kfree(kvm->arch.vpic); 2480 kfree(kvm->arch.vpic);
2263 kfree(kvm->arch.vioapic); 2481 kfree(kvm->arch.vioapic);
2264 goto out; 2482 kvm->arch.vpic = NULL;
2483 kvm->arch.vioapic = NULL;
2484 mutex_unlock(&kvm->irq_lock);
2265 } 2485 }
2486 create_irqchip_unlock:
2487 mutex_unlock(&kvm->lock);
2266 break; 2488 break;
2489 }
2267 case KVM_CREATE_PIT: 2490 case KVM_CREATE_PIT:
2268 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2491 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2269 goto create_pit; 2492 goto create_pit;
@@ -2293,10 +2516,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2293 goto out; 2516 goto out;
2294 if (irqchip_in_kernel(kvm)) { 2517 if (irqchip_in_kernel(kvm)) {
2295 __s32 status; 2518 __s32 status;
2296 mutex_lock(&kvm->irq_lock);
2297 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2519 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2298 irq_event.irq, irq_event.level); 2520 irq_event.irq, irq_event.level);
2299 mutex_unlock(&kvm->irq_lock);
2300 if (ioctl == KVM_IRQ_LINE_STATUS) { 2521 if (ioctl == KVM_IRQ_LINE_STATUS) {
2301 irq_event.status = status; 2522 irq_event.status = status;
2302 if (copy_to_user(argp, &irq_event, 2523 if (copy_to_user(argp, &irq_event,
@@ -2422,6 +2643,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2422 r = 0; 2643 r = 0;
2423 break; 2644 break;
2424 } 2645 }
2646 case KVM_XEN_HVM_CONFIG: {
2647 r = -EFAULT;
2648 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2649 sizeof(struct kvm_xen_hvm_config)))
2650 goto out;
2651 r = -EINVAL;
2652 if (kvm->arch.xen_hvm_config.flags)
2653 goto out;
2654 r = 0;
2655 break;
2656 }
2657 case KVM_SET_CLOCK: {
2658 struct timespec now;
2659 struct kvm_clock_data user_ns;
2660 u64 now_ns;
2661 s64 delta;
2662
2663 r = -EFAULT;
2664 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2665 goto out;
2666
2667 r = -EINVAL;
2668 if (user_ns.flags)
2669 goto out;
2670
2671 r = 0;
2672 ktime_get_ts(&now);
2673 now_ns = timespec_to_ns(&now);
2674 delta = user_ns.clock - now_ns;
2675 kvm->arch.kvmclock_offset = delta;
2676 break;
2677 }
2678 case KVM_GET_CLOCK: {
2679 struct timespec now;
2680 struct kvm_clock_data user_ns;
2681 u64 now_ns;
2682
2683 ktime_get_ts(&now);
2684 now_ns = timespec_to_ns(&now);
2685 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2686 user_ns.flags = 0;
2687
2688 r = -EFAULT;
2689 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2690 goto out;
2691 r = 0;
2692 break;
2693 }
2694
2425 default: 2695 default:
2426 ; 2696 ;
2427 } 2697 }
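
The KVM_SET_CLOCK handler above only records an offset (the user-supplied value minus the host's current monotonic time), and KVM_GET_CLOCK adds that offset back, so a VMM can read the value on the source VM and re-apply it on the destination. A minimal sketch, assuming src_vm_fd/dst_vm_fd are open VM file descriptors (illustrative only, not part of the patch):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Requires KVM_CAP_ADJUST_CLOCK on both sides. */
static int migrate_kvmclock(int src_vm_fd, int dst_vm_fd)
{
        struct kvm_clock_data data;

        if (ioctl(src_vm_fd, KVM_GET_CLOCK, &data) < 0)
                return -1;
        /* data.flags comes back as 0 and must stay 0, or KVM_SET_CLOCK
         * rejects the call with -EINVAL. */
        if (ioctl(dst_vm_fd, KVM_SET_CLOCK, &data) < 0)
                return -1;
        return 0;
}
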
@@ -2434,7 +2704,8 @@ static void kvm_init_msr_list(void)
2434 u32 dummy[2]; 2704 u32 dummy[2];
2435 unsigned i, j; 2705 unsigned i, j;
2436 2706
2437 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2707 /* skip the first msrs in the list. KVM-specific */
2708 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2438 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2709 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2439 continue; 2710 continue;
2440 if (j < i) 2711 if (j < i)
@@ -2758,13 +3029,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2758} 3029}
2759 3030
2760int emulate_instruction(struct kvm_vcpu *vcpu, 3031int emulate_instruction(struct kvm_vcpu *vcpu,
2761 struct kvm_run *run,
2762 unsigned long cr2, 3032 unsigned long cr2,
2763 u16 error_code, 3033 u16 error_code,
2764 int emulation_type) 3034 int emulation_type)
2765{ 3035{
2766 int r, shadow_mask; 3036 int r, shadow_mask;
2767 struct decode_cache *c; 3037 struct decode_cache *c;
3038 struct kvm_run *run = vcpu->run;
2768 3039
2769 kvm_clear_exception_queue(vcpu); 3040 kvm_clear_exception_queue(vcpu);
2770 vcpu->arch.mmio_fault_cr2 = cr2; 3041 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2784,7 +3055,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2784 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3055 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2785 3056
2786 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3057 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2787 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3058 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2788 vcpu->arch.emulate_ctxt.mode = 3059 vcpu->arch.emulate_ctxt.mode =
2789 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3060 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2790 ? X86EMUL_MODE_REAL : cs_l 3061 ? X86EMUL_MODE_REAL : cs_l
@@ -2862,7 +3133,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2862 return EMULATE_DO_MMIO; 3133 return EMULATE_DO_MMIO;
2863 } 3134 }
2864 3135
2865 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3136 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2866 3137
2867 if (vcpu->mmio_is_write) { 3138 if (vcpu->mmio_is_write) {
2868 vcpu->mmio_needed = 0; 3139 vcpu->mmio_needed = 0;
@@ -2970,8 +3241,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2970 return r; 3241 return r;
2971} 3242}
2972 3243
2973int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3244int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2974 int size, unsigned port)
2975{ 3245{
2976 unsigned long val; 3246 unsigned long val;
2977 3247
@@ -3000,7 +3270,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3000} 3270}
3001EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3271EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3002 3272
3003int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3273int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3004 int size, unsigned long count, int down, 3274 int size, unsigned long count, int down,
3005 gva_t address, int rep, unsigned port) 3275 gva_t address, int rep, unsigned port)
3006{ 3276{
@@ -3073,9 +3343,6 @@ static void bounce_off(void *info)
3073 /* nothing */ 3343 /* nothing */
3074} 3344}
3075 3345
3076static unsigned int ref_freq;
3077static unsigned long tsc_khz_ref;
3078
3079static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3346static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3080 void *data) 3347 void *data)
3081{ 3348{
@@ -3084,14 +3351,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3084 struct kvm_vcpu *vcpu; 3351 struct kvm_vcpu *vcpu;
3085 int i, send_ipi = 0; 3352 int i, send_ipi = 0;
3086 3353
3087 if (!ref_freq)
3088 ref_freq = freq->old;
3089
3090 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3354 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3091 return 0; 3355 return 0;
3092 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3356 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3093 return 0; 3357 return 0;
3094 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3358 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3095 3359
3096 spin_lock(&kvm_lock); 3360 spin_lock(&kvm_lock);
3097 list_for_each_entry(kvm, &vm_list, vm_list) { 3361 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3128,9 +3392,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3128 .notifier_call = kvmclock_cpufreq_notifier 3392 .notifier_call = kvmclock_cpufreq_notifier
3129}; 3393};
3130 3394
3395static void kvm_timer_init(void)
3396{
3397 int cpu;
3398
3399 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3400 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3401 CPUFREQ_TRANSITION_NOTIFIER);
3402 for_each_online_cpu(cpu) {
3403 unsigned long khz = cpufreq_get(cpu);
3404 if (!khz)
3405 khz = tsc_khz;
3406 per_cpu(cpu_tsc_khz, cpu) = khz;
3407 }
3408 } else {
3409 for_each_possible_cpu(cpu)
3410 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3411 }
3412}
3413
3131int kvm_arch_init(void *opaque) 3414int kvm_arch_init(void *opaque)
3132{ 3415{
3133 int r, cpu; 3416 int r;
3134 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3417 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3135 3418
3136 if (kvm_x86_ops) { 3419 if (kvm_x86_ops) {
@@ -3162,13 +3445,7 @@ int kvm_arch_init(void *opaque)
3162 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3445 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3163 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3446 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3164 3447
3165 for_each_possible_cpu(cpu) 3448 kvm_timer_init();
3166 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3167 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3168 tsc_khz_ref = tsc_khz;
3169 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3170 CPUFREQ_TRANSITION_NOTIFIER);
3171 }
3172 3449
3173 return 0; 3450 return 0;
3174 3451
@@ -3296,7 +3573,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3296 unsigned long *rflags) 3573 unsigned long *rflags)
3297{ 3574{
3298 kvm_lmsw(vcpu, msw); 3575 kvm_lmsw(vcpu, msw);
3299 *rflags = kvm_x86_ops->get_rflags(vcpu); 3576 *rflags = kvm_get_rflags(vcpu);
3300} 3577}
3301 3578
3302unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3579unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3334,7 +3611,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3334 switch (cr) { 3611 switch (cr) {
3335 case 0: 3612 case 0:
3336 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3613 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3337 *rflags = kvm_x86_ops->get_rflags(vcpu); 3614 *rflags = kvm_get_rflags(vcpu);
3338 break; 3615 break;
3339 case 2: 3616 case 2:
3340 vcpu->arch.cr2 = val; 3617 vcpu->arch.cr2 = val;
@@ -3454,18 +3731,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3454 * 3731 *
3455 * No need to exit to userspace if we already have an interrupt queued. 3732 * No need to exit to userspace if we already have an interrupt queued.
3456 */ 3733 */
3457static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3734static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3458 struct kvm_run *kvm_run)
3459{ 3735{
3460 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3736 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3461 kvm_run->request_interrupt_window && 3737 vcpu->run->request_interrupt_window &&
3462 kvm_arch_interrupt_allowed(vcpu)); 3738 kvm_arch_interrupt_allowed(vcpu));
3463} 3739}
3464 3740
3465static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3741static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3466 struct kvm_run *kvm_run)
3467{ 3742{
3468 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3743 struct kvm_run *kvm_run = vcpu->run;
3744
3745 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3469 kvm_run->cr8 = kvm_get_cr8(vcpu); 3746 kvm_run->cr8 = kvm_get_cr8(vcpu);
3470 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3747 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3471 if (irqchip_in_kernel(vcpu->kvm)) 3748 if (irqchip_in_kernel(vcpu->kvm))
@@ -3526,7 +3803,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3526 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3803 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3527} 3804}
3528 3805
3529static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3806static void inject_pending_event(struct kvm_vcpu *vcpu)
3530{ 3807{
3531 /* try to reinject previous events if any */ 3808 /* try to reinject previous events if any */
3532 if (vcpu->arch.exception.pending) { 3809 if (vcpu->arch.exception.pending) {
@@ -3562,11 +3839,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3562 } 3839 }
3563} 3840}
3564 3841
3565static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3842static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3566{ 3843{
3567 int r; 3844 int r;
3568 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3845 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3569 kvm_run->request_interrupt_window; 3846 vcpu->run->request_interrupt_window;
3570 3847
3571 if (vcpu->requests) 3848 if (vcpu->requests)
3572 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3849 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3587,12 +3864,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3587 kvm_x86_ops->tlb_flush(vcpu); 3864 kvm_x86_ops->tlb_flush(vcpu);
3588 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3865 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3589 &vcpu->requests)) { 3866 &vcpu->requests)) {
3590 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3867 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3591 r = 0; 3868 r = 0;
3592 goto out; 3869 goto out;
3593 } 3870 }
3594 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3871 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3595 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3872 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3596 r = 0; 3873 r = 0;
3597 goto out; 3874 goto out;
3598 } 3875 }
@@ -3616,7 +3893,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3616 goto out; 3893 goto out;
3617 } 3894 }
3618 3895
3619 inject_pending_event(vcpu, kvm_run); 3896 inject_pending_event(vcpu);
3620 3897
3621 /* enable NMI/IRQ window open exits if needed */ 3898 /* enable NMI/IRQ window open exits if needed */
3622 if (vcpu->arch.nmi_pending) 3899 if (vcpu->arch.nmi_pending)
@@ -3642,7 +3919,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3642 } 3919 }
3643 3920
3644 trace_kvm_entry(vcpu->vcpu_id); 3921 trace_kvm_entry(vcpu->vcpu_id);
3645 kvm_x86_ops->run(vcpu, kvm_run); 3922 kvm_x86_ops->run(vcpu);
3646 3923
3647 /* 3924 /*
3648 * If the guest has used debug registers, at least dr7 3925 * If the guest has used debug registers, at least dr7
@@ -3684,13 +3961,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3684 3961
3685 kvm_lapic_sync_from_vapic(vcpu); 3962 kvm_lapic_sync_from_vapic(vcpu);
3686 3963
3687 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3964 r = kvm_x86_ops->handle_exit(vcpu);
3688out: 3965out:
3689 return r; 3966 return r;
3690} 3967}
3691 3968
3692 3969
3693static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3970static int __vcpu_run(struct kvm_vcpu *vcpu)
3694{ 3971{
3695 int r; 3972 int r;
3696 3973
@@ -3710,7 +3987,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3710 r = 1; 3987 r = 1;
3711 while (r > 0) { 3988 while (r > 0) {
3712 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3989 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3713 r = vcpu_enter_guest(vcpu, kvm_run); 3990 r = vcpu_enter_guest(vcpu);
3714 else { 3991 else {
3715 up_read(&vcpu->kvm->slots_lock); 3992 up_read(&vcpu->kvm->slots_lock);
3716 kvm_vcpu_block(vcpu); 3993 kvm_vcpu_block(vcpu);
@@ -3738,14 +4015,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3738 if (kvm_cpu_has_pending_timer(vcpu)) 4015 if (kvm_cpu_has_pending_timer(vcpu))
3739 kvm_inject_pending_timer_irqs(vcpu); 4016 kvm_inject_pending_timer_irqs(vcpu);
3740 4017
3741 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4018 if (dm_request_for_irq_injection(vcpu)) {
3742 r = -EINTR; 4019 r = -EINTR;
3743 kvm_run->exit_reason = KVM_EXIT_INTR; 4020 vcpu->run->exit_reason = KVM_EXIT_INTR;
3744 ++vcpu->stat.request_irq_exits; 4021 ++vcpu->stat.request_irq_exits;
3745 } 4022 }
3746 if (signal_pending(current)) { 4023 if (signal_pending(current)) {
3747 r = -EINTR; 4024 r = -EINTR;
3748 kvm_run->exit_reason = KVM_EXIT_INTR; 4025 vcpu->run->exit_reason = KVM_EXIT_INTR;
3749 ++vcpu->stat.signal_exits; 4026 ++vcpu->stat.signal_exits;
3750 } 4027 }
3751 if (need_resched()) { 4028 if (need_resched()) {
@@ -3756,7 +4033,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3756 } 4033 }
3757 4034
3758 up_read(&vcpu->kvm->slots_lock); 4035 up_read(&vcpu->kvm->slots_lock);
3759 post_kvm_run_save(vcpu, kvm_run); 4036 post_kvm_run_save(vcpu);
3760 4037
3761 vapic_exit(vcpu); 4038 vapic_exit(vcpu);
3762 4039
@@ -3789,15 +4066,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3789 if (r) 4066 if (r)
3790 goto out; 4067 goto out;
3791 } 4068 }
3792#if CONFIG_HAS_IOMEM
3793 if (vcpu->mmio_needed) { 4069 if (vcpu->mmio_needed) {
3794 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4070 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3795 vcpu->mmio_read_completed = 1; 4071 vcpu->mmio_read_completed = 1;
3796 vcpu->mmio_needed = 0; 4072 vcpu->mmio_needed = 0;
3797 4073
3798 down_read(&vcpu->kvm->slots_lock); 4074 down_read(&vcpu->kvm->slots_lock);
3799 r = emulate_instruction(vcpu, kvm_run, 4075 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3800 vcpu->arch.mmio_fault_cr2, 0,
3801 EMULTYPE_NO_DECODE); 4076 EMULTYPE_NO_DECODE);
3802 up_read(&vcpu->kvm->slots_lock); 4077 up_read(&vcpu->kvm->slots_lock);
3803 if (r == EMULATE_DO_MMIO) { 4078 if (r == EMULATE_DO_MMIO) {
@@ -3808,12 +4083,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3808 goto out; 4083 goto out;
3809 } 4084 }
3810 } 4085 }
3811#endif
3812 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4086 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3813 kvm_register_write(vcpu, VCPU_REGS_RAX, 4087 kvm_register_write(vcpu, VCPU_REGS_RAX,
3814 kvm_run->hypercall.ret); 4088 kvm_run->hypercall.ret);
3815 4089
3816 r = __vcpu_run(vcpu, kvm_run); 4090 r = __vcpu_run(vcpu);
3817 4091
3818out: 4092out:
3819 if (vcpu->sigset_active) 4093 if (vcpu->sigset_active)
@@ -3847,13 +4121,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3847#endif 4121#endif
3848 4122
3849 regs->rip = kvm_rip_read(vcpu); 4123 regs->rip = kvm_rip_read(vcpu);
3850 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4124 regs->rflags = kvm_get_rflags(vcpu);
3851
3852 /*
3853 * Don't leak debug flags in case they were set for guest debugging
3854 */
3855 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3856 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3857 4125
3858 vcpu_put(vcpu); 4126 vcpu_put(vcpu);
3859 4127
@@ -3881,12 +4149,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3881 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4149 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3882 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4150 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3883 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4151 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3884
3885#endif 4152#endif
3886 4153
3887 kvm_rip_write(vcpu, regs->rip); 4154 kvm_rip_write(vcpu, regs->rip);
3888 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4155 kvm_set_rflags(vcpu, regs->rflags);
3889
3890 4156
3891 vcpu->arch.exception.pending = false; 4157 vcpu->arch.exception.pending = false;
3892 4158
@@ -4105,7 +4371,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4105{ 4371{
4106 return (seg != VCPU_SREG_LDTR) && 4372 return (seg != VCPU_SREG_LDTR) &&
4107 (seg != VCPU_SREG_TR) && 4373 (seg != VCPU_SREG_TR) &&
4108 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4374 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4109} 4375}
4110 4376
4111int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4377int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4133,7 +4399,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4133{ 4399{
4134 tss->cr3 = vcpu->arch.cr3; 4400 tss->cr3 = vcpu->arch.cr3;
4135 tss->eip = kvm_rip_read(vcpu); 4401 tss->eip = kvm_rip_read(vcpu);
4136 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4402 tss->eflags = kvm_get_rflags(vcpu);
4137 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4403 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4138 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4404 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4139 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4405 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4157,7 +4423,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4157 kvm_set_cr3(vcpu, tss->cr3); 4423 kvm_set_cr3(vcpu, tss->cr3);
4158 4424
4159 kvm_rip_write(vcpu, tss->eip); 4425 kvm_rip_write(vcpu, tss->eip);
4160 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4426 kvm_set_rflags(vcpu, tss->eflags | 2);
4161 4427
4162 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4428 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4163 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4429 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4195,7 +4461,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4195 struct tss_segment_16 *tss) 4461 struct tss_segment_16 *tss)
4196{ 4462{
4197 tss->ip = kvm_rip_read(vcpu); 4463 tss->ip = kvm_rip_read(vcpu);
4198 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4464 tss->flag = kvm_get_rflags(vcpu);
4199 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4465 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4200 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4466 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4201 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4467 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4210,14 +4476,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4210 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4476 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4211 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4477 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4212 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4478 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4213 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4214} 4479}
4215 4480
4216static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4481static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4217 struct tss_segment_16 *tss) 4482 struct tss_segment_16 *tss)
4218{ 4483{
4219 kvm_rip_write(vcpu, tss->ip); 4484 kvm_rip_write(vcpu, tss->ip);
4220 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4485 kvm_set_rflags(vcpu, tss->flag | 2);
4221 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4486 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4222 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4487 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4223 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4488 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4363,8 +4628,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4363 } 4628 }
4364 4629
4365 if (reason == TASK_SWITCH_IRET) { 4630 if (reason == TASK_SWITCH_IRET) {
4366 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4631 u32 eflags = kvm_get_rflags(vcpu);
4367 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4632 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4368 } 4633 }
4369 4634
4370 /* set back link to prev task only if NT bit is set in eflags 4635 /* set back link to prev task only if NT bit is set in eflags
@@ -4372,11 +4637,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4372 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4637 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4373 old_tss_sel = 0xffff; 4638 old_tss_sel = 0xffff;
4374 4639
4375 /* set back link to prev task only if NT bit is set in eflags
4376 note that old_tss_sel is not used afetr this point */
4377 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4378 old_tss_sel = 0xffff;
4379
4380 if (nseg_desc.type & 8) 4640 if (nseg_desc.type & 8)
4381 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4641 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4382 old_tss_base, &nseg_desc); 4642 old_tss_base, &nseg_desc);
@@ -4385,8 +4645,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4385 old_tss_base, &nseg_desc); 4645 old_tss_base, &nseg_desc);
4386 4646
4387 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4647 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4388 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4648 u32 eflags = kvm_get_rflags(vcpu);
4389 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4649 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4390 } 4650 }
4391 4651
4392 if (reason != TASK_SWITCH_IRET) { 4652 if (reason != TASK_SWITCH_IRET) {
@@ -4438,8 +4698,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4438 4698
4439 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4699 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4440 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4700 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4441 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4701 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4442 load_pdptrs(vcpu, vcpu->arch.cr3); 4702 load_pdptrs(vcpu, vcpu->arch.cr3);
4703 mmu_reset_needed = 1;
4704 }
4443 4705
4444 if (mmu_reset_needed) 4706 if (mmu_reset_needed)
4445 kvm_mmu_reset_context(vcpu); 4707 kvm_mmu_reset_context(vcpu);
@@ -4480,12 +4742,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4480int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4742int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4481 struct kvm_guest_debug *dbg) 4743 struct kvm_guest_debug *dbg)
4482{ 4744{
4745 unsigned long rflags;
4483 int i, r; 4746 int i, r;
4484 4747
4485 vcpu_load(vcpu); 4748 vcpu_load(vcpu);
4486 4749
4487 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4750 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4488 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4751 r = -EBUSY;
4752 if (vcpu->arch.exception.pending)
4753 goto unlock_out;
4754 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4755 kvm_queue_exception(vcpu, DB_VECTOR);
4756 else
4757 kvm_queue_exception(vcpu, BP_VECTOR);
4758 }
4759
4760 /*
4761 * Read rflags as long as potentially injected trace flags are still
4762 * filtered out.
4763 */
4764 rflags = kvm_get_rflags(vcpu);
4765
4766 vcpu->guest_debug = dbg->control;
4767 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4768 vcpu->guest_debug = 0;
4769
4770 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4489 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4771 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4490 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4772 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4491 vcpu->arch.switch_db_regs = 4773 vcpu->arch.switch_db_regs =
@@ -4496,13 +4778,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4496 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4778 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4497 } 4779 }
4498 4780
4499 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4781 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4782 vcpu->arch.singlestep_cs =
4783 get_segment_selector(vcpu, VCPU_SREG_CS);
4784 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4785 }
4786
4787 /*
4788 * Trigger an rflags update that will inject or remove the trace
4789 * flags.
4790 */
4791 kvm_set_rflags(vcpu, rflags);
4792
4793 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4500 4794
4501 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4795 r = 0;
4502 kvm_queue_exception(vcpu, DB_VECTOR);
4503 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4504 kvm_queue_exception(vcpu, BP_VECTOR);
4505 4796
4797unlock_out:
4506 vcpu_put(vcpu); 4798 vcpu_put(vcpu);
4507 4799
4508 return r; 4800 return r;
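
With the rework above, KVM_SET_GUEST_DEBUG no longer writes TF/RF into the guest-visible flags directly; it records the current CS:RIP and lets kvm_set_rflags()/kvm_get_rflags() inject and filter the trace flags. From userspace the ioctl is driven as before. A minimal sketch, assuming the struct kvm_guest_debug layout from the uapi header and an open vcpu_fd (illustrative only, not part of the patch):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Requires KVM_CAP_SET_GUEST_DEBUG. */
static int enable_single_step(int vcpu_fd)
{
        struct kvm_guest_debug dbg;

        memset(&dbg, 0, sizeof(dbg));
        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
        /* hardware breakpoints would additionally set KVM_GUESTDBG_USE_HW_BP
         * and fill dbg.arch.debugreg[] */
        return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}
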
@@ -4703,14 +4995,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4703 return kvm_x86_ops->vcpu_reset(vcpu); 4995 return kvm_x86_ops->vcpu_reset(vcpu);
4704} 4996}
4705 4997
4706void kvm_arch_hardware_enable(void *garbage) 4998int kvm_arch_hardware_enable(void *garbage)
4707{ 4999{
4708 kvm_x86_ops->hardware_enable(garbage); 5000 /*
5001	 * Since this may be called from a hotplug notification,
5002 * we can't get the CPU frequency directly.
5003 */
5004 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5005 int cpu = raw_smp_processor_id();
5006 per_cpu(cpu_tsc_khz, cpu) = 0;
5007 }
5008
5009 kvm_shared_msr_cpu_online();
5010
5011 return kvm_x86_ops->hardware_enable(garbage);
4709} 5012}
4710 5013
4711void kvm_arch_hardware_disable(void *garbage) 5014void kvm_arch_hardware_disable(void *garbage)
4712{ 5015{
4713 kvm_x86_ops->hardware_disable(garbage); 5016 kvm_x86_ops->hardware_disable(garbage);
5017 drop_user_return_notifiers(garbage);
4714} 5018}
4715 5019
4716int kvm_arch_hardware_setup(void) 5020int kvm_arch_hardware_setup(void)
@@ -4948,8 +5252,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4948 return kvm_x86_ops->interrupt_allowed(vcpu); 5252 return kvm_x86_ops->interrupt_allowed(vcpu);
4949} 5253}
4950 5254
5255unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5256{
5257 unsigned long rflags;
5258
5259 rflags = kvm_x86_ops->get_rflags(vcpu);
5260 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5261 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5262 return rflags;
5263}
5264EXPORT_SYMBOL_GPL(kvm_get_rflags);
5265
5266void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5267{
5268 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5269 vcpu->arch.singlestep_cs ==
5270 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5271 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5272 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5273 kvm_x86_ops->set_rflags(vcpu, rflags);
5274}
5275EXPORT_SYMBOL_GPL(kvm_set_rflags);
5276
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5277EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5278EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5279EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4954EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5280EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4955EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f8f8900fc5ec..2d241da07236 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,12 +14,76 @@
14 14
15#define KVM_API_VERSION 12 15#define KVM_API_VERSION 12
16 16
17/* for KVM_TRACE_ENABLE, deprecated */ 17/* *** Deprecated interfaces *** */
18
19#define KVM_TRC_SHIFT 16
20
21#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT)
22#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1))
23
24#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01)
25#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02)
26#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01)
27
28#define KVM_TRC_HEAD_SIZE 12
29#define KVM_TRC_CYCLE_SIZE 8
30#define KVM_TRC_EXTRA_MAX 7
31
32#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
33#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
34#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
35#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
36#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
37#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
38#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
39#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
40#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
41#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
42#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
43#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
44#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
45#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
46#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
47#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
48#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
49#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
50#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
51#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
52#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
53#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
54#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
55#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
56
18struct kvm_user_trace_setup { 57struct kvm_user_trace_setup {
19 __u32 buf_size; /* sub_buffer size of each per-cpu */ 58 __u32 buf_size;
20 __u32 buf_nr; /* the number of sub_buffers of each per-cpu */ 59 __u32 buf_nr;
60};
61
62#define __KVM_DEPRECATED_MAIN_W_0x06 \
63 _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
64#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
65#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
66
67#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
68
69struct kvm_breakpoint {
70 __u32 enabled;
71 __u32 padding;
72 __u64 address;
73};
74
75struct kvm_debug_guest {
76 __u32 enabled;
77 __u32 pad;
78 struct kvm_breakpoint breakpoints[4];
79 __u32 singlestep;
21}; 80};
22 81
82#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
83
84/* *** End of deprecated interfaces *** */
85
86
23/* for KVM_CREATE_MEMORY_REGION */ 87/* for KVM_CREATE_MEMORY_REGION */
24struct kvm_memory_region { 88struct kvm_memory_region {
25 __u32 slot; 89 __u32 slot;
@@ -99,6 +163,7 @@ struct kvm_pit_config {
99 163
100/* For KVM_EXIT_INTERNAL_ERROR */ 164/* For KVM_EXIT_INTERNAL_ERROR */
101#define KVM_INTERNAL_ERROR_EMULATION 1 165#define KVM_INTERNAL_ERROR_EMULATION 1
166#define KVM_INTERNAL_ERROR_SIMUL_EX 2
102 167
103/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 168/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
104struct kvm_run { 169struct kvm_run {
@@ -116,6 +181,11 @@ struct kvm_run {
116 __u64 cr8; 181 __u64 cr8;
117 __u64 apic_base; 182 __u64 apic_base;
118 183
184#ifdef __KVM_S390
185 /* the processor status word for s390 */
186 __u64 psw_mask; /* psw upper half */
187 __u64 psw_addr; /* psw lower half */
188#endif
119 union { 189 union {
120 /* KVM_EXIT_UNKNOWN */ 190 /* KVM_EXIT_UNKNOWN */
121 struct { 191 struct {
@@ -167,8 +237,6 @@ struct kvm_run {
167 /* KVM_EXIT_S390_SIEIC */ 237 /* KVM_EXIT_S390_SIEIC */
168 struct { 238 struct {
169 __u8 icptcode; 239 __u8 icptcode;
170 __u64 mask; /* psw upper half */
171 __u64 addr; /* psw lower half */
172 __u16 ipa; 240 __u16 ipa;
173 __u32 ipb; 241 __u32 ipb;
174 } s390_sieic; 242 } s390_sieic;
@@ -187,6 +255,9 @@ struct kvm_run {
187 } dcr; 255 } dcr;
188 struct { 256 struct {
189 __u32 suberror; 257 __u32 suberror;
258 /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
259 __u32 ndata;
260 __u64 data[16];
190 } internal; 261 } internal;
191 /* Fix the size of the union. */ 262 /* Fix the size of the union. */
192 char padding[256]; 263 char padding[256];
@@ -329,24 +400,6 @@ struct kvm_ioeventfd {
329 __u8 pad[36]; 400 __u8 pad[36];
330}; 401};
331 402
332#define KVM_TRC_SHIFT 16
333/*
334 * kvm trace categories
335 */
336#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT)
337#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) /* only 12 bits */
338
339/*
340 * kvm trace action
341 */
342#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01)
343#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02)
344#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01)
345
346#define KVM_TRC_HEAD_SIZE 12
347#define KVM_TRC_CYCLE_SIZE 8
348#define KVM_TRC_EXTRA_MAX 7
349
350#define KVMIO 0xAE 403#define KVMIO 0xAE
351 404
352/* 405/*
@@ -367,12 +420,10 @@ struct kvm_ioeventfd {
367 */ 420 */
368#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ 421#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
369#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) 422#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
370/* 423#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06
371 * ioctls for kvm trace 424#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
372 */ 425#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
373#define KVM_TRACE_ENABLE _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) 426
374#define KVM_TRACE_PAUSE _IO(KVMIO, 0x07)
375#define KVM_TRACE_DISABLE _IO(KVMIO, 0x08)
376/* 427/*
377 * Extension capability list. 428 * Extension capability list.
378 */ 429 */
@@ -436,6 +487,15 @@ struct kvm_ioeventfd {
436#endif 487#endif
437#define KVM_CAP_IOEVENTFD 36 488#define KVM_CAP_IOEVENTFD 36
438#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 489#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
490#ifdef __KVM_HAVE_XEN_HVM
491#define KVM_CAP_XEN_HVM 38
492#endif
493#define KVM_CAP_ADJUST_CLOCK 39
494#define KVM_CAP_INTERNAL_ERROR_DATA 40
495#ifdef __KVM_HAVE_VCPU_EVENTS
496#define KVM_CAP_VCPU_EVENTS 41
497#endif
498#define KVM_CAP_S390_PSW 42
439 499
440#ifdef KVM_CAP_IRQ_ROUTING 500#ifdef KVM_CAP_IRQ_ROUTING
441 501
@@ -488,6 +548,18 @@ struct kvm_x86_mce {
488}; 548};
489#endif 549#endif
490 550
551#ifdef KVM_CAP_XEN_HVM
552struct kvm_xen_hvm_config {
553 __u32 flags;
554 __u32 msr;
555 __u64 blob_addr_32;
556 __u64 blob_addr_64;
557 __u8 blob_size_32;
558 __u8 blob_size_64;
559 __u8 pad2[30];
560};
561#endif
562
491#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) 563#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
492 564
493struct kvm_irqfd { 565struct kvm_irqfd {
@@ -497,55 +569,66 @@ struct kvm_irqfd {
497 __u8 pad[20]; 569 __u8 pad[20];
498}; 570};
499 571
572struct kvm_clock_data {
573 __u64 clock;
574 __u32 flags;
575 __u32 pad[9];
576};
577
500/* 578/*
501 * ioctls for VM fds 579 * ioctls for VM fds
502 */ 580 */
503#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 581#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
504/* 582/*
505 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 583 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
506 * a vcpu fd. 584 * a vcpu fd.
507 */ 585 */
508#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 586#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
509#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 587#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
510#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 588#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
511#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) 589#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
512#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) 590#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
513#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ 591#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
514 struct kvm_userspace_memory_region) 592 struct kvm_userspace_memory_region)
515#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) 593#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
516#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) 594#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
517/* Device model IOC */ 595/* Device model IOC */
518#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 596#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
519#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 597#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
520#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) 598#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip)
521#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) 599#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip)
522#define KVM_CREATE_PIT _IO(KVMIO, 0x64) 600#define KVM_CREATE_PIT _IO(KVMIO, 0x64)
523#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) 601#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state)
524#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) 602#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state)
525#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) 603#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level)
526#define KVM_REGISTER_COALESCED_MMIO \ 604#define KVM_REGISTER_COALESCED_MMIO \
527 _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) 605 _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
528#define KVM_UNREGISTER_COALESCED_MMIO \ 606#define KVM_UNREGISTER_COALESCED_MMIO \
529 _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) 607 _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
530#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ 608#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
531 struct kvm_assigned_pci_dev) 609 struct kvm_assigned_pci_dev)
532#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) 610#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
533/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ 611/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
534#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ 612#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70
535 struct kvm_assigned_irq) 613#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
536#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) 614#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
537#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) 615#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
538#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ 616 struct kvm_assigned_pci_dev)
539 struct kvm_assigned_pci_dev) 617#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \
540#define KVM_ASSIGN_SET_MSIX_NR \ 618 struct kvm_assigned_msix_nr)
541 _IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr) 619#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \
542#define KVM_ASSIGN_SET_MSIX_ENTRY \ 620 struct kvm_assigned_msix_entry)
543 _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) 621#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
544#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) 622#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd)
545#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) 623#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
546#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) 624#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
547#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) 625#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
548#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) 626#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
627#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
628#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
629/* Available with KVM_CAP_PIT_STATE2 */
630#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
631#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
549 632
550/* 633/*
551 * ioctls for vcpu fds 634 * ioctls for vcpu fds
@@ -558,7 +641,7 @@ struct kvm_irqfd {
558#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) 641#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
559#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) 642#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
560/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ 643/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
561#define KVM_DEBUG_GUEST __KVM_DEPRECATED_DEBUG_GUEST 644#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87
562#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) 645#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
563#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) 646#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
564#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) 647#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
@@ -570,7 +653,7 @@ struct kvm_irqfd {
570#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) 653#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
571#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) 654#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
572/* Available with KVM_CAP_VAPIC */ 655/* Available with KVM_CAP_VAPIC */
573#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) 656#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
574/* Available with KVM_CAP_VAPIC */ 657/* Available with KVM_CAP_VAPIC */
575#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) 658#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
576/* valid for virtual machine (for floating interrupt)_and_ vcpu */ 659/* valid for virtual machine (for floating interrupt)_and_ vcpu */
@@ -582,66 +665,23 @@ struct kvm_irqfd {
582/* initial ipl psw for s390 */ 665/* initial ipl psw for s390 */
583#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw) 666#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw)
584/* initial reset for s390 */ 667/* initial reset for s390 */
585#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) 668#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97)
586#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) 669#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
587#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) 670#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
588/* Available with KVM_CAP_NMI */ 671/* Available with KVM_CAP_NMI */
589#define KVM_NMI _IO(KVMIO, 0x9a) 672#define KVM_NMI _IO(KVMIO, 0x9a)
590/* Available with KVM_CAP_SET_GUEST_DEBUG */ 673/* Available with KVM_CAP_SET_GUEST_DEBUG */
591#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) 674#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
592/* MCE for x86 */ 675/* MCE for x86 */
593#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) 676#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
594#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) 677#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
595#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) 678#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
596 679/* IA64 stack access */
597/*
598 * Deprecated interfaces
599 */
600struct kvm_breakpoint {
601 __u32 enabled;
602 __u32 padding;
603 __u64 address;
604};
605
606struct kvm_debug_guest {
607 __u32 enabled;
608 __u32 pad;
609 struct kvm_breakpoint breakpoints[4];
610 __u32 singlestep;
611};
612
613#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest)
614
615#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) 680#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *)
616#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) 681#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *)
617 682/* Available with KVM_CAP_VCPU_EVENTS */
618#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) 683#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
619#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) 684#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
620
621#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
622#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
623#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
624#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
625#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
626#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
627#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
628#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
629#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
630#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
631#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
632#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
633#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
634#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
635#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
636#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
637#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
638#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
639#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
640#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
641#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
642#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
643#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
644#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
645 685
646#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 686#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
647 687
@@ -696,4 +736,4 @@ struct kvm_assigned_msix_entry {
696 __u16 padding[3]; 736 __u16 padding[3];
697}; 737};
698 738
699#endif 739#endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5ddd7ae..bd5a616d9373 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -120,7 +120,7 @@ struct kvm_kernel_irq_routing_entry {
120 u32 gsi; 120 u32 gsi;
121 u32 type; 121 u32 type;
122 int (*set)(struct kvm_kernel_irq_routing_entry *e, 122 int (*set)(struct kvm_kernel_irq_routing_entry *e,
123 struct kvm *kvm, int level); 123 struct kvm *kvm, int irq_source_id, int level);
124 union { 124 union {
125 struct { 125 struct {
126 unsigned irqchip; 126 unsigned irqchip;
@@ -128,9 +128,28 @@ struct kvm_kernel_irq_routing_entry {
128 } irqchip; 128 } irqchip;
129 struct msi_msg msi; 129 struct msi_msg msi;
130 }; 130 };
131 struct list_head link; 131 struct hlist_node link;
132};
133
134#ifdef __KVM_HAVE_IOAPIC
135
136struct kvm_irq_routing_table {
137 int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
138 struct kvm_kernel_irq_routing_entry *rt_entries;
139 u32 nr_rt_entries;
140 /*
141 * Array indexed by gsi. Each entry contains list of irq chips
142 * the gsi is connected to.
143 */
144 struct hlist_head map[0];
132}; 145};
133 146
147#else
148
149struct kvm_irq_routing_table {};
150
151#endif
152
134struct kvm { 153struct kvm {
135 spinlock_t mmu_lock; 154 spinlock_t mmu_lock;
136 spinlock_t requests_lock; 155 spinlock_t requests_lock;
@@ -166,8 +185,9 @@ struct kvm {
166 185
167 struct mutex irq_lock; 186 struct mutex irq_lock;
168#ifdef CONFIG_HAVE_KVM_IRQCHIP 187#ifdef CONFIG_HAVE_KVM_IRQCHIP
169 struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ 188 struct kvm_irq_routing_table *irq_routing;
170 struct hlist_head mask_notifier_list; 189 struct hlist_head mask_notifier_list;
190 struct hlist_head irq_ack_notifier_list;
171#endif 191#endif
172 192
173#ifdef KVM_ARCH_WANT_MMU_NOTIFIER 193#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
@@ -266,6 +286,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
266void mark_page_dirty(struct kvm *kvm, gfn_t gfn); 286void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
267 287
268void kvm_vcpu_block(struct kvm_vcpu *vcpu); 288void kvm_vcpu_block(struct kvm_vcpu *vcpu);
289void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
269void kvm_resched(struct kvm_vcpu *vcpu); 290void kvm_resched(struct kvm_vcpu *vcpu);
270void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 291void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
271void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 292void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
@@ -325,7 +346,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
325void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); 346void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
326 347
327int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); 348int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
328void kvm_arch_hardware_enable(void *garbage); 349int kvm_arch_hardware_enable(void *garbage);
329void kvm_arch_hardware_disable(void *garbage); 350void kvm_arch_hardware_disable(void *garbage);
330int kvm_arch_hardware_setup(void); 351int kvm_arch_hardware_setup(void);
331void kvm_arch_hardware_unsetup(void); 352void kvm_arch_hardware_unsetup(void);
@@ -390,7 +411,12 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
390 struct kvm_irq_mask_notifier *kimn); 411 struct kvm_irq_mask_notifier *kimn);
391void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); 412void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
392 413
393int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); 414#ifdef __KVM_HAVE_IOAPIC
415void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
416 union kvm_ioapic_redirect_entry *entry,
417 unsigned long *deliver_bitmask);
418#endif
419int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
394void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 420void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
395void kvm_register_irq_ack_notifier(struct kvm *kvm, 421void kvm_register_irq_ack_notifier(struct kvm *kvm,
396 struct kvm_irq_ack_notifier *kian); 422 struct kvm_irq_ack_notifier *kian);
@@ -552,4 +578,21 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
552 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; 578 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
553} 579}
554#endif 580#endif
581
582#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
583
584long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
585 unsigned long arg);
586
587#else
588
589static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
590 unsigned long arg)
591{
592 return -ENOTTY;
593}
594
555#endif 595#endif
596
597#endif
598
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 000000000000..9c4a445bb43c
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,49 @@
1#ifndef _LINUX_USER_RETURN_NOTIFIER_H
2#define _LINUX_USER_RETURN_NOTIFIER_H
3
4#ifdef CONFIG_USER_RETURN_NOTIFIER
5
6#include <linux/list.h>
7#include <linux/sched.h>
8
9struct user_return_notifier {
10 void (*on_user_return)(struct user_return_notifier *urn);
11 struct hlist_node link;
12};
13
14
15void user_return_notifier_register(struct user_return_notifier *urn);
16void user_return_notifier_unregister(struct user_return_notifier *urn);
17
18static inline void propagate_user_return_notify(struct task_struct *prev,
19 struct task_struct *next)
20{
21 if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
22 clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
23 set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
24 }
25}
26
27void fire_user_return_notifiers(void);
28
29static inline void clear_user_return_notifier(struct task_struct *p)
30{
31 clear_tsk_thread_flag(p, TIF_USER_RETURN_NOTIFY);
32}
33
34#else
35
36struct user_return_notifier {};
37
38static inline void propagate_user_return_notify(struct task_struct *prev,
39 struct task_struct *next)
40{
41}
42
43static inline void fire_user_return_notifiers(void) {}
44
45static inline void clear_user_return_notifier(struct task_struct *p) {}
46
47#endif
48
49#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 9943202b4355..864ff75d65f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o 99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
102 103
103ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
104# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d6f121bbe8a..edeff9ceaab9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 goto out; 250 goto out;
250 251
251 setup_thread_stack(tsk, orig); 252 setup_thread_stack(tsk, orig);
253 clear_user_return_notifier(tsk);
252 stackend = end_of_stack(tsk); 254 stackend = end_of_stack(tsk);
253 *stackend = STACK_END_MAGIC; /* for overflow detection */ 255 *stackend = STACK_END_MAGIC; /* for overflow detection */
254 256
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..03e2d6fd9b18
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
1
2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h>
4#include <linux/sched.h>
5#include <linux/module.h>
6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8
9#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
10
11/*
12 * Request a notification when the current cpu returns to userspace. Must be
13 * called in atomic context. The notifier will also be called in atomic
14 * context.
15 */
16void user_return_notifier_register(struct user_return_notifier *urn)
17{
18 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
19 hlist_add_head(&urn->link, &URN_LIST_HEAD);
20}
21EXPORT_SYMBOL_GPL(user_return_notifier_register);
22
23/*
24 * Removes a registered user return notifier. Must be called from atomic
25 * context, and from the same cpu registration occured in.
26 */
27void user_return_notifier_unregister(struct user_return_notifier *urn)
28{
29 hlist_del(&urn->link);
30 if (hlist_empty(&URN_LIST_HEAD))
31 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
32}
33EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34
35/* Calls registered user return notifiers */
36void fire_user_return_notifiers(void)
37{
38 struct user_return_notifier *urn;
39 struct hlist_node *tmp1, *tmp2;
40 struct hlist_head *head;
41
42 head = &get_cpu_var(return_notifier_list);
43 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
44 urn->on_user_return(urn);
45 put_cpu_var(return_notifier_list);
46}
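fire_user_return_notifiers() is meant to be invoked by architecture code on the way out to userspace when TIF_USER_RETURN_NOTIFY is set; the arch/x86/kernel/signal.c and thread_info.h changes in this series wire that up. A rough sketch of the call site follows, assuming the usual x86 do_notify_resume() flag handling (the _TIF_USER_RETURN_NOTIFY mask name is taken from the thread_info change and may differ in detail).

void do_notify_resume(struct pt_regs *regs, void *unused,
		      __u32 thread_info_flags)
{
	/* ... existing TIF_SIGPENDING / TIF_NOTIFY_RESUME handling ... */

	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();
}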
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
new file mode 100644
index 000000000000..fd9c097b760a
--- /dev/null
+++ b/virt/kvm/assigned-dev.c
@@ -0,0 +1,818 @@
1/*
2 * Kernel-based Virtual Machine - device assignment support
3 *
4 * Copyright (C) 2006-9 Red Hat, Inc
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 */
10
11#include <linux/kvm_host.h>
12#include <linux/kvm.h>
13#include <linux/uaccess.h>
14#include <linux/vmalloc.h>
15#include <linux/errno.h>
16#include <linux/spinlock.h>
17#include <linux/pci.h>
18#include <linux/interrupt.h>
19#include "irq.h"
20
21static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
22 int assigned_dev_id)
23{
24 struct list_head *ptr;
25 struct kvm_assigned_dev_kernel *match;
26
27 list_for_each(ptr, head) {
28 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
29 if (match->assigned_dev_id == assigned_dev_id)
30 return match;
31 }
32 return NULL;
33}
34
35static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
36 *assigned_dev, int irq)
37{
38 int i, index;
39 struct msix_entry *host_msix_entries;
40
41 host_msix_entries = assigned_dev->host_msix_entries;
42
43 index = -1;
44 for (i = 0; i < assigned_dev->entries_nr; i++)
45 if (irq == host_msix_entries[i].vector) {
46 index = i;
47 break;
48 }
49 if (index < 0) {
50 printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");
51 return 0;
52 }
53
54 return index;
55}
56
57static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
58{
59 struct kvm_assigned_dev_kernel *assigned_dev;
60 struct kvm *kvm;
61 int i;
62
63 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
64 interrupt_work);
65 kvm = assigned_dev->kvm;
66
67 spin_lock_irq(&assigned_dev->assigned_dev_lock);
68 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
69 struct kvm_guest_msix_entry *guest_entries =
70 assigned_dev->guest_msix_entries;
71 for (i = 0; i < assigned_dev->entries_nr; i++) {
72 if (!(guest_entries[i].flags &
73 KVM_ASSIGNED_MSIX_PENDING))
74 continue;
75 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
76 kvm_set_irq(assigned_dev->kvm,
77 assigned_dev->irq_source_id,
78 guest_entries[i].vector, 1);
79 }
80 } else
81 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
82 assigned_dev->guest_irq, 1);
83
84 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
85}
86
87static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
88{
89 unsigned long flags;
90 struct kvm_assigned_dev_kernel *assigned_dev =
91 (struct kvm_assigned_dev_kernel *) dev_id;
92
93 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
94 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
95 int index = find_index_from_host_irq(assigned_dev, irq);
96 if (index < 0)
97 goto out;
98 assigned_dev->guest_msix_entries[index].flags |=
99 KVM_ASSIGNED_MSIX_PENDING;
100 }
101
102 schedule_work(&assigned_dev->interrupt_work);
103
104 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
105 disable_irq_nosync(irq);
106 assigned_dev->host_irq_disabled = true;
107 }
108
109out:
110 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
111 return IRQ_HANDLED;
112}
113
114/* Ack the irq line for an assigned device */
115static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
116{
117 struct kvm_assigned_dev_kernel *dev;
118 unsigned long flags;
119
120 if (kian->gsi == -1)
121 return;
122
123 dev = container_of(kian, struct kvm_assigned_dev_kernel,
124 ack_notifier);
125
126 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
127
128 /* The guest irq may be shared so this ack may be
129 * from another device.
130 */
131 spin_lock_irqsave(&dev->assigned_dev_lock, flags);
132 if (dev->host_irq_disabled) {
133 enable_irq(dev->host_irq);
134 dev->host_irq_disabled = false;
135 }
136 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
137}
138
139static void deassign_guest_irq(struct kvm *kvm,
140 struct kvm_assigned_dev_kernel *assigned_dev)
141{
142 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
143 assigned_dev->ack_notifier.gsi = -1;
144
145 if (assigned_dev->irq_source_id != -1)
146 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
147 assigned_dev->irq_source_id = -1;
148 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
149}
150
151/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
152static void deassign_host_irq(struct kvm *kvm,
153 struct kvm_assigned_dev_kernel *assigned_dev)
154{
155 /*
156 * In kvm_free_device_irq, cancel_work_sync() returns true if:
157 * 1. the work was scheduled and then cancelled, or
158 * 2. the work callback was executed.
159 *
160 * The first case guarantees the irq is disabled and no more events
161 * will arrive. In the second case the irq may still be enabled (e.g.
162 * for MSI), so we disable it here to prevent further events.
163 *
164 * Note this may result in a nested disable if the interrupt type is
165 * INTx, but that is fine since we are about to free it.
166 *
167 * If this function is part of VM destruction, make sure the kvm
168 * state is still valid at this point, since we may also have to
169 * wait for interrupt_work to complete.
170 */
171 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
172 int i;
173 for (i = 0; i < assigned_dev->entries_nr; i++)
174 disable_irq_nosync(assigned_dev->
175 host_msix_entries[i].vector);
176
177 cancel_work_sync(&assigned_dev->interrupt_work);
178
179 for (i = 0; i < assigned_dev->entries_nr; i++)
180 free_irq(assigned_dev->host_msix_entries[i].vector,
181 (void *)assigned_dev);
182
183 assigned_dev->entries_nr = 0;
184 kfree(assigned_dev->host_msix_entries);
185 kfree(assigned_dev->guest_msix_entries);
186 pci_disable_msix(assigned_dev->dev);
187 } else {
188 /* Deal with MSI and INTx */
189 disable_irq_nosync(assigned_dev->host_irq);
190 cancel_work_sync(&assigned_dev->interrupt_work);
191
192 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
193
194 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
195 pci_disable_msi(assigned_dev->dev);
196 }
197
198 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
199}
200
201static int kvm_deassign_irq(struct kvm *kvm,
202 struct kvm_assigned_dev_kernel *assigned_dev,
203 unsigned long irq_requested_type)
204{
205 unsigned long guest_irq_type, host_irq_type;
206
207 if (!irqchip_in_kernel(kvm))
208 return -EINVAL;
209 /* no irq assignment to deassign */
210 if (!assigned_dev->irq_requested_type)
211 return -ENXIO;
212
213 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
214 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
215
216 if (host_irq_type)
217 deassign_host_irq(kvm, assigned_dev);
218 if (guest_irq_type)
219 deassign_guest_irq(kvm, assigned_dev);
220
221 return 0;
222}
223
224static void kvm_free_assigned_irq(struct kvm *kvm,
225 struct kvm_assigned_dev_kernel *assigned_dev)
226{
227 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
228}
229
230static void kvm_free_assigned_device(struct kvm *kvm,
231 struct kvm_assigned_dev_kernel
232 *assigned_dev)
233{
234 kvm_free_assigned_irq(kvm, assigned_dev);
235
236 pci_reset_function(assigned_dev->dev);
237
238 pci_release_regions(assigned_dev->dev);
239 pci_disable_device(assigned_dev->dev);
240 pci_dev_put(assigned_dev->dev);
241
242 list_del(&assigned_dev->list);
243 kfree(assigned_dev);
244}
245
246void kvm_free_all_assigned_devices(struct kvm *kvm)
247{
248 struct list_head *ptr, *ptr2;
249 struct kvm_assigned_dev_kernel *assigned_dev;
250
251 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
252 assigned_dev = list_entry(ptr,
253 struct kvm_assigned_dev_kernel,
254 list);
255
256 kvm_free_assigned_device(kvm, assigned_dev);
257 }
258}
259
260static int assigned_device_enable_host_intx(struct kvm *kvm,
261 struct kvm_assigned_dev_kernel *dev)
262{
263 dev->host_irq = dev->dev->irq;
264 /* Even though this is PCI, we don't want to use shared
265 * interrupts. Sharing host devices with guest-assigned devices
266 * on the same interrupt line is not a happy situation: there
267 * are going to be long delays in accepting, acking, etc.
268 */
269 if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
270 0, "kvm_assigned_intx_device", (void *)dev))
271 return -EIO;
272 return 0;
273}
274
275#ifdef __KVM_HAVE_MSI
276static int assigned_device_enable_host_msi(struct kvm *kvm,
277 struct kvm_assigned_dev_kernel *dev)
278{
279 int r;
280
281 if (!dev->dev->msi_enabled) {
282 r = pci_enable_msi(dev->dev);
283 if (r)
284 return r;
285 }
286
287 dev->host_irq = dev->dev->irq;
288 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
289 "kvm_assigned_msi_device", (void *)dev)) {
290 pci_disable_msi(dev->dev);
291 return -EIO;
292 }
293
294 return 0;
295}
296#endif
297
298#ifdef __KVM_HAVE_MSIX
299static int assigned_device_enable_host_msix(struct kvm *kvm,
300 struct kvm_assigned_dev_kernel *dev)
301{
302 int i, r = -EINVAL;
303
304 /* host_msix_entries and guest_msix_entries should have been
305 * initialized */
306 if (dev->entries_nr == 0)
307 return r;
308
309 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
310 if (r)
311 return r;
312
313 for (i = 0; i < dev->entries_nr; i++) {
314 r = request_irq(dev->host_msix_entries[i].vector,
315 kvm_assigned_dev_intr, 0,
316 "kvm_assigned_msix_device",
317 (void *)dev);
318 /* FIXME: free requested_irq's on failure */
319 if (r)
320 return r;
321 }
322
323 return 0;
324}
325
326#endif
327
328static int assigned_device_enable_guest_intx(struct kvm *kvm,
329 struct kvm_assigned_dev_kernel *dev,
330 struct kvm_assigned_irq *irq)
331{
332 dev->guest_irq = irq->guest_irq;
333 dev->ack_notifier.gsi = irq->guest_irq;
334 return 0;
335}
336
337#ifdef __KVM_HAVE_MSI
338static int assigned_device_enable_guest_msi(struct kvm *kvm,
339 struct kvm_assigned_dev_kernel *dev,
340 struct kvm_assigned_irq *irq)
341{
342 dev->guest_irq = irq->guest_irq;
343 dev->ack_notifier.gsi = -1;
344 dev->host_irq_disabled = false;
345 return 0;
346}
347#endif
348
349#ifdef __KVM_HAVE_MSIX
350static int assigned_device_enable_guest_msix(struct kvm *kvm,
351 struct kvm_assigned_dev_kernel *dev,
352 struct kvm_assigned_irq *irq)
353{
354 dev->guest_irq = irq->guest_irq;
355 dev->ack_notifier.gsi = -1;
356 dev->host_irq_disabled = false;
357 return 0;
358}
359#endif
360
361static int assign_host_irq(struct kvm *kvm,
362 struct kvm_assigned_dev_kernel *dev,
363 __u32 host_irq_type)
364{
365 int r = -EEXIST;
366
367 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
368 return r;
369
370 switch (host_irq_type) {
371 case KVM_DEV_IRQ_HOST_INTX:
372 r = assigned_device_enable_host_intx(kvm, dev);
373 break;
374#ifdef __KVM_HAVE_MSI
375 case KVM_DEV_IRQ_HOST_MSI:
376 r = assigned_device_enable_host_msi(kvm, dev);
377 break;
378#endif
379#ifdef __KVM_HAVE_MSIX
380 case KVM_DEV_IRQ_HOST_MSIX:
381 r = assigned_device_enable_host_msix(kvm, dev);
382 break;
383#endif
384 default:
385 r = -EINVAL;
386 }
387
388 if (!r)
389 dev->irq_requested_type |= host_irq_type;
390
391 return r;
392}
393
394static int assign_guest_irq(struct kvm *kvm,
395 struct kvm_assigned_dev_kernel *dev,
396 struct kvm_assigned_irq *irq,
397 unsigned long guest_irq_type)
398{
399 int id;
400 int r = -EEXIST;
401
402 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
403 return r;
404
405 id = kvm_request_irq_source_id(kvm);
406 if (id < 0)
407 return id;
408
409 dev->irq_source_id = id;
410
411 switch (guest_irq_type) {
412 case KVM_DEV_IRQ_GUEST_INTX:
413 r = assigned_device_enable_guest_intx(kvm, dev, irq);
414 break;
415#ifdef __KVM_HAVE_MSI
416 case KVM_DEV_IRQ_GUEST_MSI:
417 r = assigned_device_enable_guest_msi(kvm, dev, irq);
418 break;
419#endif
420#ifdef __KVM_HAVE_MSIX
421 case KVM_DEV_IRQ_GUEST_MSIX:
422 r = assigned_device_enable_guest_msix(kvm, dev, irq);
423 break;
424#endif
425 default:
426 r = -EINVAL;
427 }
428
429 if (!r) {
430 dev->irq_requested_type |= guest_irq_type;
431 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
432 } else
433 kvm_free_irq_source_id(kvm, dev->irq_source_id);
434
435 return r;
436}
437
438/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
439static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
440 struct kvm_assigned_irq *assigned_irq)
441{
442 int r = -EINVAL;
443 struct kvm_assigned_dev_kernel *match;
444 unsigned long host_irq_type, guest_irq_type;
445
446 if (!capable(CAP_SYS_RAWIO))
447 return -EPERM;
448
449 if (!irqchip_in_kernel(kvm))
450 return r;
451
452 mutex_lock(&kvm->lock);
453 r = -ENODEV;
454 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
455 assigned_irq->assigned_dev_id);
456 if (!match)
457 goto out;
458
459 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
460 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
461
462 r = -EINVAL;
463 /* can only assign one type at a time */
464 if (hweight_long(host_irq_type) > 1)
465 goto out;
466 if (hweight_long(guest_irq_type) > 1)
467 goto out;
468 if (host_irq_type == 0 && guest_irq_type == 0)
469 goto out;
470
471 r = 0;
472 if (host_irq_type)
473 r = assign_host_irq(kvm, match, host_irq_type);
474 if (r)
475 goto out;
476
477 if (guest_irq_type)
478 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
479out:
480 mutex_unlock(&kvm->lock);
481 return r;
482}
483
484static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
485 struct kvm_assigned_irq
486 *assigned_irq)
487{
488 int r = -ENODEV;
489 struct kvm_assigned_dev_kernel *match;
490
491 mutex_lock(&kvm->lock);
492
493 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
494 assigned_irq->assigned_dev_id);
495 if (!match)
496 goto out;
497
498 r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
499out:
500 mutex_unlock(&kvm->lock);
501 return r;
502}
503
504static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
505 struct kvm_assigned_pci_dev *assigned_dev)
506{
507 int r = 0;
508 struct kvm_assigned_dev_kernel *match;
509 struct pci_dev *dev;
510
511 down_read(&kvm->slots_lock);
512 mutex_lock(&kvm->lock);
513
514 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
515 assigned_dev->assigned_dev_id);
516 if (match) {
517 /* device already assigned */
518 r = -EEXIST;
519 goto out;
520 }
521
522 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
523 if (match == NULL) {
524 printk(KERN_INFO "%s: Couldn't allocate memory\n",
525 __func__);
526 r = -ENOMEM;
527 goto out;
528 }
529 dev = pci_get_bus_and_slot(assigned_dev->busnr,
530 assigned_dev->devfn);
531 if (!dev) {
532 printk(KERN_INFO "%s: host device not found\n", __func__);
533 r = -EINVAL;
534 goto out_free;
535 }
536 if (pci_enable_device(dev)) {
537 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
538 r = -EBUSY;
539 goto out_put;
540 }
541 r = pci_request_regions(dev, "kvm_assigned_device");
542 if (r) {
543 printk(KERN_INFO "%s: Could not get access to device regions\n",
544 __func__);
545 goto out_disable;
546 }
547
548 pci_reset_function(dev);
549
550 match->assigned_dev_id = assigned_dev->assigned_dev_id;
551 match->host_busnr = assigned_dev->busnr;
552 match->host_devfn = assigned_dev->devfn;
553 match->flags = assigned_dev->flags;
554 match->dev = dev;
555 spin_lock_init(&match->assigned_dev_lock);
556 match->irq_source_id = -1;
557 match->kvm = kvm;
558 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
559 INIT_WORK(&match->interrupt_work,
560 kvm_assigned_dev_interrupt_work_handler);
561
562 list_add(&match->list, &kvm->arch.assigned_dev_head);
563
564 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
565 if (!kvm->arch.iommu_domain) {
566 r = kvm_iommu_map_guest(kvm);
567 if (r)
568 goto out_list_del;
569 }
570 r = kvm_assign_device(kvm, match);
571 if (r)
572 goto out_list_del;
573 }
574
575out:
576 mutex_unlock(&kvm->lock);
577 up_read(&kvm->slots_lock);
578 return r;
579out_list_del:
580 list_del(&match->list);
581 pci_release_regions(dev);
582out_disable:
583 pci_disable_device(dev);
584out_put:
585 pci_dev_put(dev);
586out_free:
587 kfree(match);
588 mutex_unlock(&kvm->lock);
589 up_read(&kvm->slots_lock);
590 return r;
591}
592
593static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
594 struct kvm_assigned_pci_dev *assigned_dev)
595{
596 int r = 0;
597 struct kvm_assigned_dev_kernel *match;
598
599 mutex_lock(&kvm->lock);
600
601 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
602 assigned_dev->assigned_dev_id);
603 if (!match) {
604 printk(KERN_INFO "%s: device hasn't been assigned before, "
605 "so cannot be deassigned\n", __func__);
606 r = -EINVAL;
607 goto out;
608 }
609
610 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
611 kvm_deassign_device(kvm, match);
612
613 kvm_free_assigned_device(kvm, match);
614
615out:
616 mutex_unlock(&kvm->lock);
617 return r;
618}
619
620
621#ifdef __KVM_HAVE_MSIX
622static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
623 struct kvm_assigned_msix_nr *entry_nr)
624{
625 int r = 0;
626 struct kvm_assigned_dev_kernel *adev;
627
628 mutex_lock(&kvm->lock);
629
630 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
631 entry_nr->assigned_dev_id);
632 if (!adev) {
633 r = -EINVAL;
634 goto msix_nr_out;
635 }
636
637 if (adev->entries_nr == 0) {
638 adev->entries_nr = entry_nr->entry_nr;
639 if (adev->entries_nr == 0 ||
640 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
641 r = -EINVAL;
642 goto msix_nr_out;
643 }
644
645 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
646 entry_nr->entry_nr,
647 GFP_KERNEL);
648 if (!adev->host_msix_entries) {
649 r = -ENOMEM;
650 goto msix_nr_out;
651 }
652 adev->guest_msix_entries = kzalloc(
653 sizeof(struct kvm_guest_msix_entry) *
654 entry_nr->entry_nr, GFP_KERNEL);
655 if (!adev->guest_msix_entries) {
656 kfree(adev->host_msix_entries);
657 r = -ENOMEM;
658 goto msix_nr_out;
659 }
660 } else /* Not allowed to set MSI-X number twice */
661 r = -EINVAL;
662msix_nr_out:
663 mutex_unlock(&kvm->lock);
664 return r;
665}
666
667static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
668 struct kvm_assigned_msix_entry *entry)
669{
670 int r = 0, i;
671 struct kvm_assigned_dev_kernel *adev;
672
673 mutex_lock(&kvm->lock);
674
675 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
676 entry->assigned_dev_id);
677
678 if (!adev) {
679 r = -EINVAL;
680 goto msix_entry_out;
681 }
682
683 for (i = 0; i < adev->entries_nr; i++)
684 if (adev->guest_msix_entries[i].vector == 0 ||
685 adev->guest_msix_entries[i].entry == entry->entry) {
686 adev->guest_msix_entries[i].entry = entry->entry;
687 adev->guest_msix_entries[i].vector = entry->gsi;
688 adev->host_msix_entries[i].entry = entry->entry;
689 break;
690 }
691 if (i == adev->entries_nr) {
692 r = -ENOSPC;
693 goto msix_entry_out;
694 }
695
696msix_entry_out:
697 mutex_unlock(&kvm->lock);
698
699 return r;
700}
701#endif
702
703long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
704 unsigned long arg)
705{
706 void __user *argp = (void __user *)arg;
707 int r = -ENOTTY;
708
709 switch (ioctl) {
710 case KVM_ASSIGN_PCI_DEVICE: {
711 struct kvm_assigned_pci_dev assigned_dev;
712
713 r = -EFAULT;
714 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
715 goto out;
716 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
717 if (r)
718 goto out;
719 break;
720 }
721 case KVM_ASSIGN_IRQ: {
722 r = -EOPNOTSUPP;
723 break;
724 }
725#ifdef KVM_CAP_ASSIGN_DEV_IRQ
726 case KVM_ASSIGN_DEV_IRQ: {
727 struct kvm_assigned_irq assigned_irq;
728
729 r = -EFAULT;
730 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
731 goto out;
732 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
733 if (r)
734 goto out;
735 break;
736 }
737 case KVM_DEASSIGN_DEV_IRQ: {
738 struct kvm_assigned_irq assigned_irq;
739
740 r = -EFAULT;
741 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
742 goto out;
743 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
744 if (r)
745 goto out;
746 break;
747 }
748#endif
749#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
750 case KVM_DEASSIGN_PCI_DEVICE: {
751 struct kvm_assigned_pci_dev assigned_dev;
752
753 r = -EFAULT;
754 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
755 goto out;
756 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
757 if (r)
758 goto out;
759 break;
760 }
761#endif
762#ifdef KVM_CAP_IRQ_ROUTING
763 case KVM_SET_GSI_ROUTING: {
764 struct kvm_irq_routing routing;
765 struct kvm_irq_routing __user *urouting;
766 struct kvm_irq_routing_entry *entries;
767
768 r = -EFAULT;
769 if (copy_from_user(&routing, argp, sizeof(routing)))
770 goto out;
771 r = -EINVAL;
772 if (routing.nr >= KVM_MAX_IRQ_ROUTES)
773 goto out;
774 if (routing.flags)
775 goto out;
776 r = -ENOMEM;
777 entries = vmalloc(routing.nr * sizeof(*entries));
778 if (!entries)
779 goto out;
780 r = -EFAULT;
781 urouting = argp;
782 if (copy_from_user(entries, urouting->entries,
783 routing.nr * sizeof(*entries)))
784 goto out_free_irq_routing;
785 r = kvm_set_irq_routing(kvm, entries, routing.nr,
786 routing.flags);
787 out_free_irq_routing:
788 vfree(entries);
789 break;
790 }
791#endif /* KVM_CAP_IRQ_ROUTING */
792#ifdef __KVM_HAVE_MSIX
793 case KVM_ASSIGN_SET_MSIX_NR: {
794 struct kvm_assigned_msix_nr entry_nr;
795 r = -EFAULT;
796 if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
797 goto out;
798 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
799 if (r)
800 goto out;
801 break;
802 }
803 case KVM_ASSIGN_SET_MSIX_ENTRY: {
804 struct kvm_assigned_msix_entry entry;
805 r = -EFAULT;
806 if (copy_from_user(&entry, argp, sizeof entry))
807 goto out;
808 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
809 if (r)
810 goto out;
811 break;
812 }
813#endif
814 }
815out:
816 return r;
817}
818
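From userspace, the handlers above are reached through ioctls on the VM file descriptor. A hedged sketch of the expected calling sequence follows; the struct field and flag names are taken from this file, while the device identifiers and the assign_example_device() wrapper are purely illustrative.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd: fd from KVM_CREATE_VM; bus/devfn identify the host PCI device. */
static int assign_example_device(int vm_fd, __u32 id, __u32 bus, __u32 devfn,
				 __u32 guest_irq)
{
	struct kvm_assigned_pci_dev dev;
	struct kvm_assigned_irq irq;

	memset(&dev, 0, sizeof(dev));
	dev.assigned_dev_id = id;		/* userspace-chosen handle */
	dev.busnr = bus;
	dev.devfn = devfn;
	dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
	if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
		return -1;

	memset(&irq, 0, sizeof(irq));
	irq.assigned_dev_id = id;
	irq.guest_irq = guest_irq;
	irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;
	if (ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq) < 0)
		return -1;

	return 0;
}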
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index bb4ebd89b9ff..30f70fd511c4 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -61,10 +61,8 @@ irqfd_inject(struct work_struct *work)
61 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 61 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
62 struct kvm *kvm = irqfd->kvm; 62 struct kvm *kvm = irqfd->kvm;
63 63
64 mutex_lock(&kvm->irq_lock);
65 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 64 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
66 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); 65 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
67 mutex_unlock(&kvm->irq_lock);
68} 66}
69 67
70/* 68/*
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 9fe140bb38ec..38a2d20b89de 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -182,6 +182,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
182 union kvm_ioapic_redirect_entry entry; 182 union kvm_ioapic_redirect_entry entry;
183 int ret = 1; 183 int ret = 1;
184 184
185 mutex_lock(&ioapic->lock);
185 if (irq >= 0 && irq < IOAPIC_NUM_PINS) { 186 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
186 entry = ioapic->redirtbl[irq]; 187 entry = ioapic->redirtbl[irq];
187 level ^= entry.fields.polarity; 188 level ^= entry.fields.polarity;
@@ -198,34 +199,51 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
198 } 199 }
199 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); 200 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
200 } 201 }
202 mutex_unlock(&ioapic->lock);
203
201 return ret; 204 return ret;
202} 205}
203 206
204static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, 207static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
205 int trigger_mode) 208 int trigger_mode)
206{ 209{
207 union kvm_ioapic_redirect_entry *ent; 210 int i;
211
212 for (i = 0; i < IOAPIC_NUM_PINS; i++) {
213 union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
208 214
209 ent = &ioapic->redirtbl[pin]; 215 if (ent->fields.vector != vector)
216 continue;
210 217
211 kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); 218 /*
219 * We are dropping the lock while calling the ack notifiers because
220 * ack notifier callbacks for assigned devices call back into the
221 * IOAPIC recursively. Since remote_irr is cleared only after the
222 * notifiers have run, if the same vector is delivered while the
223 * lock is dropped it will be put into irr and delivered after the
224 * ack notifier returns.
225 */
226 mutex_unlock(&ioapic->lock);
227 kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
228 mutex_lock(&ioapic->lock);
229
230 if (trigger_mode != IOAPIC_LEVEL_TRIG)
231 continue;
212 232
213 if (trigger_mode == IOAPIC_LEVEL_TRIG) {
214 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); 233 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
215 ent->fields.remote_irr = 0; 234 ent->fields.remote_irr = 0;
216 if (!ent->fields.mask && (ioapic->irr & (1 << pin))) 235 if (!ent->fields.mask && (ioapic->irr & (1 << i)))
217 ioapic_service(ioapic, pin); 236 ioapic_service(ioapic, i);
218 } 237 }
219} 238}
220 239
221void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) 240void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
222{ 241{
223 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 242 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
224 int i;
225 243
226 for (i = 0; i < IOAPIC_NUM_PINS; i++) 244 mutex_lock(&ioapic->lock);
227 if (ioapic->redirtbl[i].fields.vector == vector) 245 __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
228 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); 246 mutex_unlock(&ioapic->lock);
229} 247}
230 248
231static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) 249static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
@@ -250,8 +268,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
250 ioapic_debug("addr %lx\n", (unsigned long)addr); 268 ioapic_debug("addr %lx\n", (unsigned long)addr);
251 ASSERT(!(addr & 0xf)); /* check alignment */ 269 ASSERT(!(addr & 0xf)); /* check alignment */
252 270
253 mutex_lock(&ioapic->kvm->irq_lock);
254 addr &= 0xff; 271 addr &= 0xff;
272 mutex_lock(&ioapic->lock);
255 switch (addr) { 273 switch (addr) {
256 case IOAPIC_REG_SELECT: 274 case IOAPIC_REG_SELECT:
257 result = ioapic->ioregsel; 275 result = ioapic->ioregsel;
@@ -265,6 +283,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
265 result = 0; 283 result = 0;
266 break; 284 break;
267 } 285 }
286 mutex_unlock(&ioapic->lock);
287
268 switch (len) { 288 switch (len) {
269 case 8: 289 case 8:
270 *(u64 *) val = result; 290 *(u64 *) val = result;
@@ -277,7 +297,6 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
277 default: 297 default:
278 printk(KERN_WARNING "ioapic: wrong length %d\n", len); 298 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
279 } 299 }
280 mutex_unlock(&ioapic->kvm->irq_lock);
281 return 0; 300 return 0;
282} 301}
283 302
@@ -293,15 +312,15 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
293 (void*)addr, len, val); 312 (void*)addr, len, val);
294 ASSERT(!(addr & 0xf)); /* check alignment */ 313 ASSERT(!(addr & 0xf)); /* check alignment */
295 314
296 mutex_lock(&ioapic->kvm->irq_lock);
297 if (len == 4 || len == 8) 315 if (len == 4 || len == 8)
298 data = *(u32 *) val; 316 data = *(u32 *) val;
299 else { 317 else {
300 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); 318 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
301 goto unlock; 319 return 0;
302 } 320 }
303 321
304 addr &= 0xff; 322 addr &= 0xff;
323 mutex_lock(&ioapic->lock);
305 switch (addr) { 324 switch (addr) {
306 case IOAPIC_REG_SELECT: 325 case IOAPIC_REG_SELECT:
307 ioapic->ioregsel = data; 326 ioapic->ioregsel = data;
@@ -312,15 +331,14 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
312 break; 331 break;
313#ifdef CONFIG_IA64 332#ifdef CONFIG_IA64
314 case IOAPIC_REG_EOI: 333 case IOAPIC_REG_EOI:
315 kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG); 334 __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
316 break; 335 break;
317#endif 336#endif
318 337
319 default: 338 default:
320 break; 339 break;
321 } 340 }
322unlock: 341 mutex_unlock(&ioapic->lock);
323 mutex_unlock(&ioapic->kvm->irq_lock);
324 return 0; 342 return 0;
325} 343}
326 344
@@ -349,6 +367,7 @@ int kvm_ioapic_init(struct kvm *kvm)
349 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 367 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
350 if (!ioapic) 368 if (!ioapic)
351 return -ENOMEM; 369 return -ENOMEM;
370 mutex_init(&ioapic->lock);
352 kvm->arch.vioapic = ioapic; 371 kvm->arch.vioapic = ioapic;
353 kvm_ioapic_reset(ioapic); 372 kvm_ioapic_reset(ioapic);
354 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); 373 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -360,3 +379,26 @@ int kvm_ioapic_init(struct kvm *kvm)
360 return ret; 379 return ret;
361} 380}
362 381
382int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
383{
384 struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
385 if (!ioapic)
386 return -EINVAL;
387
388 mutex_lock(&ioapic->lock);
389 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
390 mutex_unlock(&ioapic->lock);
391 return 0;
392}
393
394int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
395{
396 struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
397 if (!ioapic)
398 return -EINVAL;
399
400 mutex_lock(&ioapic->lock);
401 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
402 mutex_unlock(&ioapic->lock);
403 return 0;
404}
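The new kvm_get_ioapic()/kvm_set_ioapic() helpers let arch code copy IOAPIC state under the IOAPIC's own mutex instead of dereferencing kvm->arch.vioapic directly. A hedged sketch of how a KVM_GET_IRQCHIP handler might use the getter; example_vm_ioctl_get_irqchip() is illustrative, and the real x86 handler also covers the PIC chips.

static int example_vm_ioctl_get_irqchip(struct kvm *kvm,
					struct kvm_irqchip *chip)
{
	int r = 0;

	switch (chip->chip_id) {
	case KVM_IRQCHIP_IOAPIC:
		/* Copies the whole kvm_ioapic_state under ioapic->lock. */
		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
		break;
	/* KVM_IRQCHIP_PIC_MASTER / _SLAVE cases elided. */
	default:
		r = -EINVAL;
		break;
	}
	return r;
}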
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7080b713c160..419c43b667ab 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -41,9 +41,11 @@ struct kvm_ioapic {
41 u32 irr; 41 u32 irr;
42 u32 pad; 42 u32 pad;
43 union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; 43 union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
44 unsigned long irq_states[IOAPIC_NUM_PINS];
44 struct kvm_io_device dev; 45 struct kvm_io_device dev;
45 struct kvm *kvm; 46 struct kvm *kvm;
46 void (*ack_notifier)(void *opaque, int irq); 47 void (*ack_notifier)(void *opaque, int irq);
48 struct mutex lock;
47}; 49};
48 50
49#ifdef DEBUG 51#ifdef DEBUG
@@ -73,4 +75,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
73void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 75void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
74int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 76int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
75 struct kvm_lapic_irq *irq); 77 struct kvm_lapic_irq *irq);
78int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
79int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
80
76#endif 81#endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 001663ff401a..9b077342ab54 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -31,20 +31,39 @@
31 31
32#include "ioapic.h" 32#include "ioapic.h"
33 33
34static inline int kvm_irq_line_state(unsigned long *irq_state,
35 int irq_source_id, int level)
36{
37 /* Logical OR for level-triggered interrupts */
38 if (level)
39 set_bit(irq_source_id, irq_state);
40 else
41 clear_bit(irq_source_id, irq_state);
42
43 return !!(*irq_state);
44}
45
34static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 46static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
35 struct kvm *kvm, int level) 47 struct kvm *kvm, int irq_source_id, int level)
36{ 48{
37#ifdef CONFIG_X86 49#ifdef CONFIG_X86
38 return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); 50 struct kvm_pic *pic = pic_irqchip(kvm);
51 level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
52 irq_source_id, level);
53 return kvm_pic_set_irq(pic, e->irqchip.pin, level);
39#else 54#else
40 return -1; 55 return -1;
41#endif 56#endif
42} 57}
43 58
44static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, 59static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
45 struct kvm *kvm, int level) 60 struct kvm *kvm, int irq_source_id, int level)
46{ 61{
47 return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); 62 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
63 level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
64 irq_source_id, level);
65
66 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
48} 67}
49 68
50inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) 69inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -63,8 +82,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
63 int i, r = -1; 82 int i, r = -1;
64 struct kvm_vcpu *vcpu, *lowest = NULL; 83 struct kvm_vcpu *vcpu, *lowest = NULL;
65 84
66 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
67
68 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 85 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
69 kvm_is_dm_lowest_prio(irq)) 86 kvm_is_dm_lowest_prio(irq))
70 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 87 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
@@ -96,10 +113,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
96} 113}
97 114
98static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 115static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
99 struct kvm *kvm, int level) 116 struct kvm *kvm, int irq_source_id, int level)
100{ 117{
101 struct kvm_lapic_irq irq; 118 struct kvm_lapic_irq irq;
102 119
120 if (!level)
121 return -1;
122
103 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 123 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
104 124
105 irq.dest_id = (e->msi.address_lo & 125 irq.dest_id = (e->msi.address_lo &
@@ -116,78 +136,67 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
116 return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 136 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
117} 137}
118 138
119/* This should be called with the kvm->irq_lock mutex held 139/*
120 * Return value: 140 * Return value:
121 * < 0 Interrupt was ignored (masked or not delivered for other reasons) 141 * < 0 Interrupt was ignored (masked or not delivered for other reasons)
122 * = 0 Interrupt was coalesced (previous irq is still pending) 142 * = 0 Interrupt was coalesced (previous irq is still pending)
123 * > 0 Number of CPUs interrupt was delivered to 143 * > 0 Number of CPUs interrupt was delivered to
124 */ 144 */
125int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) 145int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
126{ 146{
127 struct kvm_kernel_irq_routing_entry *e; 147 struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
128 unsigned long *irq_state, sig_level; 148 int ret = -1, i = 0;
129 int ret = -1; 149 struct kvm_irq_routing_table *irq_rt;
150 struct hlist_node *n;
130 151
131 trace_kvm_set_irq(irq, level, irq_source_id); 152 trace_kvm_set_irq(irq, level, irq_source_id);
132 153
133 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
134
135 if (irq < KVM_IOAPIC_NUM_PINS) {
136 irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
137
138 /* Logical OR for level trig interrupt */
139 if (level)
140 set_bit(irq_source_id, irq_state);
141 else
142 clear_bit(irq_source_id, irq_state);
143 sig_level = !!(*irq_state);
144 } else if (!level)
145 return ret;
146 else /* Deal with MSI/MSI-X */
147 sig_level = 1;
148
149 /* Not possible to detect if the guest uses the PIC or the 154 /* Not possible to detect if the guest uses the PIC or the
150 * IOAPIC. So set the bit in both. The guest will ignore 155 * IOAPIC. So set the bit in both. The guest will ignore
151 * writes to the unused one. 156 * writes to the unused one.
152 */ 157 */
153 list_for_each_entry(e, &kvm->irq_routing, link) 158 rcu_read_lock();
154 if (e->gsi == irq) { 159 irq_rt = rcu_dereference(kvm->irq_routing);
155 int r = e->set(e, kvm, sig_level); 160 if (irq < irq_rt->nr_rt_entries)
156 if (r < 0) 161 hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
157 continue; 162 irq_set[i++] = *e;
163 rcu_read_unlock();
164
165 while(i--) {
166 int r;
167 r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
168 if (r < 0)
169 continue;
170
171 ret = r + ((ret < 0) ? 0 : ret);
172 }
158 173
159 ret = r + ((ret < 0) ? 0 : ret);
160 }
161 return ret; 174 return ret;
162} 175}
163 176
164void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 177void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
165{ 178{
166 struct kvm_kernel_irq_routing_entry *e;
167 struct kvm_irq_ack_notifier *kian; 179 struct kvm_irq_ack_notifier *kian;
168 struct hlist_node *n; 180 struct hlist_node *n;
169 unsigned gsi = pin; 181 int gsi;
170 182
171 trace_kvm_ack_irq(irqchip, pin); 183 trace_kvm_ack_irq(irqchip, pin);
172 184
173 list_for_each_entry(e, &kvm->irq_routing, link) 185 rcu_read_lock();
174 if (e->type == KVM_IRQ_ROUTING_IRQCHIP && 186 gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
175 e->irqchip.irqchip == irqchip && 187 if (gsi != -1)
176 e->irqchip.pin == pin) { 188 hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
177 gsi = e->gsi; 189 link)
178 break; 190 if (kian->gsi == gsi)
179 } 191 kian->irq_acked(kian);
180 192 rcu_read_unlock();
181 hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
182 if (kian->gsi == gsi)
183 kian->irq_acked(kian);
184} 193}
185 194
186void kvm_register_irq_ack_notifier(struct kvm *kvm, 195void kvm_register_irq_ack_notifier(struct kvm *kvm,
187 struct kvm_irq_ack_notifier *kian) 196 struct kvm_irq_ack_notifier *kian)
188{ 197{
189 mutex_lock(&kvm->irq_lock); 198 mutex_lock(&kvm->irq_lock);
190 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); 199 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
191 mutex_unlock(&kvm->irq_lock); 200 mutex_unlock(&kvm->irq_lock);
192} 201}
193 202
@@ -195,8 +204,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
195 struct kvm_irq_ack_notifier *kian) 204 struct kvm_irq_ack_notifier *kian)
196{ 205{
197 mutex_lock(&kvm->irq_lock); 206 mutex_lock(&kvm->irq_lock);
198 hlist_del_init(&kian->link); 207 hlist_del_init_rcu(&kian->link);
199 mutex_unlock(&kvm->irq_lock); 208 mutex_unlock(&kvm->irq_lock);
209 synchronize_rcu();
200} 210}
201 211
202int kvm_request_irq_source_id(struct kvm *kvm) 212int kvm_request_irq_source_id(struct kvm *kvm)
@@ -205,16 +215,17 @@ int kvm_request_irq_source_id(struct kvm *kvm)
205 int irq_source_id; 215 int irq_source_id;
206 216
207 mutex_lock(&kvm->irq_lock); 217 mutex_lock(&kvm->irq_lock);
208 irq_source_id = find_first_zero_bit(bitmap, 218 irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
209 sizeof(kvm->arch.irq_sources_bitmap));
210 219
211 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 220 if (irq_source_id >= BITS_PER_LONG) {
212 printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); 221 printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
213 return -EFAULT; 222 irq_source_id = -EFAULT;
223 goto unlock;
214 } 224 }
215 225
216 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 226 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
217 set_bit(irq_source_id, bitmap); 227 set_bit(irq_source_id, bitmap);
228unlock:
218 mutex_unlock(&kvm->irq_lock); 229 mutex_unlock(&kvm->irq_lock);
219 230
220 return irq_source_id; 231 return irq_source_id;
@@ -228,13 +239,23 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
228 239
229 mutex_lock(&kvm->irq_lock); 240 mutex_lock(&kvm->irq_lock);
230 if (irq_source_id < 0 || 241 if (irq_source_id < 0 ||
231 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 242 irq_source_id >= BITS_PER_LONG) {
232 printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); 243 printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
233 return; 244 goto unlock;
234 } 245 }
235 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
236 clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
237 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 246 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
247 if (!irqchip_in_kernel(kvm))
248 goto unlock;
249
250 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
251 clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
252 if (i >= 16)
253 continue;
254#ifdef CONFIG_X86
255 clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
256#endif
257 }
258unlock:
238 mutex_unlock(&kvm->irq_lock); 259 mutex_unlock(&kvm->irq_lock);
239} 260}
240 261
@@ -243,7 +264,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
243{ 264{
244 mutex_lock(&kvm->irq_lock); 265 mutex_lock(&kvm->irq_lock);
245 kimn->irq = irq; 266 kimn->irq = irq;
246 hlist_add_head(&kimn->link, &kvm->mask_notifier_list); 267 hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
247 mutex_unlock(&kvm->irq_lock); 268 mutex_unlock(&kvm->irq_lock);
248} 269}
249 270
@@ -251,8 +272,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
251 struct kvm_irq_mask_notifier *kimn) 272 struct kvm_irq_mask_notifier *kimn)
252{ 273{
253 mutex_lock(&kvm->irq_lock); 274 mutex_lock(&kvm->irq_lock);
254 hlist_del(&kimn->link); 275 hlist_del_rcu(&kimn->link);
255 mutex_unlock(&kvm->irq_lock); 276 mutex_unlock(&kvm->irq_lock);
277 synchronize_rcu();
256} 278}
257 279
258void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) 280void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -260,33 +282,37 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
260 struct kvm_irq_mask_notifier *kimn; 282 struct kvm_irq_mask_notifier *kimn;
261 struct hlist_node *n; 283 struct hlist_node *n;
262 284
263 WARN_ON(!mutex_is_locked(&kvm->irq_lock)); 285 rcu_read_lock();
264 286 hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
265 hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
266 if (kimn->irq == irq) 287 if (kimn->irq == irq)
267 kimn->func(kimn, mask); 288 kimn->func(kimn, mask);
268} 289 rcu_read_unlock();
269
270static void __kvm_free_irq_routing(struct list_head *irq_routing)
271{
272 struct kvm_kernel_irq_routing_entry *e, *n;
273
274 list_for_each_entry_safe(e, n, irq_routing, link)
275 kfree(e);
276} 290}
277 291
278void kvm_free_irq_routing(struct kvm *kvm) 292void kvm_free_irq_routing(struct kvm *kvm)
279{ 293{
280 mutex_lock(&kvm->irq_lock); 294 /* Called only during vm destruction. Nobody can use the pointer
281 __kvm_free_irq_routing(&kvm->irq_routing); 295 at this stage */
282 mutex_unlock(&kvm->irq_lock); 296 kfree(kvm->irq_routing);
283} 297}
284 298
285static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, 299static int setup_routing_entry(struct kvm_irq_routing_table *rt,
300 struct kvm_kernel_irq_routing_entry *e,
286 const struct kvm_irq_routing_entry *ue) 301 const struct kvm_irq_routing_entry *ue)
287{ 302{
288 int r = -EINVAL; 303 int r = -EINVAL;
289 int delta; 304 int delta;
305 struct kvm_kernel_irq_routing_entry *ei;
306 struct hlist_node *n;
307
308 /*
309 * Do not allow GSI to be mapped to the same irqchip more than once.
310 * Allow only one to one mapping between GSI and MSI.
311 */
312 hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
313 if (ei->type == KVM_IRQ_ROUTING_MSI ||
314 ue->u.irqchip.irqchip == ei->irqchip.irqchip)
315 return r;
290 316
291 e->gsi = ue->gsi; 317 e->gsi = ue->gsi;
292 e->type = ue->type; 318 e->type = ue->type;
@@ -309,6 +335,9 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
309 } 335 }
310 e->irqchip.irqchip = ue->u.irqchip.irqchip; 336 e->irqchip.irqchip = ue->u.irqchip.irqchip;
311 e->irqchip.pin = ue->u.irqchip.pin + delta; 337 e->irqchip.pin = ue->u.irqchip.pin + delta;
338 if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
339 goto out;
340 rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
312 break; 341 break;
313 case KVM_IRQ_ROUTING_MSI: 342 case KVM_IRQ_ROUTING_MSI:
314 e->set = kvm_set_msi; 343 e->set = kvm_set_msi;
@@ -319,6 +348,8 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
319 default: 348 default:
320 goto out; 349 goto out;
321 } 350 }
351
352 hlist_add_head(&e->link, &rt->map[e->gsi]);
322 r = 0; 353 r = 0;
323out: 354out:
324 return r; 355 return r;
@@ -330,43 +361,53 @@ int kvm_set_irq_routing(struct kvm *kvm,
330 unsigned nr, 361 unsigned nr,
331 unsigned flags) 362 unsigned flags)
332{ 363{
333 struct list_head irq_list = LIST_HEAD_INIT(irq_list); 364 struct kvm_irq_routing_table *new, *old;
334 struct list_head tmp = LIST_HEAD_INIT(tmp); 365 u32 i, j, nr_rt_entries = 0;
335 struct kvm_kernel_irq_routing_entry *e = NULL;
336 unsigned i;
337 int r; 366 int r;
338 367
339 for (i = 0; i < nr; ++i) { 368 for (i = 0; i < nr; ++i) {
369 if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
370 return -EINVAL;
371 nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
372 }
373
374 nr_rt_entries += 1;
375
376 new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
377 + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
378 GFP_KERNEL);
379
380 if (!new)
381 return -ENOMEM;
382
383 new->rt_entries = (void *)&new->map[nr_rt_entries];
384
385 new->nr_rt_entries = nr_rt_entries;
386 for (i = 0; i < 3; i++)
387 for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
388 new->chip[i][j] = -1;
389
390 for (i = 0; i < nr; ++i) {
340 r = -EINVAL; 391 r = -EINVAL;
341 if (ue->gsi >= KVM_MAX_IRQ_ROUTES)
342 goto out;
343 if (ue->flags) 392 if (ue->flags)
344 goto out; 393 goto out;
345 r = -ENOMEM; 394 r = setup_routing_entry(new, &new->rt_entries[i], ue);
346 e = kzalloc(sizeof(*e), GFP_KERNEL);
347 if (!e)
348 goto out;
349 r = setup_routing_entry(e, ue);
350 if (r) 395 if (r)
351 goto out; 396 goto out;
352 ++ue; 397 ++ue;
353 list_add(&e->link, &irq_list);
354 e = NULL;
355 } 398 }
356 399
357 mutex_lock(&kvm->irq_lock); 400 mutex_lock(&kvm->irq_lock);
358 list_splice(&kvm->irq_routing, &tmp); 401 old = kvm->irq_routing;
359 INIT_LIST_HEAD(&kvm->irq_routing); 402 rcu_assign_pointer(kvm->irq_routing, new);
360 list_splice(&irq_list, &kvm->irq_routing);
361 INIT_LIST_HEAD(&irq_list);
362 list_splice(&tmp, &irq_list);
363 mutex_unlock(&kvm->irq_lock); 403 mutex_unlock(&kvm->irq_lock);
404 synchronize_rcu();
364 405
406 new = old;
365 r = 0; 407 r = 0;
366 408
367out: 409out:
368 kfree(e); 410 kfree(new);
369 __kvm_free_irq_routing(&irq_list);
370 return r; 411 return r;
371} 412}
372 413
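The routing table is now published via RCU: readers snapshot kvm->irq_routing under rcu_read_lock(), while writers build a complete replacement table, swap it in with rcu_assign_pointer() under kvm->irq_lock, and only free the old table after synchronize_rcu(). A generic sketch of that publish/read pattern is below; everything named example_* is illustrative, and only the RCU primitives are the kernel's.

#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/list.h>

struct example_table {
	unsigned nr;
	struct hlist_head map[0];	/* one list of entries per GSI */
};

static struct example_table *example_tbl;
static DEFINE_MUTEX(example_lock);

/* Reader: lockless; sees either the old or the new table, never a mix. */
static bool example_gsi_is_mapped(unsigned gsi)
{
	struct example_table *t;
	bool mapped = false;

	rcu_read_lock();
	t = rcu_dereference(example_tbl);
	if (t && gsi < t->nr)
		mapped = !hlist_empty(&t->map[gsi]);
	rcu_read_unlock();

	return mapped;
}

/* Writer: publish a fully built replacement, then reclaim the old one. */
static void example_publish(struct example_table *new)
{
	struct example_table *old;

	mutex_lock(&example_lock);
	old = example_tbl;
	rcu_assign_pointer(example_tbl, new);
	mutex_unlock(&example_lock);

	synchronize_rcu();	/* wait until no reader can still see 'old' */
	kfree(old);
}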
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7495ce347344..f92ba138007a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -43,6 +43,7 @@
43#include <linux/swap.h> 43#include <linux/swap.h>
44#include <linux/bitops.h> 44#include <linux/bitops.h>
45#include <linux/spinlock.h> 45#include <linux/spinlock.h>
46#include <linux/compat.h>
46 47
47#include <asm/processor.h> 48#include <asm/processor.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -53,12 +54,6 @@
53#include "coalesced_mmio.h" 54#include "coalesced_mmio.h"
54#endif 55#endif
55 56
56#ifdef KVM_CAP_DEVICE_ASSIGNMENT
57#include <linux/pci.h>
58#include <linux/interrupt.h>
59#include "irq.h"
60#endif
61
62#define CREATE_TRACE_POINTS 57#define CREATE_TRACE_POINTS
63#include <trace/events/kvm.h> 58#include <trace/events/kvm.h>
64 59
@@ -75,6 +70,8 @@ DEFINE_SPINLOCK(kvm_lock);
75LIST_HEAD(vm_list); 70LIST_HEAD(vm_list);
76 71
77static cpumask_var_t cpus_hardware_enabled; 72static cpumask_var_t cpus_hardware_enabled;
73static int kvm_usage_count = 0;
74static atomic_t hardware_enable_failed;
78 75
79struct kmem_cache *kvm_vcpu_cache; 76struct kmem_cache *kvm_vcpu_cache;
80EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 77EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -85,615 +82,13 @@ struct dentry *kvm_debugfs_dir;
85 82
86static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 83static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
87 unsigned long arg); 84 unsigned long arg);
85static int hardware_enable_all(void);
86static void hardware_disable_all(void);
88 87
89static bool kvm_rebooting; 88static bool kvm_rebooting;
90 89
91static bool largepages_enabled = true; 90static bool largepages_enabled = true;
92 91
93#ifdef KVM_CAP_DEVICE_ASSIGNMENT
94static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
95 int assigned_dev_id)
96{
97 struct list_head *ptr;
98 struct kvm_assigned_dev_kernel *match;
99
100 list_for_each(ptr, head) {
101 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
102 if (match->assigned_dev_id == assigned_dev_id)
103 return match;
104 }
105 return NULL;
106}
107
108static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
109 *assigned_dev, int irq)
110{
111 int i, index;
112 struct msix_entry *host_msix_entries;
113
114 host_msix_entries = assigned_dev->host_msix_entries;
115
116 index = -1;
117 for (i = 0; i < assigned_dev->entries_nr; i++)
118 if (irq == host_msix_entries[i].vector) {
119 index = i;
120 break;
121 }
122 if (index < 0) {
123 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
124 return 0;
125 }
126
127 return index;
128}
129
130static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
131{
132 struct kvm_assigned_dev_kernel *assigned_dev;
133 struct kvm *kvm;
134 int i;
135
136 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
137 interrupt_work);
138 kvm = assigned_dev->kvm;
139
140 mutex_lock(&kvm->irq_lock);
141 spin_lock_irq(&assigned_dev->assigned_dev_lock);
142 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
143 struct kvm_guest_msix_entry *guest_entries =
144 assigned_dev->guest_msix_entries;
145 for (i = 0; i < assigned_dev->entries_nr; i++) {
146 if (!(guest_entries[i].flags &
147 KVM_ASSIGNED_MSIX_PENDING))
148 continue;
149 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
150 kvm_set_irq(assigned_dev->kvm,
151 assigned_dev->irq_source_id,
152 guest_entries[i].vector, 1);
153 }
154 } else
155 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
156 assigned_dev->guest_irq, 1);
157
158 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
159 mutex_unlock(&assigned_dev->kvm->irq_lock);
160}
161
162static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
163{
164 unsigned long flags;
165 struct kvm_assigned_dev_kernel *assigned_dev =
166 (struct kvm_assigned_dev_kernel *) dev_id;
167
168 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
169 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
170 int index = find_index_from_host_irq(assigned_dev, irq);
171 if (index < 0)
172 goto out;
173 assigned_dev->guest_msix_entries[index].flags |=
174 KVM_ASSIGNED_MSIX_PENDING;
175 }
176
177 schedule_work(&assigned_dev->interrupt_work);
178
179 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
180 disable_irq_nosync(irq);
181 assigned_dev->host_irq_disabled = true;
182 }
183
184out:
185 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
186 return IRQ_HANDLED;
187}
188
189/* Ack the irq line for an assigned device */
190static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
191{
192 struct kvm_assigned_dev_kernel *dev;
193 unsigned long flags;
194
195 if (kian->gsi == -1)
196 return;
197
198 dev = container_of(kian, struct kvm_assigned_dev_kernel,
199 ack_notifier);
200
201 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
202
203 /* The guest irq may be shared so this ack may be
204 * from another device.
205 */
206 spin_lock_irqsave(&dev->assigned_dev_lock, flags);
207 if (dev->host_irq_disabled) {
208 enable_irq(dev->host_irq);
209 dev->host_irq_disabled = false;
210 }
211 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
212}
213
214static void deassign_guest_irq(struct kvm *kvm,
215 struct kvm_assigned_dev_kernel *assigned_dev)
216{
217 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
218 assigned_dev->ack_notifier.gsi = -1;
219
220 if (assigned_dev->irq_source_id != -1)
221 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
222 assigned_dev->irq_source_id = -1;
223 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
224}
225
226/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
227static void deassign_host_irq(struct kvm *kvm,
228 struct kvm_assigned_dev_kernel *assigned_dev)
229{
230 /*
231 * In kvm_free_device_irq, cancel_work_sync return true if:
232 * 1. work is scheduled, and then cancelled.
233 * 2. work callback is executed.
234 *
235 * The first one ensured that the irq is disabled and no more events
236 * would happen. But for the second one, the irq may be enabled (e.g.
237 * for MSI). So we disable irq here to prevent further events.
238 *
239 * Notice this maybe result in nested disable if the interrupt type is
240 * INTx, but it's OK for we are going to free it.
241 *
242 * If this function is a part of VM destroy, please ensure that till
243 * now, the kvm state is still legal for probably we also have to wait
244 * interrupt_work done.
245 */
246 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
247 int i;
248 for (i = 0; i < assigned_dev->entries_nr; i++)
249 disable_irq_nosync(assigned_dev->
250 host_msix_entries[i].vector);
251
252 cancel_work_sync(&assigned_dev->interrupt_work);
253
254 for (i = 0; i < assigned_dev->entries_nr; i++)
255 free_irq(assigned_dev->host_msix_entries[i].vector,
256 (void *)assigned_dev);
257
258 assigned_dev->entries_nr = 0;
259 kfree(assigned_dev->host_msix_entries);
260 kfree(assigned_dev->guest_msix_entries);
261 pci_disable_msix(assigned_dev->dev);
262 } else {
263 /* Deal with MSI and INTx */
264 disable_irq_nosync(assigned_dev->host_irq);
265 cancel_work_sync(&assigned_dev->interrupt_work);
266
267 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
268
269 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
270 pci_disable_msi(assigned_dev->dev);
271 }
272
273 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
274}
275
276static int kvm_deassign_irq(struct kvm *kvm,
277 struct kvm_assigned_dev_kernel *assigned_dev,
278 unsigned long irq_requested_type)
279{
280 unsigned long guest_irq_type, host_irq_type;
281
282 if (!irqchip_in_kernel(kvm))
283 return -EINVAL;
284 /* no irq assignment to deassign */
285 if (!assigned_dev->irq_requested_type)
286 return -ENXIO;
287
288 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
289 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
290
291 if (host_irq_type)
292 deassign_host_irq(kvm, assigned_dev);
293 if (guest_irq_type)
294 deassign_guest_irq(kvm, assigned_dev);
295
296 return 0;
297}
298
299static void kvm_free_assigned_irq(struct kvm *kvm,
300 struct kvm_assigned_dev_kernel *assigned_dev)
301{
302 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
303}
304
305static void kvm_free_assigned_device(struct kvm *kvm,
306 struct kvm_assigned_dev_kernel
307 *assigned_dev)
308{
309 kvm_free_assigned_irq(kvm, assigned_dev);
310
311 pci_reset_function(assigned_dev->dev);
312
313 pci_release_regions(assigned_dev->dev);
314 pci_disable_device(assigned_dev->dev);
315 pci_dev_put(assigned_dev->dev);
316
317 list_del(&assigned_dev->list);
318 kfree(assigned_dev);
319}
320
321void kvm_free_all_assigned_devices(struct kvm *kvm)
322{
323 struct list_head *ptr, *ptr2;
324 struct kvm_assigned_dev_kernel *assigned_dev;
325
326 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
327 assigned_dev = list_entry(ptr,
328 struct kvm_assigned_dev_kernel,
329 list);
330
331 kvm_free_assigned_device(kvm, assigned_dev);
332 }
333}
334
335static int assigned_device_enable_host_intx(struct kvm *kvm,
336 struct kvm_assigned_dev_kernel *dev)
337{
338 dev->host_irq = dev->dev->irq;
339 /* Even though this is PCI, we don't want to use shared
340 * interrupts. Sharing host devices with guest-assigned devices
341 * on the same interrupt line is not a happy situation: there
342 * are going to be long delays in accepting, acking, etc.
343 */
344 if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
345 0, "kvm_assigned_intx_device", (void *)dev))
346 return -EIO;
347 return 0;
348}
349
350#ifdef __KVM_HAVE_MSI
351static int assigned_device_enable_host_msi(struct kvm *kvm,
352 struct kvm_assigned_dev_kernel *dev)
353{
354 int r;
355
356 if (!dev->dev->msi_enabled) {
357 r = pci_enable_msi(dev->dev);
358 if (r)
359 return r;
360 }
361
362 dev->host_irq = dev->dev->irq;
363 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
364 "kvm_assigned_msi_device", (void *)dev)) {
365 pci_disable_msi(dev->dev);
366 return -EIO;
367 }
368
369 return 0;
370}
371#endif
372
373#ifdef __KVM_HAVE_MSIX
374static int assigned_device_enable_host_msix(struct kvm *kvm,
375 struct kvm_assigned_dev_kernel *dev)
376{
377 int i, r = -EINVAL;
378
379 /* host_msix_entries and guest_msix_entries should have been
380 * initialized */
381 if (dev->entries_nr == 0)
382 return r;
383
384 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
385 if (r)
386 return r;
387
388 for (i = 0; i < dev->entries_nr; i++) {
389 r = request_irq(dev->host_msix_entries[i].vector,
390 kvm_assigned_dev_intr, 0,
391 "kvm_assigned_msix_device",
392 (void *)dev);
393			/* FIXME: free the already requested irqs on failure */
394 if (r)
395 return r;
396 }
397
398 return 0;
399}
400
401#endif
402
403static int assigned_device_enable_guest_intx(struct kvm *kvm,
404 struct kvm_assigned_dev_kernel *dev,
405 struct kvm_assigned_irq *irq)
406{
407 dev->guest_irq = irq->guest_irq;
408 dev->ack_notifier.gsi = irq->guest_irq;
409 return 0;
410}
411
412#ifdef __KVM_HAVE_MSI
413static int assigned_device_enable_guest_msi(struct kvm *kvm,
414 struct kvm_assigned_dev_kernel *dev,
415 struct kvm_assigned_irq *irq)
416{
417 dev->guest_irq = irq->guest_irq;
418 dev->ack_notifier.gsi = -1;
419 dev->host_irq_disabled = false;
420 return 0;
421}
422#endif
423#ifdef __KVM_HAVE_MSIX
424static int assigned_device_enable_guest_msix(struct kvm *kvm,
425 struct kvm_assigned_dev_kernel *dev,
426 struct kvm_assigned_irq *irq)
427{
428 dev->guest_irq = irq->guest_irq;
429 dev->ack_notifier.gsi = -1;
430 dev->host_irq_disabled = false;
431 return 0;
432}
433#endif
434
435static int assign_host_irq(struct kvm *kvm,
436 struct kvm_assigned_dev_kernel *dev,
437 __u32 host_irq_type)
438{
439 int r = -EEXIST;
440
441 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
442 return r;
443
444 switch (host_irq_type) {
445 case KVM_DEV_IRQ_HOST_INTX:
446 r = assigned_device_enable_host_intx(kvm, dev);
447 break;
448#ifdef __KVM_HAVE_MSI
449 case KVM_DEV_IRQ_HOST_MSI:
450 r = assigned_device_enable_host_msi(kvm, dev);
451 break;
452#endif
453#ifdef __KVM_HAVE_MSIX
454 case KVM_DEV_IRQ_HOST_MSIX:
455 r = assigned_device_enable_host_msix(kvm, dev);
456 break;
457#endif
458 default:
459 r = -EINVAL;
460 }
461
462 if (!r)
463 dev->irq_requested_type |= host_irq_type;
464
465 return r;
466}
467
468static int assign_guest_irq(struct kvm *kvm,
469 struct kvm_assigned_dev_kernel *dev,
470 struct kvm_assigned_irq *irq,
471 unsigned long guest_irq_type)
472{
473 int id;
474 int r = -EEXIST;
475
476 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
477 return r;
478
479 id = kvm_request_irq_source_id(kvm);
480 if (id < 0)
481 return id;
482
483 dev->irq_source_id = id;
484
485 switch (guest_irq_type) {
486 case KVM_DEV_IRQ_GUEST_INTX:
487 r = assigned_device_enable_guest_intx(kvm, dev, irq);
488 break;
489#ifdef __KVM_HAVE_MSI
490 case KVM_DEV_IRQ_GUEST_MSI:
491 r = assigned_device_enable_guest_msi(kvm, dev, irq);
492 break;
493#endif
494#ifdef __KVM_HAVE_MSIX
495 case KVM_DEV_IRQ_GUEST_MSIX:
496 r = assigned_device_enable_guest_msix(kvm, dev, irq);
497 break;
498#endif
499 default:
500 r = -EINVAL;
501 }
502
503 if (!r) {
504 dev->irq_requested_type |= guest_irq_type;
505 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
506 } else
507 kvm_free_irq_source_id(kvm, dev->irq_source_id);
508
509 return r;
510}
511
512/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
513static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
514 struct kvm_assigned_irq *assigned_irq)
515{
516 int r = -EINVAL;
517 struct kvm_assigned_dev_kernel *match;
518 unsigned long host_irq_type, guest_irq_type;
519
520 if (!capable(CAP_SYS_RAWIO))
521 return -EPERM;
522
523 if (!irqchip_in_kernel(kvm))
524 return r;
525
526 mutex_lock(&kvm->lock);
527 r = -ENODEV;
528 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
529 assigned_irq->assigned_dev_id);
530 if (!match)
531 goto out;
532
533 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
534 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
535
536 r = -EINVAL;
537 /* can only assign one type at a time */
538 if (hweight_long(host_irq_type) > 1)
539 goto out;
540 if (hweight_long(guest_irq_type) > 1)
541 goto out;
542 if (host_irq_type == 0 && guest_irq_type == 0)
543 goto out;
544
545 r = 0;
546 if (host_irq_type)
547 r = assign_host_irq(kvm, match, host_irq_type);
548 if (r)
549 goto out;
550
551 if (guest_irq_type)
552 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
553out:
554 mutex_unlock(&kvm->lock);
555 return r;
556}
557
558static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
559 struct kvm_assigned_irq
560 *assigned_irq)
561{
562 int r = -ENODEV;
563 struct kvm_assigned_dev_kernel *match;
564
565 mutex_lock(&kvm->lock);
566
567 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
568 assigned_irq->assigned_dev_id);
569 if (!match)
570 goto out;
571
572 r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
573out:
574 mutex_unlock(&kvm->lock);
575 return r;
576}
577
578static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
579 struct kvm_assigned_pci_dev *assigned_dev)
580{
581 int r = 0;
582 struct kvm_assigned_dev_kernel *match;
583 struct pci_dev *dev;
584
585 down_read(&kvm->slots_lock);
586 mutex_lock(&kvm->lock);
587
588 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
589 assigned_dev->assigned_dev_id);
590 if (match) {
591 /* device already assigned */
592 r = -EEXIST;
593 goto out;
594 }
595
596 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
597 if (match == NULL) {
598 printk(KERN_INFO "%s: Couldn't allocate memory\n",
599 __func__);
600 r = -ENOMEM;
601 goto out;
602 }
603 dev = pci_get_bus_and_slot(assigned_dev->busnr,
604 assigned_dev->devfn);
605 if (!dev) {
606 printk(KERN_INFO "%s: host device not found\n", __func__);
607 r = -EINVAL;
608 goto out_free;
609 }
610 if (pci_enable_device(dev)) {
611 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
612 r = -EBUSY;
613 goto out_put;
614 }
615 r = pci_request_regions(dev, "kvm_assigned_device");
616 if (r) {
617 printk(KERN_INFO "%s: Could not get access to device regions\n",
618 __func__);
619 goto out_disable;
620 }
621
622 pci_reset_function(dev);
623
624 match->assigned_dev_id = assigned_dev->assigned_dev_id;
625 match->host_busnr = assigned_dev->busnr;
626 match->host_devfn = assigned_dev->devfn;
627 match->flags = assigned_dev->flags;
628 match->dev = dev;
629 spin_lock_init(&match->assigned_dev_lock);
630 match->irq_source_id = -1;
631 match->kvm = kvm;
632 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
633 INIT_WORK(&match->interrupt_work,
634 kvm_assigned_dev_interrupt_work_handler);
635
636 list_add(&match->list, &kvm->arch.assigned_dev_head);
637
638 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
639 if (!kvm->arch.iommu_domain) {
640 r = kvm_iommu_map_guest(kvm);
641 if (r)
642 goto out_list_del;
643 }
644 r = kvm_assign_device(kvm, match);
645 if (r)
646 goto out_list_del;
647 }
648
649out:
650 mutex_unlock(&kvm->lock);
651 up_read(&kvm->slots_lock);
652 return r;
653out_list_del:
654 list_del(&match->list);
655 pci_release_regions(dev);
656out_disable:
657 pci_disable_device(dev);
658out_put:
659 pci_dev_put(dev);
660out_free:
661 kfree(match);
662 mutex_unlock(&kvm->lock);
663 up_read(&kvm->slots_lock);
664 return r;
665}
666#endif
667
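For orientation, the assignment path above is driven from userspace through the KVM_ASSIGN_PCI_DEVICE and KVM_ASSIGN_DEV_IRQ vm ioctls. Below is a minimal editor's sketch of that sequence for an INTx device; the file-descriptor handling, the choice of guest GSI and the assigned_dev_id encoding are illustrative assumptions, not something this patch mandates.

/* Editor's sketch (not part of the patch): assign host PCI device 00:19.0
 * to a VM and route its INTx line to guest GSI 10.  Error handling is
 * omitted; the assigned_dev_id encoding is an arbitrary userspace choice. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

int assign_intx_device(int vm_fd)
{
	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = (0x00 << 8) | 0xc8,	/* bus 0, devfn 19.0 */
		.busnr           = 0x00,
		.devfn           = 0xc8,		/* PCI_DEVFN(0x19, 0) */
		.flags           = KVM_DEV_ASSIGN_ENABLE_IOMMU,
	};
	struct kvm_assigned_irq irq = {
		.assigned_dev_id = dev.assigned_dev_id,
		.guest_irq       = 10,
		.flags           = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX,
	};

	if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
		return -1;
	return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}

Teardown goes the other way, through KVM_DEASSIGN_DEV_IRQ and KVM_DEASSIGN_PCI_DEVICE, whose handlers appear in this same file.
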
668#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
669static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
670 struct kvm_assigned_pci_dev *assigned_dev)
671{
672 int r = 0;
673 struct kvm_assigned_dev_kernel *match;
674
675 mutex_lock(&kvm->lock);
676
677 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
678 assigned_dev->assigned_dev_id);
679 if (!match) {
680 printk(KERN_INFO "%s: device hasn't been assigned before, "
681 "so cannot be deassigned\n", __func__);
682 r = -EINVAL;
683 goto out;
684 }
685
686 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
687 kvm_deassign_device(kvm, match);
688
689 kvm_free_assigned_device(kvm, match);
690
691out:
692 mutex_unlock(&kvm->lock);
693 return r;
694}
695#endif
696
697inline int kvm_is_mmio_pfn(pfn_t pfn) 92inline int kvm_is_mmio_pfn(pfn_t pfn)
698{ 93{
699 if (pfn_valid(pfn)) { 94 if (pfn_valid(pfn)) {
@@ -949,6 +344,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
949 344
950static struct kvm *kvm_create_vm(void) 345static struct kvm *kvm_create_vm(void)
951{ 346{
347 int r = 0;
952 struct kvm *kvm = kvm_arch_create_vm(); 348 struct kvm *kvm = kvm_arch_create_vm();
953#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 349#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
954 struct page *page; 350 struct page *page;
@@ -956,16 +352,21 @@ static struct kvm *kvm_create_vm(void)
956 352
957 if (IS_ERR(kvm)) 353 if (IS_ERR(kvm))
958 goto out; 354 goto out;
355
356 r = hardware_enable_all();
357 if (r)
358 goto out_err_nodisable;
359
959#ifdef CONFIG_HAVE_KVM_IRQCHIP 360#ifdef CONFIG_HAVE_KVM_IRQCHIP
960 INIT_LIST_HEAD(&kvm->irq_routing);
961 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 361 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
362 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
962#endif 363#endif
963 364
964#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 365#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
965 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 366 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
966 if (!page) { 367 if (!page) {
967 kfree(kvm); 368 r = -ENOMEM;
968 return ERR_PTR(-ENOMEM); 369 goto out_err;
969 } 370 }
970 kvm->coalesced_mmio_ring = 371 kvm->coalesced_mmio_ring =
971 (struct kvm_coalesced_mmio_ring *)page_address(page); 372 (struct kvm_coalesced_mmio_ring *)page_address(page);
@@ -973,15 +374,13 @@ static struct kvm *kvm_create_vm(void)
973 374
974#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 375#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
975 { 376 {
976 int err;
977 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 377 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
978 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 378 r = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
979 if (err) { 379 if (r) {
980#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 380#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
981 put_page(page); 381 put_page(page);
982#endif 382#endif
983 kfree(kvm); 383 goto out_err;
984 return ERR_PTR(err);
985 } 384 }
986 } 385 }
987#endif 386#endif
@@ -1005,6 +404,12 @@ static struct kvm *kvm_create_vm(void)
1005#endif 404#endif
1006out: 405out:
1007 return kvm; 406 return kvm;
407
408out_err:
409 hardware_disable_all();
410out_err_nodisable:
411 kfree(kvm);
412 return ERR_PTR(r);
1008} 413}
1009 414
1010/* 415/*
@@ -1063,6 +468,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
1063 kvm_arch_flush_shadow(kvm); 468 kvm_arch_flush_shadow(kvm);
1064#endif 469#endif
1065 kvm_arch_destroy_vm(kvm); 470 kvm_arch_destroy_vm(kvm);
471 hardware_disable_all();
1066 mmdrop(mm); 472 mmdrop(mm);
1067} 473}
1068 474
@@ -1689,9 +1095,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1689 if (signal_pending(current)) 1095 if (signal_pending(current))
1690 break; 1096 break;
1691 1097
1692 vcpu_put(vcpu);
1693 schedule(); 1098 schedule();
1694 vcpu_load(vcpu);
1695 } 1099 }
1696 1100
1697 finish_wait(&vcpu->wq, &wait); 1101 finish_wait(&vcpu->wq, &wait);
@@ -1705,6 +1109,21 @@ void kvm_resched(struct kvm_vcpu *vcpu)
1705} 1109}
1706EXPORT_SYMBOL_GPL(kvm_resched); 1110EXPORT_SYMBOL_GPL(kvm_resched);
1707 1111
1112void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
1113{
1114 ktime_t expires;
1115 DEFINE_WAIT(wait);
1116
1117 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1118
 1119	/* Sleep for 100 us and hope the lock holder gets scheduled */
1120 expires = ktime_add_ns(ktime_get(), 100000UL);
1121 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1122
1123 finish_wait(&vcpu->wq, &wait);
1124}
1125EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1126
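kvm_vcpu_on_spin() is exported so that architecture code can park a vcpu that it has detected busy-waiting; the intended caller is an exit handler for a spin/pause intercept, roughly as in the hypothetical sketch below (the handler name and return convention are assumptions, not defined by this patch).

/* Editor's sketch of a hypothetical arch-side caller: the guest took a
 * spin/pause intercept, so yield the physical CPU briefly instead of
 * letting the host spin as well. */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_on_spin(vcpu);	/* interruptible ~100 us sleep, see above */
	return 1;		/* by the convention assumed here, non-zero re-enters the guest */
}
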
1708static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1127static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1709{ 1128{
1710 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1129 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
@@ -1828,88 +1247,6 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1828 return 0; 1247 return 0;
1829} 1248}
1830 1249
1831#ifdef __KVM_HAVE_MSIX
1832static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
1833 struct kvm_assigned_msix_nr *entry_nr)
1834{
1835 int r = 0;
1836 struct kvm_assigned_dev_kernel *adev;
1837
1838 mutex_lock(&kvm->lock);
1839
1840 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
1841 entry_nr->assigned_dev_id);
1842 if (!adev) {
1843 r = -EINVAL;
1844 goto msix_nr_out;
1845 }
1846
1847 if (adev->entries_nr == 0) {
1848 adev->entries_nr = entry_nr->entry_nr;
1849 if (adev->entries_nr == 0 ||
1850 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
1851 r = -EINVAL;
1852 goto msix_nr_out;
1853 }
1854
1855 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
1856 entry_nr->entry_nr,
1857 GFP_KERNEL);
1858 if (!adev->host_msix_entries) {
1859 r = -ENOMEM;
1860 goto msix_nr_out;
1861 }
1862 adev->guest_msix_entries = kzalloc(
1863 sizeof(struct kvm_guest_msix_entry) *
1864 entry_nr->entry_nr, GFP_KERNEL);
1865 if (!adev->guest_msix_entries) {
1866 kfree(adev->host_msix_entries);
1867 r = -ENOMEM;
1868 goto msix_nr_out;
1869 }
 1870	} else /* Setting the MSI-X entry number twice is not allowed */
1871 r = -EINVAL;
1872msix_nr_out:
1873 mutex_unlock(&kvm->lock);
1874 return r;
1875}
1876
1877static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
1878 struct kvm_assigned_msix_entry *entry)
1879{
1880 int r = 0, i;
1881 struct kvm_assigned_dev_kernel *adev;
1882
1883 mutex_lock(&kvm->lock);
1884
1885 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
1886 entry->assigned_dev_id);
1887
1888 if (!adev) {
1889 r = -EINVAL;
1890 goto msix_entry_out;
1891 }
1892
1893 for (i = 0; i < adev->entries_nr; i++)
1894 if (adev->guest_msix_entries[i].vector == 0 ||
1895 adev->guest_msix_entries[i].entry == entry->entry) {
1896 adev->guest_msix_entries[i].entry = entry->entry;
1897 adev->guest_msix_entries[i].vector = entry->gsi;
1898 adev->host_msix_entries[i].entry = entry->entry;
1899 break;
1900 }
1901 if (i == adev->entries_nr) {
1902 r = -ENOSPC;
1903 goto msix_entry_out;
1904 }
1905
1906msix_entry_out:
1907 mutex_unlock(&kvm->lock);
1908
1909 return r;
1910}
1911#endif
1912
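The two MSI-X helpers above are expected to run before the interrupt is actually assigned: userspace first declares the number of vectors with KVM_ASSIGN_SET_MSIX_NR, then describes each entry with KVM_ASSIGN_SET_MSIX_ENTRY, and only then issues KVM_ASSIGN_DEV_IRQ with the MSI-X host/guest flags, at which point assigned_device_enable_host_msix() finds entries_nr already populated. A rough userspace sketch of that ordering, with an assumed helper shape and caller-chosen GSIs, might look like:

/* Editor's sketch (assumed helper, not part of the patch): set up nvec
 * MSI-X vectors for an already assigned device and route entry i to the
 * caller-chosen guest GSI gsis[i].  Error handling is minimal. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

int assign_msix(int vm_fd, __u32 dev_id, int nvec, const __u32 *gsis)
{
	struct kvm_assigned_msix_nr nr = {
		.assigned_dev_id = dev_id,
		.entry_nr        = nvec,
	};
	struct kvm_assigned_irq irq = {
		.assigned_dev_id = dev_id,
		.flags           = KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX,
	};
	int i;

	if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr) < 0)
		return -1;

	for (i = 0; i < nvec; i++) {
		struct kvm_assigned_msix_entry entry = {
			.assigned_dev_id = dev_id,
			.entry           = i,
			.gsi             = gsis[i],
		};
		if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &entry) < 0)
			return -1;
	}

	/* Only now does the host-side MSI-X enable have entries to work with. */
	return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}

Note that kvm_vm_ioctl_set_msix_nr() above rejects a vector count of zero or one that reaches KVM_MAX_MSIX_PER_DEV, and refuses to change the count once it has been set.
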
1913static long kvm_vcpu_ioctl(struct file *filp, 1250static long kvm_vcpu_ioctl(struct file *filp,
1914 unsigned int ioctl, unsigned long arg) 1251 unsigned int ioctl, unsigned long arg)
1915{ 1252{
@@ -2168,112 +1505,6 @@ static long kvm_vm_ioctl(struct file *filp,
2168 break; 1505 break;
2169 } 1506 }
2170#endif 1507#endif
2171#ifdef KVM_CAP_DEVICE_ASSIGNMENT
2172 case KVM_ASSIGN_PCI_DEVICE: {
2173 struct kvm_assigned_pci_dev assigned_dev;
2174
2175 r = -EFAULT;
2176 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
2177 goto out;
2178 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
2179 if (r)
2180 goto out;
2181 break;
2182 }
2183 case KVM_ASSIGN_IRQ: {
2184 r = -EOPNOTSUPP;
2185 break;
2186 }
2187#ifdef KVM_CAP_ASSIGN_DEV_IRQ
2188 case KVM_ASSIGN_DEV_IRQ: {
2189 struct kvm_assigned_irq assigned_irq;
2190
2191 r = -EFAULT;
2192 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
2193 goto out;
2194 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
2195 if (r)
2196 goto out;
2197 break;
2198 }
2199 case KVM_DEASSIGN_DEV_IRQ: {
2200 struct kvm_assigned_irq assigned_irq;
2201
2202 r = -EFAULT;
2203 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
2204 goto out;
2205 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
2206 if (r)
2207 goto out;
2208 break;
2209 }
2210#endif
2211#endif
2212#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
2213 case KVM_DEASSIGN_PCI_DEVICE: {
2214 struct kvm_assigned_pci_dev assigned_dev;
2215
2216 r = -EFAULT;
2217 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
2218 goto out;
2219 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
2220 if (r)
2221 goto out;
2222 break;
2223 }
2224#endif
2225#ifdef KVM_CAP_IRQ_ROUTING
2226 case KVM_SET_GSI_ROUTING: {
2227 struct kvm_irq_routing routing;
2228 struct kvm_irq_routing __user *urouting;
2229 struct kvm_irq_routing_entry *entries;
2230
2231 r = -EFAULT;
2232 if (copy_from_user(&routing, argp, sizeof(routing)))
2233 goto out;
2234 r = -EINVAL;
2235 if (routing.nr >= KVM_MAX_IRQ_ROUTES)
2236 goto out;
2237 if (routing.flags)
2238 goto out;
2239 r = -ENOMEM;
2240 entries = vmalloc(routing.nr * sizeof(*entries));
2241 if (!entries)
2242 goto out;
2243 r = -EFAULT;
2244 urouting = argp;
2245 if (copy_from_user(entries, urouting->entries,
2246 routing.nr * sizeof(*entries)))
2247 goto out_free_irq_routing;
2248 r = kvm_set_irq_routing(kvm, entries, routing.nr,
2249 routing.flags);
2250 out_free_irq_routing:
2251 vfree(entries);
2252 break;
2253 }
2254#endif /* KVM_CAP_IRQ_ROUTING */
2255#ifdef __KVM_HAVE_MSIX
2256 case KVM_ASSIGN_SET_MSIX_NR: {
2257 struct kvm_assigned_msix_nr entry_nr;
2258 r = -EFAULT;
2259 if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
2260 goto out;
2261 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
2262 if (r)
2263 goto out;
2264 break;
2265 }
2266 case KVM_ASSIGN_SET_MSIX_ENTRY: {
2267 struct kvm_assigned_msix_entry entry;
2268 r = -EFAULT;
2269 if (copy_from_user(&entry, argp, sizeof entry))
2270 goto out;
2271 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
2272 if (r)
2273 goto out;
2274 break;
2275 }
2276#endif
2277 case KVM_IRQFD: { 1508 case KVM_IRQFD: {
2278 struct kvm_irqfd data; 1509 struct kvm_irqfd data;
2279 1510
@@ -2305,11 +1536,59 @@ static long kvm_vm_ioctl(struct file *filp,
2305#endif 1536#endif
2306 default: 1537 default:
2307 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1538 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1539 if (r == -ENOTTY)
1540 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2308 } 1541 }
2309out: 1542out:
2310 return r; 1543 return r;
2311} 1544}
2312 1545
1546#ifdef CONFIG_COMPAT
1547struct compat_kvm_dirty_log {
1548 __u32 slot;
1549 __u32 padding1;
1550 union {
1551 compat_uptr_t dirty_bitmap; /* one bit per page */
1552 __u64 padding2;
1553 };
1554};
1555
1556static long kvm_vm_compat_ioctl(struct file *filp,
1557 unsigned int ioctl, unsigned long arg)
1558{
1559 struct kvm *kvm = filp->private_data;
1560 int r;
1561
1562 if (kvm->mm != current->mm)
1563 return -EIO;
1564 switch (ioctl) {
1565 case KVM_GET_DIRTY_LOG: {
1566 struct compat_kvm_dirty_log compat_log;
1567 struct kvm_dirty_log log;
1568
1569 r = -EFAULT;
1570 if (copy_from_user(&compat_log, (void __user *)arg,
1571 sizeof(compat_log)))
1572 goto out;
1573 log.slot = compat_log.slot;
1574 log.padding1 = compat_log.padding1;
1575 log.padding2 = compat_log.padding2;
1576 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
1577
1578 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1579 if (r)
1580 goto out;
1581 break;
1582 }
1583 default:
1584 r = kvm_vm_ioctl(filp, ioctl, arg);
1585 }
1586
1587out:
1588 return r;
1589}
1590#endif
1591
2313static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1592static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2314{ 1593{
2315 struct page *page[1]; 1594 struct page *page[1];
@@ -2344,7 +1623,9 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2344static struct file_operations kvm_vm_fops = { 1623static struct file_operations kvm_vm_fops = {
2345 .release = kvm_vm_release, 1624 .release = kvm_vm_release,
2346 .unlocked_ioctl = kvm_vm_ioctl, 1625 .unlocked_ioctl = kvm_vm_ioctl,
2347 .compat_ioctl = kvm_vm_ioctl, 1626#ifdef CONFIG_COMPAT
1627 .compat_ioctl = kvm_vm_compat_ioctl,
1628#endif
2348 .mmap = kvm_vm_mmap, 1629 .mmap = kvm_vm_mmap,
2349}; 1630};
2350 1631
@@ -2372,6 +1653,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
2372#ifdef CONFIG_KVM_APIC_ARCHITECTURE 1653#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2373 case KVM_CAP_SET_BOOT_CPU_ID: 1654 case KVM_CAP_SET_BOOT_CPU_ID:
2374#endif 1655#endif
1656 case KVM_CAP_INTERNAL_ERROR_DATA:
2375 return 1; 1657 return 1;
2376#ifdef CONFIG_HAVE_KVM_IRQCHIP 1658#ifdef CONFIG_HAVE_KVM_IRQCHIP
2377 case KVM_CAP_IRQ_ROUTING: 1659 case KVM_CAP_IRQ_ROUTING:
@@ -2442,11 +1724,21 @@ static struct miscdevice kvm_dev = {
2442static void hardware_enable(void *junk) 1724static void hardware_enable(void *junk)
2443{ 1725{
2444 int cpu = raw_smp_processor_id(); 1726 int cpu = raw_smp_processor_id();
1727 int r;
2445 1728
2446 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 1729 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2447 return; 1730 return;
1731
2448 cpumask_set_cpu(cpu, cpus_hardware_enabled); 1732 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2449 kvm_arch_hardware_enable(NULL); 1733
1734 r = kvm_arch_hardware_enable(NULL);
1735
1736 if (r) {
1737 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
1738 atomic_inc(&hardware_enable_failed);
1739 printk(KERN_INFO "kvm: enabling virtualization on "
1740 "CPU%d failed\n", cpu);
1741 }
2450} 1742}
2451 1743
2452static void hardware_disable(void *junk) 1744static void hardware_disable(void *junk)
@@ -2459,11 +1751,52 @@ static void hardware_disable(void *junk)
2459 kvm_arch_hardware_disable(NULL); 1751 kvm_arch_hardware_disable(NULL);
2460} 1752}
2461 1753
1754static void hardware_disable_all_nolock(void)
1755{
1756 BUG_ON(!kvm_usage_count);
1757
1758 kvm_usage_count--;
1759 if (!kvm_usage_count)
1760 on_each_cpu(hardware_disable, NULL, 1);
1761}
1762
1763static void hardware_disable_all(void)
1764{
1765 spin_lock(&kvm_lock);
1766 hardware_disable_all_nolock();
1767 spin_unlock(&kvm_lock);
1768}
1769
1770static int hardware_enable_all(void)
1771{
1772 int r = 0;
1773
1774 spin_lock(&kvm_lock);
1775
1776 kvm_usage_count++;
1777 if (kvm_usage_count == 1) {
1778 atomic_set(&hardware_enable_failed, 0);
1779 on_each_cpu(hardware_enable, NULL, 1);
1780
1781 if (atomic_read(&hardware_enable_failed)) {
1782 hardware_disable_all_nolock();
1783 r = -EBUSY;
1784 }
1785 }
1786
1787 spin_unlock(&kvm_lock);
1788
1789 return r;
1790}
1791
2462static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 1792static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2463 void *v) 1793 void *v)
2464{ 1794{
2465 int cpu = (long)v; 1795 int cpu = (long)v;
2466 1796
1797 if (!kvm_usage_count)
1798 return NOTIFY_OK;
1799
2467 val &= ~CPU_TASKS_FROZEN; 1800 val &= ~CPU_TASKS_FROZEN;
2468 switch (val) { 1801 switch (val) {
2469 case CPU_DYING: 1802 case CPU_DYING:
@@ -2666,13 +1999,15 @@ static void kvm_exit_debug(void)
2666 1999
2667static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2000static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2668{ 2001{
2669 hardware_disable(NULL); 2002 if (kvm_usage_count)
2003 hardware_disable(NULL);
2670 return 0; 2004 return 0;
2671} 2005}
2672 2006
2673static int kvm_resume(struct sys_device *dev) 2007static int kvm_resume(struct sys_device *dev)
2674{ 2008{
2675 hardware_enable(NULL); 2009 if (kvm_usage_count)
2010 hardware_enable(NULL);
2676 return 0; 2011 return 0;
2677} 2012}
2678 2013
@@ -2747,7 +2082,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
2747 goto out_free_1; 2082 goto out_free_1;
2748 } 2083 }
2749 2084
2750 on_each_cpu(hardware_enable, NULL, 1);
2751 r = register_cpu_notifier(&kvm_cpu_notifier); 2085 r = register_cpu_notifier(&kvm_cpu_notifier);
2752 if (r) 2086 if (r)
2753 goto out_free_2; 2087 goto out_free_2;
@@ -2797,7 +2131,6 @@ out_free_3:
2797 unregister_reboot_notifier(&kvm_reboot_notifier); 2131 unregister_reboot_notifier(&kvm_reboot_notifier);
2798 unregister_cpu_notifier(&kvm_cpu_notifier); 2132 unregister_cpu_notifier(&kvm_cpu_notifier);
2799out_free_2: 2133out_free_2:
2800 on_each_cpu(hardware_disable, NULL, 1);
2801out_free_1: 2134out_free_1:
2802 kvm_arch_hardware_unsetup(); 2135 kvm_arch_hardware_unsetup();
2803out_free_0a: 2136out_free_0a: