-rw-r--r--Documentation/virtual/kvm/api.txt33
-rw-r--r--Documentation/virtual/kvm/hypercalls.txt66
-rw-r--r--Documentation/virtual/kvm/msr.txt32
-rw-r--r--Documentation/virtual/kvm/ppc-pv.txt22
-rw-r--r--arch/ia64/kvm/kvm-ia64.c41
-rw-r--r--arch/powerpc/include/asm/kvm_host.h3
-rw-r--r--arch/powerpc/kvm/44x_tlb.c1
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c51
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c6
-rw-r--r--arch/powerpc/kvm/book3s_pr.c4
-rw-r--r--arch/powerpc/kvm/e500_tlb.c3
-rw-r--r--arch/powerpc/kvm/powerpc.c14
-rw-r--r--arch/s390/include/asm/processor.h1
-rw-r--r--arch/s390/kernel/dis.c27
-rw-r--r--arch/s390/kvm/Kconfig1
-rw-r--r--arch/s390/kvm/diag.c4
-rw-r--r--arch/s390/kvm/intercept.c11
-rw-r--r--arch/s390/kvm/interrupt.c25
-rw-r--r--arch/s390/kvm/kvm-s390.c17
-rw-r--r--arch/s390/kvm/priv.c9
-rw-r--r--arch/s390/kvm/sigp.c2
-rw-r--r--arch/s390/kvm/trace-s390.h210
-rw-r--r--arch/s390/kvm/trace.h341
-rw-r--r--arch/x86/Kconfig21
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h48
-rw-r--r--arch/x86/include/asm/kvm_host.h36
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/kvm.c3
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c14
-rw-r--r--arch/x86/kvm/emulate.c538
-rw-r--r--arch/x86/kvm/i8254.c64
-rw-r--r--arch/x86/kvm/i8254.h6
-rw-r--r--arch/x86/kvm/i8259.c70
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c484
-rw-r--r--arch/x86/kvm/lapic.h61
-rw-r--r--arch/x86/kvm/mmu.c240
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu_audit.c8
-rw-r--r--arch/x86/kvm/paging_tmpl.h199
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c82
-rw-r--r--arch/x86/kvm/timer.c47
-rw-r--r--arch/x86/kvm/vmx.c233
-rw-r--r--arch/x86/kvm/x86.c384
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--include/linux/kvm.h25
-rw-r--r--include/linux/kvm_host.h145
-rw-r--r--kernel/jump_label.c1
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/async_pf.c11
-rw-r--r--virt/kvm/eventfd.c150
-rw-r--r--virt/kvm/ioapic.c37
-rw-r--r--virt/kvm/iommu.c16
-rw-r--r--virt/kvm/irq_comm.c17
-rw-r--r--virt/kvm/kvm_main.c541
62 files changed, 3006 insertions, 1466 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bf33aaa4c59f..f6ec3a92e621 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
857}; 857};
858 858
859/* for kvm_memory_region::flags */ 859/* for kvm_memory_region::flags */
860#define KVM_MEM_LOG_DIRTY_PAGES 1UL 860#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
861#define KVM_MEM_READONLY (1UL << 1)
861 862
862This ioctl allows the user to create or modify a guest physical memory 863This ioctl allows the user to create or modify a guest physical memory
863slot. When changing an existing slot, it may be moved in the guest 864slot. When changing an existing slot, it may be moved in the guest
@@ -873,14 +874,17 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
873be identical. This allows large pages in the guest to be backed by large 874be identical. This allows large pages in the guest to be backed by large
874pages in the host. 875pages in the host.
875 876
876The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which 877The flags field supports two flags, KVM_MEM_LOG_DIRTY_PAGES, which instructs
877instructs kvm to keep track of writes to memory within the slot. See 878kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG
878the KVM_GET_DIRTY_LOG ioctl. 879ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the
880KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only
881allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO
882exits.
879 883
880When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory 884When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
881region are automatically reflected into the guest. For example, an mmap() 885the memory region are automatically reflected into the guest. For example, an
882that affects the region will be made visible immediately. Another example 886mmap() that affects the region will be made visible immediately. Another
883is madvise(MADV_DROP). 887example is madvise(MADV_DROP).
884 888
885It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. 889It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
886The KVM_SET_MEMORY_REGION does not allow fine grained control over memory 890The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
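For illustration, a minimal userspace sketch of registering such a read-only slot, assuming a linux/kvm.h that already defines KVM_MEM_READONLY (added by this patch); the helper name and the anonymous-mmap backing are hypothetical, only struct kvm_userspace_memory_region and the KVM_SET_USER_MEMORY_REGION ioctl come from the documented API:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/*
 * Illustrative only: back "size" bytes of guest physical memory at "gpa"
 * with anonymous host memory and register it as a read-only slot.  Guest
 * writes to the range then surface as KVM_EXIT_MMIO exits instead of
 * modifying the backing memory.
 */
static int add_readonly_slot(int vmfd, __u32 slot, __u64 gpa, __u64 size)
{
        struct kvm_userspace_memory_region region;
        void *backing;

        backing = mmap(NULL, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (backing == MAP_FAILED)
                return -1;

        memset(&region, 0, sizeof(region));
        region.slot = slot;
        region.flags = KVM_MEM_READONLY;
        region.guest_phys_addr = gpa;
        region.memory_size = size;
        region.userspace_addr = (unsigned long)backing;

        return ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
}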
@@ -1946,6 +1950,19 @@ the guest using the specified gsi pin. The irqfd is removed using
1946the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd 1950the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
1947and kvm_irqfd.gsi. 1951and kvm_irqfd.gsi.
1948 1952
1953With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify
1954mechanism allowing emulation of level-triggered, irqfd-based
1955interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set, the user must pass an
1956additional eventfd in the kvm_irqfd.resamplefd field. When operating
1957in resample mode, posting of an interrupt through kvm_irqfd.fd asserts
1958the specified gsi in the irqchip. When the irqchip is resampled, such
1959 as from an EOI, the gsi is de-asserted and the user is notified via
1960kvm_irqfd.resamplefd. It is the user's responsibility to re-queue
1961the interrupt if the device making use of it still requires service.
1962Note that closing the resamplefd is not sufficient to disable the
1963irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
1964and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
1965
19494.76 KVM_PPC_ALLOCATE_HTAB 19664.76 KVM_PPC_ALLOCATE_HTAB
1950 1967
1951Capability: KVM_CAP_PPC_ALLOC_HTAB 1968Capability: KVM_CAP_PPC_ALLOC_HTAB
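To make the resample flow above concrete, here is a hedged userspace sketch, assuming a linux/kvm.h that defines KVM_IRQFD_FLAG_RESAMPLE and the kvm_irqfd.resamplefd field (both introduced by this series); the helper name and error handling are illustrative only:

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hypothetical helper: wire up a level-triggered, irqfd-based interrupt
 * for "gsi" on an existing VM file descriptor "vmfd".  The two eventfds
 * are returned through *irqfd_out / *resamplefd_out.
 */
static int setup_resample_irqfd(int vmfd, unsigned int gsi,
                                int *irqfd_out, int *resamplefd_out)
{
        struct kvm_irqfd irqfd;
        int irq_fd, resample_fd;

        irq_fd = eventfd(0, EFD_CLOEXEC);       /* written to assert the gsi */
        resample_fd = eventfd(0, EFD_CLOEXEC);  /* signalled by KVM on EOI */
        if (irq_fd < 0 || resample_fd < 0)
                return -1;

        memset(&irqfd, 0, sizeof(irqfd));
        irqfd.fd = irq_fd;
        irqfd.gsi = gsi;
        irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = resample_fd;

        if (ioctl(vmfd, KVM_IRQFD, &irqfd) < 0)
                return -1;

        *irqfd_out = irq_fd;
        *resamplefd_out = resample_fd;
        return 0;
}

With this setup, writing to the irqfd asserts the gsi; after the guest EOIs, KVM signals resamplefd and the VMM re-checks the device before writing the irqfd again, as described in the text above.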
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
new file mode 100644
index 000000000000..ea113b5d87a4
--- /dev/null
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -0,0 +1,66 @@
1Linux KVM Hypercall:
2===================
3X86:
 4 KVM hypercalls use a three-byte instruction sequence, either the vmcall or the vmmcall
5 instruction. The hypervisor can replace it with instructions that are
6 guaranteed to be supported.
7
8 Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
9 The hypercall number should be placed in rax and the return value will be
10 placed in rax. No other registers will be clobbered unless explicitly stated
11 by the particular hypercall.
12
13S390:
 14 R2-R7 are used for parameters 1-6. In addition, R1 is used for the hypercall
 15 number. The return value is written to R2.
 16
 17 S390 uses the diagnose instruction (0x500) as its hypercall, with the hypercall
 18 number passed in R1.
19
20 PowerPC:
 21 It uses R3-R10 for arguments and the hypercall number in R11. R4-R11 are used
 22 as output registers. The return value is placed in R3.
23
 24 KVM hypercalls use a 4-byte opcode that is patched with the 'hypercall-instructions'
25 property inside the device tree's /hypervisor node.
26 For more information refer to Documentation/virtual/kvm/ppc-pv.txt
27
28KVM Hypercalls Documentation
29===========================
30The template for each hypercall is:
311. Hypercall name.
322. Architecture(s)
333. Status (deprecated, obsolete, active)
344. Purpose
35
361. KVM_HC_VAPIC_POLL_IRQ
37------------------------
38Architecture: x86
39Status: active
40Purpose: Trigger guest exit so that the host can check for pending
41interrupts on reentry.
42
432. KVM_HC_MMU_OP
44------------------------
45Architecture: x86
46Status: deprecated.
 47Purpose: Support MMU operations such as writing to a PTE,
 48flushing the TLB, and releasing a PT.
49
503. KVM_HC_FEATURES
51------------------------
52Architecture: PPC
53Status: active
 54Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid is
 55used to enumerate which hypercalls are available. On PPC, either a device-tree
 56based lookup (which is also what ePAPR dictates) or a KVM-specific enumeration
57mechanism (which is this hypercall) can be used.
58
594. KVM_HC_PPC_MAP_MAGIC_PAGE
60------------------------
61Architecture: PPC
62Status: active
 63Purpose: To enable communication between the hypervisor and guest, there is a
 64shared page that contains parts of supervisor-visible register state.
 65The guest can map this shared page to access its supervisor registers through
66memory using this hypercall.
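As a sketch of the x86 calling convention described at the top of this file (hypercall number in rax, arguments in rbx/rcx/rdx/rsi, return value in rax), a one-argument guest-side wrapper could look roughly like this; real guests should use the kvm_hypercall*() helpers from asm/kvm_para.h, which also let the hypervisor patch vmcall/vmmcall as appropriate:

static inline long kvm_hypercall1_sketch(unsigned long nr, unsigned long a0)
{
        long ret;

        /* nr in rax, first argument in rbx, result comes back in rax. */
        asm volatile("vmcall"
                     : "=a" (ret)
                     : "a" (nr), "b" (a0)
                     : "memory");
        return ret;
}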
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 730471048583..6d470ae7b073 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00
34 time information and check that they are both equal and even. 34 time information and check that they are both equal and even.
35 An odd version indicates an in-progress update. 35 An odd version indicates an in-progress update.
36 36
37 sec: number of seconds for wallclock. 37 sec: number of seconds for wallclock at time of boot.
38 38
39 nsec: number of nanoseconds for wallclock. 39 nsec: number of nanoseconds for wallclock at time of boot.
40
41 In order to get the current wallclock time, the system_time from
42 MSR_KVM_SYSTEM_TIME_NEW needs to be added.
40 43
41 Note that although MSRs are per-CPU entities, the effect of this 44 Note that although MSRs are per-CPU entities, the effect of this
42 particular MSR is global. 45 particular MSR is global.
@@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01
82 time at the time this structure was last updated. Unit is 85 time at the time this structure was last updated. Unit is
83 nanoseconds. 86 nanoseconds.
84 87
85 tsc_to_system_mul: a function of the tsc frequency. One has 88 tsc_to_system_mul: multiplier to be used when converting
86 to multiply any tsc-related quantity by this value to get 89 tsc-related quantity to nanoseconds
87 a value in nanoseconds, besides dividing by 2^tsc_shift
88 90
89 tsc_shift: cycle to nanosecond divider, as a power of two, to 91 tsc_shift: shift to be used when converting tsc-related
90 allow for shift rights. One has to shift right any tsc-related 92 quantity to nanoseconds. This shift will ensure that
91 quantity by this value to get a value in nanoseconds, besides 93 multiplication with tsc_to_system_mul does not overflow.
92 multiplying by tsc_to_system_mul. 94 A positive value denotes a left shift, a negative value
95 a right shift.
93 96
94 With this information, guests can derive per-CPU time by 97 The conversion from tsc to nanoseconds involves an additional
95 doing: 98 right shift by 32 bits. With this information, guests can
99 derive per-CPU time by doing:
96 100
97 time = (current_tsc - tsc_timestamp) 101 time = (current_tsc - tsc_timestamp)
98 time = (time * tsc_to_system_mul) >> tsc_shift 102 if (tsc_shift >= 0)
103 time <<= tsc_shift;
104 else
105 time >>= -tsc_shift;
106 time = (time * tsc_to_system_mul) >> 32
99 time = time + system_time 107 time = time + system_time
100 108
101 flags: bits in this field indicate extended capabilities 109 flags: bits in this field indicate extended capabilities
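Written out as C, the conversion documented above might look like the following sketch; the helper name is illustrative, only the field semantics come from the MSR description, and the caller still adds system_time to the result:

#include <stdint.h>

static uint64_t pvclock_delta_to_ns(uint64_t current_tsc,
                                    uint64_t tsc_timestamp,
                                    uint32_t tsc_to_system_mul,
                                    int8_t tsc_shift)
{
        uint64_t delta = current_tsc - tsc_timestamp;

        if (tsc_shift >= 0)
                delta <<= tsc_shift;
        else
                delta >>= -tsc_shift;

        /*
         * 64x32 -> 96 bit multiply, keeping the upper 64 bits, i.e. the
         * documented ">> 32".  unsigned __int128 is a GCC/Clang extension.
         */
        return (uint64_t)(((unsigned __int128)delta * tsc_to_system_mul) >> 32);
}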
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 4911cf95c67e..4cd076febb02 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -174,3 +174,25 @@ following:
174That way we can inject an arbitrary amount of code as replacement for a single 174That way we can inject an arbitrary amount of code as replacement for a single
175instruction. This allows us to check for pending interrupts when setting EE=1 175instruction. This allows us to check for pending interrupts when setting EE=1
176for example. 176for example.
177
178Hypercall ABIs in KVM on PowerPC
179=================================
1801) KVM hypercalls (ePAPR)
181
 182This is an ePAPR-compliant hypercall implementation (mentioned above). Even
183generic hypercalls are implemented here, like the ePAPR idle hcall. These are
184available on all targets.
185
1862) PAPR hypercalls
187
188PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU).
 189These are the same hypercalls that pHyp, the POWER hypervisor, implements. Some of
190them are handled in the kernel, some are handled in user space. This is only
191available on book3s_64.
192
1933) OSI hypercalls
194
195Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long
196before KVM). This is supported to maintain compatibility. All these hypercalls get
197forwarded to user space. This is only useful on book3s_32, but can be used with
198book3s_64 as well.
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd77cb507c1c..8b3a9c0e771d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
924 return 0; 924 return 0;
925} 925}
926 926
927int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
928{
929 if (!irqchip_in_kernel(kvm))
930 return -ENXIO;
931
932 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
933 irq_event->irq, irq_event->level);
934 return 0;
935}
936
927long kvm_arch_vm_ioctl(struct file *filp, 937long kvm_arch_vm_ioctl(struct file *filp,
928 unsigned int ioctl, unsigned long arg) 938 unsigned int ioctl, unsigned long arg)
929{ 939{
@@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
963 goto out; 973 goto out;
964 } 974 }
965 break; 975 break;
966 case KVM_IRQ_LINE_STATUS:
967 case KVM_IRQ_LINE: {
968 struct kvm_irq_level irq_event;
969
970 r = -EFAULT;
971 if (copy_from_user(&irq_event, argp, sizeof irq_event))
972 goto out;
973 r = -ENXIO;
974 if (irqchip_in_kernel(kvm)) {
975 __s32 status;
976 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
977 irq_event.irq, irq_event.level);
978 if (ioctl == KVM_IRQ_LINE_STATUS) {
979 r = -EFAULT;
980 irq_event.status = status;
981 if (copy_to_user(argp, &irq_event,
982 sizeof irq_event))
983 goto out;
984 }
985 r = 0;
986 }
987 break;
988 }
989 case KVM_GET_IRQCHIP: { 976 case KVM_GET_IRQCHIP: {
990 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 977 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
991 struct kvm_irqchip chip; 978 struct kvm_irqchip chip;
@@ -1626,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
1626 return; 1613 return;
1627} 1614}
1628 1615
1629void kvm_arch_flush_shadow(struct kvm *kvm) 1616void kvm_arch_flush_shadow_all(struct kvm *kvm)
1630{ 1617{
1631 kvm_flush_remote_tlbs(kvm); 1618 kvm_flush_remote_tlbs(kvm);
1632} 1619}
1633 1620
1621void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1622 struct kvm_memory_slot *slot)
1623{
 1624 kvm_arch_flush_shadow_all(kvm);
1625}
1626
1634long kvm_arch_dev_ioctl(struct file *filp, 1627long kvm_arch_dev_ioctl(struct file *filp,
1635 unsigned int ioctl, unsigned long arg) 1628 unsigned int ioctl, unsigned long arg)
1636{ 1629{
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a8bf5c673a3c..28e8f5e5c63e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -53,6 +53,8 @@
53 53
54struct kvm; 54struct kvm;
55extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 55extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
56extern int kvm_unmap_hva_range(struct kvm *kvm,
57 unsigned long start, unsigned long end);
56extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); 58extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
57extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 59extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
58extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 60extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
@@ -220,6 +222,7 @@ struct revmap_entry {
220#define KVMPPC_GOT_PAGE 0x80 222#define KVMPPC_GOT_PAGE 0x80
221 223
222struct kvm_arch_memory_slot { 224struct kvm_arch_memory_slot {
225 unsigned long *rmap;
223}; 226};
224 227
225struct kvm_arch { 228struct kvm_arch {
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 33aa715dab28..5dd3ab469976 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
319 if (is_error_page(new_page)) { 319 if (is_error_page(new_page)) {
320 printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", 320 printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
321 (unsigned long long)gfn); 321 (unsigned long long)gfn);
322 kvm_release_page_clean(new_page);
323 return; 322 return;
324 } 323 }
325 hpaddr = page_to_phys(new_page); 324 hpaddr = page_to_phys(new_page);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f7b058..d95d11322a15 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
705 goto out_unlock; 705 goto out_unlock;
706 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 706 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
707 707
708 rmap = &memslot->rmap[gfn - memslot->base_gfn]; 708 rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
709 lock_rmap(rmap); 709 lock_rmap(rmap);
710 710
711 /* Check if we might have been invalidated; let the guest retry if so */ 711 /* Check if we might have been invalidated; let the guest retry if so */
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
756 goto out_put; 756 goto out_put;
757} 757}
758 758
759static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 759static int kvm_handle_hva_range(struct kvm *kvm,
760 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 760 unsigned long start,
761 unsigned long gfn)) 761 unsigned long end,
762 int (*handler)(struct kvm *kvm,
763 unsigned long *rmapp,
764 unsigned long gfn))
762{ 765{
763 int ret; 766 int ret;
764 int retval = 0; 767 int retval = 0;
@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
767 770
768 slots = kvm_memslots(kvm); 771 slots = kvm_memslots(kvm);
769 kvm_for_each_memslot(memslot, slots) { 772 kvm_for_each_memslot(memslot, slots) {
770 unsigned long start = memslot->userspace_addr; 773 unsigned long hva_start, hva_end;
771 unsigned long end; 774 gfn_t gfn, gfn_end;
772 775
773 end = start + (memslot->npages << PAGE_SHIFT); 776 hva_start = max(start, memslot->userspace_addr);
774 if (hva >= start && hva < end) { 777 hva_end = min(end, memslot->userspace_addr +
775 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 778 (memslot->npages << PAGE_SHIFT));
779 if (hva_start >= hva_end)
780 continue;
781 /*
782 * {gfn(page) | page intersects with [hva_start, hva_end)} =
783 * {gfn, gfn+1, ..., gfn_end-1}.
784 */
785 gfn = hva_to_gfn_memslot(hva_start, memslot);
786 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
787
788 for (; gfn < gfn_end; ++gfn) {
789 gfn_t gfn_offset = gfn - memslot->base_gfn;
776 790
777 ret = handler(kvm, &memslot->rmap[gfn_offset], 791 ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
778 memslot->base_gfn + gfn_offset);
779 retval |= ret; 792 retval |= ret;
780 } 793 }
781 } 794 }
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
783 return retval; 796 return retval;
784} 797}
785 798
799static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
800 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
801 unsigned long gfn))
802{
803 return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
804}
805
786static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 806static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
787 unsigned long gfn) 807 unsigned long gfn)
788{ 808{
@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
850 return 0; 870 return 0;
851} 871}
852 872
873int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
874{
875 if (kvm->arch.using_mmu_notifiers)
876 kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
877 return 0;
878}
879
853static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 880static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
854 unsigned long gfn) 881 unsigned long gfn)
855{ 882{
@@ -1009,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1009 unsigned long *rmapp, *map; 1036 unsigned long *rmapp, *map;
1010 1037
1011 preempt_disable(); 1038 preempt_disable();
1012 rmapp = memslot->rmap; 1039 rmapp = memslot->arch.rmap;
1013 map = memslot->dirty_bitmap; 1040 map = memslot->dirty_bitmap;
1014 for (i = 0; i < memslot->npages; ++i) { 1041 for (i = 0; i < memslot->npages; ++i) {
1015 if (kvm_test_clear_dirty(kvm, rmapp)) 1042 if (kvm_test_clear_dirty(kvm, rmapp))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5c70d19494f9..fb0e821622d4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
84 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 84 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
85 return; 85 return;
86 86
87 rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); 87 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
88 lock_rmap(rmap); 88 lock_rmap(rmap);
89 89
90 head = *rmap & KVMPPC_RMAP_INDEX; 90 head = *rmap & KVMPPC_RMAP_INDEX;
@@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
180 if (!slot_is_aligned(memslot, psize)) 180 if (!slot_is_aligned(memslot, psize))
181 return H_PARAMETER; 181 return H_PARAMETER;
182 slot_fn = gfn - memslot->base_gfn; 182 slot_fn = gfn - memslot->base_gfn;
183 rmap = &memslot->rmap[slot_fn]; 183 rmap = &memslot->arch.rmap[slot_fn];
184 184
185 if (!kvm->arch.using_mmu_notifiers) { 185 if (!kvm->arch.using_mmu_notifiers) {
186 physp = kvm->arch.slot_phys[memslot->id]; 186 physp = kvm->arch.slot_phys[memslot->id];
@@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
197 pa &= PAGE_MASK; 197 pa &= PAGE_MASK;
198 } else { 198 } else {
199 /* Translate to host virtual address */ 199 /* Translate to host virtual address */
200 hva = gfn_to_hva_memslot(memslot, gfn); 200 hva = __gfn_to_hva_memslot(memslot, gfn);
201 201
202 /* Look up the Linux PTE for the backing page */ 202 /* Look up the Linux PTE for the backing page */
203 pte_size = psize; 203 pte_size = psize;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a1baec340f7e..05c28f59f77f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
242 int i; 242 int i;
243 243
244 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); 244 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
245 if (is_error_page(hpage)) { 245 if (is_error_page(hpage))
246 kvm_release_page_clean(hpage);
247 return; 246 return;
248 }
249 247
250 hpage_offset = pte->raddr & ~PAGE_MASK; 248 hpage_offset = pte->raddr & ~PAGE_MASK;
251 hpage_offset &= ~0xFFFULL; 249 hpage_offset &= ~0xFFFULL;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index a2b66717813d..ff38b664195d 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -520,11 +520,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
520 520
521 if (likely(!pfnmap)) { 521 if (likely(!pfnmap)) {
522 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); 522 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
523 pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); 523 pfn = gfn_to_pfn_memslot(slot, gfn);
524 if (is_error_pfn(pfn)) { 524 if (is_error_pfn(pfn)) {
525 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", 525 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
526 (long)gfn); 526 (long)gfn);
527 kvm_release_pfn_clean(pfn);
528 return; 527 return;
529 } 528 }
530 529
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc886076..4d213b8b0fb5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
302void kvm_arch_free_memslot(struct kvm_memory_slot *free, 302void kvm_arch_free_memslot(struct kvm_memory_slot *free,
303 struct kvm_memory_slot *dont) 303 struct kvm_memory_slot *dont)
304{ 304{
305 if (!dont || free->arch.rmap != dont->arch.rmap) {
306 vfree(free->arch.rmap);
307 free->arch.rmap = NULL;
308 }
305} 309}
306 310
307int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 311int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
308{ 312{
313 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
314 if (!slot->arch.rmap)
315 return -ENOMEM;
316
309 return 0; 317 return 0;
310} 318}
311 319
@@ -326,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
326 kvmppc_core_commit_memory_region(kvm, mem); 334 kvmppc_core_commit_memory_region(kvm, mem);
327} 335}
328 336
337void kvm_arch_flush_shadow_all(struct kvm *kvm)
338{
339}
329 340
330void kvm_arch_flush_shadow(struct kvm *kvm) 341void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
342 struct kvm_memory_slot *slot)
331{ 343{
332} 344}
333 345
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index f3e0aabfc6bc..56831dfa9198 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -159,6 +159,7 @@ extern unsigned long thread_saved_pc(struct task_struct *t);
159 159
160extern void show_code(struct pt_regs *regs); 160extern void show_code(struct pt_regs *regs);
161extern void print_fn_code(unsigned char *code, unsigned long len); 161extern void print_fn_code(unsigned char *code, unsigned long len);
162extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
162 163
163unsigned long get_wchan(struct task_struct *p); 164unsigned long get_wchan(struct task_struct *p);
164#define task_pt_regs(tsk) ((struct pt_regs *) \ 165#define task_pt_regs(tsk) ((struct pt_regs *) \
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index cc84a24c023f..f00286bd2ef9 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -1501,6 +1501,33 @@ static struct insn *find_insn(unsigned char *code)
1501 return NULL; 1501 return NULL;
1502} 1502}
1503 1503
1504/**
1505 * insn_to_mnemonic - decode an s390 instruction
1506 * @instruction: instruction to decode
1507 * @buf: buffer to fill with mnemonic
1508 *
1509 * Decode the instruction at @instruction and store the corresponding
1510 * mnemonic into @buf.
1511 * @buf is left unchanged if the instruction could not be decoded.
1512 * Returns:
1513 * %0 on success, %-ENOENT if the instruction was not found.
1514 */
1515int insn_to_mnemonic(unsigned char *instruction, char buf[8])
1516{
1517 struct insn *insn;
1518
1519 insn = find_insn(instruction);
1520 if (!insn)
1521 return -ENOENT;
1522 if (insn->name[0] == '\0')
1523 snprintf(buf, sizeof(buf), "%s",
1524 long_insn_name[(int) insn->name[1]]);
1525 else
1526 snprintf(buf, sizeof(buf), "%.5s", insn->name);
1527 return 0;
1528}
1529EXPORT_SYMBOL_GPL(insn_to_mnemonic);
1530
1504static int print_insn(char *buffer, unsigned char *code, unsigned long addr) 1531static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
1505{ 1532{
1506 struct insn *insn; 1533 struct insn *insn;
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 9b04a32e5695..b58dd869cb32 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 depends on HAVE_KVM && EXPERIMENTAL 21 depends on HAVE_KVM && EXPERIMENTAL
22 select PREEMPT_NOTIFIERS 22 select PREEMPT_NOTIFIERS
23 select ANON_INODES 23 select ANON_INODES
24 select HAVE_KVM_CPU_RELAX_INTERCEPT
24 ---help--- 25 ---help---
25 Support hosting paravirtualized guest machines using the SIE 26 Support hosting paravirtualized guest machines using the SIE
26 virtualization capability on the mainframe. This should work 27 virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index c88bb7793390..a390687feb13 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -14,6 +14,8 @@
14#include <linux/kvm.h> 14#include <linux/kvm.h>
15#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
16#include "kvm-s390.h" 16#include "kvm-s390.h"
17#include "trace.h"
18#include "trace-s390.h"
17 19
18static int diag_release_pages(struct kvm_vcpu *vcpu) 20static int diag_release_pages(struct kvm_vcpu *vcpu)
19{ 21{
@@ -98,6 +100,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
98 vcpu->run->exit_reason = KVM_EXIT_S390_RESET; 100 vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
99 VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx", 101 VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx",
100 vcpu->run->s390_reset_flags); 102 vcpu->run->s390_reset_flags);
103 trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags);
101 return -EREMOTE; 104 return -EREMOTE;
102} 105}
103 106
@@ -105,6 +108,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
105{ 108{
106 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; 109 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
107 110
111 trace_kvm_s390_handle_diag(vcpu, code);
108 switch (code) { 112 switch (code) {
109 case 0x10: 113 case 0x10:
110 return diag_release_pages(vcpu); 114 return diag_release_pages(vcpu);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index adae539f12e2..22798ec33fd1 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -19,6 +19,8 @@
19 19
20#include "kvm-s390.h" 20#include "kvm-s390.h"
21#include "gaccess.h" 21#include "gaccess.h"
22#include "trace.h"
23#include "trace-s390.h"
22 24
23static int handle_lctlg(struct kvm_vcpu *vcpu) 25static int handle_lctlg(struct kvm_vcpu *vcpu)
24{ 26{
@@ -45,6 +47,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
45 47
46 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 48 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
47 disp2); 49 disp2);
50 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
48 51
49 do { 52 do {
50 rc = get_guest_u64(vcpu, useraddr, 53 rc = get_guest_u64(vcpu, useraddr,
@@ -82,6 +85,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
82 85
83 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 86 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
84 disp2); 87 disp2);
88 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
85 89
86 reg = reg1; 90 reg = reg1;
87 do { 91 do {
@@ -135,6 +139,8 @@ static int handle_stop(struct kvm_vcpu *vcpu)
135 vcpu->stat.exit_stop_request++; 139 vcpu->stat.exit_stop_request++;
136 spin_lock_bh(&vcpu->arch.local_int.lock); 140 spin_lock_bh(&vcpu->arch.local_int.lock);
137 141
142 trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
143
138 if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { 144 if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
139 vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; 145 vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
140 rc = SIE_INTERCEPT_RERUNVCPU; 146 rc = SIE_INTERCEPT_RERUNVCPU;
@@ -171,6 +177,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
171 int rc; 177 int rc;
172 178
173 vcpu->stat.exit_validity++; 179 vcpu->stat.exit_validity++;
180 trace_kvm_s390_intercept_validity(vcpu, viwhy);
174 if (viwhy == 0x37) { 181 if (viwhy == 0x37) {
175 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, 182 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
176 vcpu->arch.gmap); 183 vcpu->arch.gmap);
@@ -213,6 +220,9 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
213 intercept_handler_t handler; 220 intercept_handler_t handler;
214 221
215 vcpu->stat.exit_instruction++; 222 vcpu->stat.exit_instruction++;
223 trace_kvm_s390_intercept_instruction(vcpu,
224 vcpu->arch.sie_block->ipa,
225 vcpu->arch.sie_block->ipb);
216 handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8]; 226 handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
217 if (handler) 227 if (handler)
218 return handler(vcpu); 228 return handler(vcpu);
@@ -222,6 +232,7 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
222static int handle_prog(struct kvm_vcpu *vcpu) 232static int handle_prog(struct kvm_vcpu *vcpu)
223{ 233{
224 vcpu->stat.exit_program_interruption++; 234 vcpu->stat.exit_program_interruption++;
235 trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
225 return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc); 236 return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
226} 237}
227 238
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b7bc1aac8ed2..ff1e2f8ef94a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -19,6 +19,7 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include "kvm-s390.h" 20#include "kvm-s390.h"
21#include "gaccess.h" 21#include "gaccess.h"
22#include "trace-s390.h"
22 23
23static int psw_extint_disabled(struct kvm_vcpu *vcpu) 24static int psw_extint_disabled(struct kvm_vcpu *vcpu)
24{ 25{
@@ -130,6 +131,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
130 case KVM_S390_INT_EMERGENCY: 131 case KVM_S390_INT_EMERGENCY:
131 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); 132 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
132 vcpu->stat.deliver_emergency_signal++; 133 vcpu->stat.deliver_emergency_signal++;
134 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
135 inti->emerg.code, 0);
133 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); 136 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
134 if (rc == -EFAULT) 137 if (rc == -EFAULT)
135 exception = 1; 138 exception = 1;
@@ -152,6 +155,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
152 case KVM_S390_INT_EXTERNAL_CALL: 155 case KVM_S390_INT_EXTERNAL_CALL:
153 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); 156 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
154 vcpu->stat.deliver_external_call++; 157 vcpu->stat.deliver_external_call++;
158 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
159 inti->extcall.code, 0);
155 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); 160 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
156 if (rc == -EFAULT) 161 if (rc == -EFAULT)
157 exception = 1; 162 exception = 1;
@@ -175,6 +180,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
175 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", 180 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
176 inti->ext.ext_params); 181 inti->ext.ext_params);
177 vcpu->stat.deliver_service_signal++; 182 vcpu->stat.deliver_service_signal++;
183 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
184 inti->ext.ext_params, 0);
178 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); 185 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
179 if (rc == -EFAULT) 186 if (rc == -EFAULT)
180 exception = 1; 187 exception = 1;
@@ -198,6 +205,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
198 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", 205 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
199 inti->ext.ext_params, inti->ext.ext_params2); 206 inti->ext.ext_params, inti->ext.ext_params2);
200 vcpu->stat.deliver_virtio_interrupt++; 207 vcpu->stat.deliver_virtio_interrupt++;
208 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
209 inti->ext.ext_params,
210 inti->ext.ext_params2);
201 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); 211 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
202 if (rc == -EFAULT) 212 if (rc == -EFAULT)
203 exception = 1; 213 exception = 1;
@@ -229,6 +239,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
229 case KVM_S390_SIGP_STOP: 239 case KVM_S390_SIGP_STOP:
230 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); 240 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
231 vcpu->stat.deliver_stop_signal++; 241 vcpu->stat.deliver_stop_signal++;
242 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
243 0, 0);
232 __set_intercept_indicator(vcpu, inti); 244 __set_intercept_indicator(vcpu, inti);
233 break; 245 break;
234 246
@@ -236,12 +248,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
236 VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", 248 VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
237 inti->prefix.address); 249 inti->prefix.address);
238 vcpu->stat.deliver_prefix_signal++; 250 vcpu->stat.deliver_prefix_signal++;
251 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
252 inti->prefix.address, 0);
239 kvm_s390_set_prefix(vcpu, inti->prefix.address); 253 kvm_s390_set_prefix(vcpu, inti->prefix.address);
240 break; 254 break;
241 255
242 case KVM_S390_RESTART: 256 case KVM_S390_RESTART:
243 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); 257 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
244 vcpu->stat.deliver_restart_signal++; 258 vcpu->stat.deliver_restart_signal++;
259 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
260 0, 0);
245 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, 261 rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
246 restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 262 restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
247 if (rc == -EFAULT) 263 if (rc == -EFAULT)
@@ -259,6 +275,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
259 inti->pgm.code, 275 inti->pgm.code,
260 table[vcpu->arch.sie_block->ipa >> 14]); 276 table[vcpu->arch.sie_block->ipa >> 14]);
261 vcpu->stat.deliver_program_int++; 277 vcpu->stat.deliver_program_int++;
278 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
279 inti->pgm.code, 0);
262 rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); 280 rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
263 if (rc == -EFAULT) 281 if (rc == -EFAULT)
264 exception = 1; 282 exception = 1;
@@ -405,9 +423,7 @@ no_timer:
405 set_current_state(TASK_INTERRUPTIBLE); 423 set_current_state(TASK_INTERRUPTIBLE);
406 spin_unlock_bh(&vcpu->arch.local_int.lock); 424 spin_unlock_bh(&vcpu->arch.local_int.lock);
407 spin_unlock(&vcpu->arch.local_int.float_int->lock); 425 spin_unlock(&vcpu->arch.local_int.float_int->lock);
408 vcpu_put(vcpu);
409 schedule(); 426 schedule();
410 vcpu_load(vcpu);
411 spin_lock(&vcpu->arch.local_int.float_int->lock); 427 spin_lock(&vcpu->arch.local_int.float_int->lock);
412 spin_lock_bh(&vcpu->arch.local_int.lock); 428 spin_lock_bh(&vcpu->arch.local_int.lock);
413 } 429 }
@@ -515,6 +531,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
515 inti->pgm.code = code; 531 inti->pgm.code = code;
516 532
517 VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); 533 VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
534 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
518 spin_lock_bh(&li->lock); 535 spin_lock_bh(&li->lock);
519 list_add(&inti->list, &li->list); 536 list_add(&inti->list, &li->list);
520 atomic_set(&li->active, 1); 537 atomic_set(&li->active, 1);
@@ -556,6 +573,8 @@ int kvm_s390_inject_vm(struct kvm *kvm,
556 kfree(inti); 573 kfree(inti);
557 return -EINVAL; 574 return -EINVAL;
558 } 575 }
576 trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
577 2);
559 578
560 mutex_lock(&kvm->lock); 579 mutex_lock(&kvm->lock);
561 fi = &kvm->arch.float_int; 580 fi = &kvm->arch.float_int;
@@ -621,6 +640,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
621 kfree(inti); 640 kfree(inti);
622 return -EINVAL; 641 return -EINVAL;
623 } 642 }
643 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
644 s390int->parm64, 2);
624 645
625 mutex_lock(&vcpu->kvm->lock); 646 mutex_lock(&vcpu->kvm->lock);
626 li = &vcpu->arch.local_int; 647 li = &vcpu->arch.local_int;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d470ccbfabae..ecced9d18986 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -32,6 +32,10 @@
32#include "kvm-s390.h" 32#include "kvm-s390.h"
33#include "gaccess.h" 33#include "gaccess.h"
34 34
35#define CREATE_TRACE_POINTS
36#include "trace.h"
37#include "trace-s390.h"
38
35#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 39#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
36 40
37struct kvm_stats_debugfs_item debugfs_entries[] = { 41struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -242,6 +246,7 @@ out_err:
242void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 246void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
243{ 247{
244 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 248 VCPU_EVENT(vcpu, 3, "%s", "free cpu");
249 trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
245 if (!kvm_is_ucontrol(vcpu->kvm)) { 250 if (!kvm_is_ucontrol(vcpu->kvm)) {
246 clear_bit(63 - vcpu->vcpu_id, 251 clear_bit(63 - vcpu->vcpu_id,
247 (unsigned long *) &vcpu->kvm->arch.sca->mcn); 252 (unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -417,6 +422,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
417 goto out_free_sie_block; 422 goto out_free_sie_block;
418 VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, 423 VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
419 vcpu->arch.sie_block); 424 vcpu->arch.sie_block);
425 trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
420 426
421 return vcpu; 427 return vcpu;
422out_free_sie_block: 428out_free_sie_block:
@@ -607,18 +613,22 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
607 local_irq_enable(); 613 local_irq_enable();
608 VCPU_EVENT(vcpu, 6, "entering sie flags %x", 614 VCPU_EVENT(vcpu, 6, "entering sie flags %x",
609 atomic_read(&vcpu->arch.sie_block->cpuflags)); 615 atomic_read(&vcpu->arch.sie_block->cpuflags));
616 trace_kvm_s390_sie_enter(vcpu,
617 atomic_read(&vcpu->arch.sie_block->cpuflags));
610 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); 618 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
611 if (rc) { 619 if (rc) {
612 if (kvm_is_ucontrol(vcpu->kvm)) { 620 if (kvm_is_ucontrol(vcpu->kvm)) {
613 rc = SIE_INTERCEPT_UCONTROL; 621 rc = SIE_INTERCEPT_UCONTROL;
614 } else { 622 } else {
615 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); 623 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
624 trace_kvm_s390_sie_fault(vcpu);
616 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 625 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
617 rc = 0; 626 rc = 0;
618 } 627 }
619 } 628 }
620 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", 629 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
621 vcpu->arch.sie_block->icptcode); 630 vcpu->arch.sie_block->icptcode);
631 trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
622 local_irq_disable(); 632 local_irq_disable();
623 kvm_guest_exit(); 633 kvm_guest_exit();
624 local_irq_enable(); 634 local_irq_enable();
@@ -959,7 +969,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
959 return; 969 return;
960} 970}
961 971
962void kvm_arch_flush_shadow(struct kvm *kvm) 972void kvm_arch_flush_shadow_all(struct kvm *kvm)
973{
974}
975
976void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
977 struct kvm_memory_slot *slot)
963{ 978{
964} 979}
965 980
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 310be61bead7..d768906f15c8 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -20,6 +20,7 @@
20#include <asm/sysinfo.h> 20#include <asm/sysinfo.h>
21#include "gaccess.h" 21#include "gaccess.h"
22#include "kvm-s390.h" 22#include "kvm-s390.h"
23#include "trace.h"
23 24
24static int handle_set_prefix(struct kvm_vcpu *vcpu) 25static int handle_set_prefix(struct kvm_vcpu *vcpu)
25{ 26{
@@ -59,6 +60,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
59 kvm_s390_set_prefix(vcpu, address); 60 kvm_s390_set_prefix(vcpu, address);
60 61
61 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); 62 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
63 trace_kvm_s390_handle_prefix(vcpu, 1, address);
62out: 64out:
63 return 0; 65 return 0;
64} 66}
@@ -91,6 +93,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
91 } 93 }
92 94
93 VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); 95 VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
96 trace_kvm_s390_handle_prefix(vcpu, 0, address);
94out: 97out:
95 return 0; 98 return 0;
96} 99}
@@ -119,6 +122,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
119 } 122 }
120 123
121 VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); 124 VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
125 trace_kvm_s390_handle_stap(vcpu, useraddr);
122out: 126out:
123 return 0; 127 return 0;
124} 128}
@@ -164,9 +168,11 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
164 &facility_list, sizeof(facility_list)); 168 &facility_list, sizeof(facility_list));
165 if (rc == -EFAULT) 169 if (rc == -EFAULT)
166 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 170 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
167 else 171 else {
168 VCPU_EVENT(vcpu, 5, "store facility list value %x", 172 VCPU_EVENT(vcpu, 5, "store facility list value %x",
169 facility_list); 173 facility_list);
174 trace_kvm_s390_handle_stfl(vcpu, facility_list);
175 }
170 return 0; 176 return 0;
171} 177}
172 178
@@ -278,6 +284,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
278 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 284 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
279 goto out_mem; 285 goto out_mem;
280 } 286 }
287 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
281 free_page(mem); 288 free_page(mem);
282 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 289 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
283 vcpu->run->s.regs.gprs[0] = 0; 290 vcpu->run->s.regs.gprs[0] = 0;
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 56f80e1f98f7..566ddf6e8dfb 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -18,6 +18,7 @@
18#include <asm/sigp.h> 18#include <asm/sigp.h>
19#include "gaccess.h" 19#include "gaccess.h"
20#include "kvm-s390.h" 20#include "kvm-s390.h"
21#include "trace.h"
21 22
22static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, 23static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
23 u64 *reg) 24 u64 *reg)
@@ -344,6 +345,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
344 else 345 else
345 parameter = vcpu->run->s.regs.gprs[r1 + 1]; 346 parameter = vcpu->run->s.regs.gprs[r1 + 1];
346 347
348 trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter);
347 switch (order_code) { 349 switch (order_code) {
348 case SIGP_SENSE: 350 case SIGP_SENSE:
349 vcpu->stat.instruction_sigp_sense++; 351 vcpu->stat.instruction_sigp_sense++;
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
new file mode 100644
index 000000000000..90fdf85b5ff7
--- /dev/null
+++ b/arch/s390/kvm/trace-s390.h
@@ -0,0 +1,210 @@
1#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVMS390_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm-s390
8#define TRACE_INCLUDE_PATH .
9#undef TRACE_INCLUDE_FILE
10#define TRACE_INCLUDE_FILE trace-s390
11
12/*
13 * Trace point for the creation of the kvm instance.
14 */
15TRACE_EVENT(kvm_s390_create_vm,
16 TP_PROTO(unsigned long type),
17 TP_ARGS(type),
18
19 TP_STRUCT__entry(
20 __field(unsigned long, type)
21 ),
22
23 TP_fast_assign(
24 __entry->type = type;
25 ),
26
27 TP_printk("create vm%s",
28 __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "")
29 );
30
31/*
32 * Trace points for creation and destruction of vpcus.
33 */
34TRACE_EVENT(kvm_s390_create_vcpu,
35 TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu,
36 struct kvm_s390_sie_block *sie_block),
37 TP_ARGS(id, vcpu, sie_block),
38
39 TP_STRUCT__entry(
40 __field(unsigned int, id)
41 __field(struct kvm_vcpu *, vcpu)
42 __field(struct kvm_s390_sie_block *, sie_block)
43 ),
44
45 TP_fast_assign(
46 __entry->id = id;
47 __entry->vcpu = vcpu;
48 __entry->sie_block = sie_block;
49 ),
50
51 TP_printk("create cpu %d at %p, sie block at %p", __entry->id,
52 __entry->vcpu, __entry->sie_block)
53 );
54
55TRACE_EVENT(kvm_s390_destroy_vcpu,
56 TP_PROTO(unsigned int id),
57 TP_ARGS(id),
58
59 TP_STRUCT__entry(
60 __field(unsigned int, id)
61 ),
62
63 TP_fast_assign(
64 __entry->id = id;
65 ),
66
67 TP_printk("destroy cpu %d", __entry->id)
68 );
69
70/*
71 * Trace points for injection of interrupts, either per machine or
72 * per vcpu.
73 */
74
75#define kvm_s390_int_type \
76 {KVM_S390_SIGP_STOP, "sigp stop"}, \
77 {KVM_S390_PROGRAM_INT, "program interrupt"}, \
78 {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \
79 {KVM_S390_RESTART, "sigp restart"}, \
80 {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \
81 {KVM_S390_INT_SERVICE, "sclp interrupt"}, \
82 {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \
83 {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"}
84
85TRACE_EVENT(kvm_s390_inject_vm,
86 TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who),
87 TP_ARGS(type, parm, parm64, who),
88
89 TP_STRUCT__entry(
90 __field(__u32, inttype)
91 __field(__u32, parm)
92 __field(__u64, parm64)
93 __field(int, who)
94 ),
95
96 TP_fast_assign(
97 __entry->inttype = type & 0x00000000ffffffff;
98 __entry->parm = parm;
99 __entry->parm64 = parm64;
100 __entry->who = who;
101 ),
102
103 TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx",
104 (__entry->who == 1) ? " (from kernel)" :
105 (__entry->who == 2) ? " (from user)" : "",
106 __entry->inttype,
107 __print_symbolic(__entry->inttype, kvm_s390_int_type),
108 __entry->parm, __entry->parm64)
109 );
110
111TRACE_EVENT(kvm_s390_inject_vcpu,
112 TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \
113 int who),
114 TP_ARGS(id, type, parm, parm64, who),
115
116 TP_STRUCT__entry(
117 __field(int, id)
118 __field(__u32, inttype)
119 __field(__u32, parm)
120 __field(__u64, parm64)
121 __field(int, who)
122 ),
123
124 TP_fast_assign(
125 __entry->id = id;
126 __entry->inttype = type & 0x00000000ffffffff;
127 __entry->parm = parm;
128 __entry->parm64 = parm64;
129 __entry->who = who;
130 ),
131
132 TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx",
133 (__entry->who == 1) ? " (from kernel)" :
134 (__entry->who == 2) ? " (from user)" : "",
135 __entry->id, __entry->inttype,
136 __print_symbolic(__entry->inttype, kvm_s390_int_type),
137 __entry->parm, __entry->parm64)
138 );
139
140/*
141 * Trace point for the actual delivery of interrupts.
142 */
143TRACE_EVENT(kvm_s390_deliver_interrupt,
144 TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
145 TP_ARGS(id, type, data0, data1),
146
147 TP_STRUCT__entry(
148 __field(int, id)
149 __field(__u32, inttype)
150 __field(__u32, data0)
151 __field(__u64, data1)
152 ),
153
154 TP_fast_assign(
155 __entry->id = id;
156 __entry->inttype = type & 0x00000000ffffffff;
157 __entry->data0 = data0;
158 __entry->data1 = data1;
159 ),
160
161 TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
162 "data:%08x %016llx",
163 __entry->id, __entry->inttype,
164 __print_symbolic(__entry->inttype, kvm_s390_int_type),
165 __entry->data0, __entry->data1)
166 );
167
168/*
169 * Trace point for resets that may be requested from userspace.
170 */
171TRACE_EVENT(kvm_s390_request_resets,
172 TP_PROTO(__u64 resets),
173 TP_ARGS(resets),
174
175 TP_STRUCT__entry(
176 __field(__u64, resets)
177 ),
178
179 TP_fast_assign(
180 __entry->resets = resets;
181 ),
182
183 TP_printk("requesting userspace resets %llx",
184 __entry->resets)
185 );
186
187/*
188 * Trace point for a vcpu's stop requests.
189 */
190TRACE_EVENT(kvm_s390_stop_request,
191 TP_PROTO(unsigned int action_bits),
192 TP_ARGS(action_bits),
193
194 TP_STRUCT__entry(
195 __field(unsigned int, action_bits)
196 ),
197
198 TP_fast_assign(
199 __entry->action_bits = action_bits;
200 ),
201
202 TP_printk("stop request, action_bits = %08x",
203 __entry->action_bits)
204 );
205
206
207#endif /* _TRACE_KVMS390_H */
208
209/* This part must be outside protection */
210#include <trace/define_trace.h>
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
new file mode 100644
index 000000000000..2b29e62351d3
--- /dev/null
+++ b/arch/s390/kvm/trace.h
@@ -0,0 +1,341 @@
1#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_H
3
4#include <linux/tracepoint.h>
5#include <asm/sigp.h>
6#include <asm/debug.h>
7
8#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvm
10#define TRACE_INCLUDE_PATH .
11#undef TRACE_INCLUDE_FILE
12#define TRACE_INCLUDE_FILE trace
13
14/*
15 * Helpers for vcpu-specific tracepoints containing the same information
16 * as s390dbf VCPU_EVENTs.
17 */
18#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu
19#define VCPU_ARGS_COMMON vcpu
20#define VCPU_FIELD_COMMON __field(int, id) \
21 __field(unsigned long, pswmask) \
22 __field(unsigned long, pswaddr)
23#define VCPU_ASSIGN_COMMON do { \
24 __entry->id = vcpu->vcpu_id; \
25 __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \
26 __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \
27 } while (0);
28#define VCPU_TP_PRINTK(p_str, p_args...) \
29 TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \
30 __entry->pswmask, __entry->pswaddr, p_args)
31
32/*
33 * Tracepoints for SIE entry and exit.
34 */
35TRACE_EVENT(kvm_s390_sie_enter,
36 TP_PROTO(VCPU_PROTO_COMMON, int cpuflags),
37 TP_ARGS(VCPU_ARGS_COMMON, cpuflags),
38
39 TP_STRUCT__entry(
40 VCPU_FIELD_COMMON
41 __field(int, cpuflags)
42 ),
43
44 TP_fast_assign(
45 VCPU_ASSIGN_COMMON
46 __entry->cpuflags = cpuflags;
47 ),
48
49 VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags)
50 );
51
52TRACE_EVENT(kvm_s390_sie_fault,
53 TP_PROTO(VCPU_PROTO_COMMON),
54 TP_ARGS(VCPU_ARGS_COMMON),
55
56 TP_STRUCT__entry(
57 VCPU_FIELD_COMMON
58 ),
59
60 TP_fast_assign(
61 VCPU_ASSIGN_COMMON
62 ),
63
64 VCPU_TP_PRINTK("%s", "fault in sie instruction")
65 );
66
67#define sie_intercept_code \
68 {0x04, "Instruction"}, \
69 {0x08, "Program interruption"}, \
70 {0x0C, "Instruction and program interuption"}, \
71 {0x10, "External request"}, \
72 {0x14, "External interruption"}, \
73 {0x18, "I/O request"}, \
74 {0x1C, "Wait state"}, \
75 {0x20, "Validity"}, \
76 {0x28, "Stop request"}
77
78TRACE_EVENT(kvm_s390_sie_exit,
79 TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
80 TP_ARGS(VCPU_ARGS_COMMON, icptcode),
81
82 TP_STRUCT__entry(
83 VCPU_FIELD_COMMON
84 __field(u8, icptcode)
85 ),
86
87 TP_fast_assign(
88 VCPU_ASSIGN_COMMON
89 __entry->icptcode = icptcode;
90 ),
91
92 VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode,
93 __print_symbolic(__entry->icptcode,
94 sie_intercept_code))
95 );
96
97/*
98 * Trace point for intercepted instructions.
99 */
100TRACE_EVENT(kvm_s390_intercept_instruction,
101 TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
102 TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
103
104 TP_STRUCT__entry(
105 VCPU_FIELD_COMMON
106 __field(__u64, instruction)
107 __field(char, insn[8])
108 ),
109
110 TP_fast_assign(
111 VCPU_ASSIGN_COMMON
112 __entry->instruction = ((__u64)ipa << 48) |
113 ((__u64)ipb << 16);
114 ),
115
116 VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
117 __entry->instruction,
118 insn_to_mnemonic((unsigned char *)
119 &__entry->instruction,
120 __entry->insn) ?
121 "unknown" : __entry->insn)
122 );
123
124/*
125 * Trace point for intercepted program interruptions.
126 */
127TRACE_EVENT(kvm_s390_intercept_prog,
128 TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
129 TP_ARGS(VCPU_ARGS_COMMON, code),
130
131 TP_STRUCT__entry(
132 VCPU_FIELD_COMMON
133 __field(__u16, code)
134 ),
135
136 TP_fast_assign(
137 VCPU_ASSIGN_COMMON
138 __entry->code = code;
139 ),
140
141 VCPU_TP_PRINTK("intercepted program interruption %04x",
142 __entry->code)
143 );
144
145/*
146 * Trace point for validity intercepts.
147 */
148TRACE_EVENT(kvm_s390_intercept_validity,
149 TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy),
150 TP_ARGS(VCPU_ARGS_COMMON, viwhy),
151
152 TP_STRUCT__entry(
153 VCPU_FIELD_COMMON
154 __field(__u16, viwhy)
155 ),
156
157 TP_fast_assign(
158 VCPU_ASSIGN_COMMON
159 __entry->viwhy = viwhy;
160 ),
161
162 VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy)
163 );
164
165/*
166 * Trace points for instructions that are of special interest.
167 */
168
169#define sigp_order_codes \
170 {SIGP_SENSE, "sense"}, \
171 {SIGP_EXTERNAL_CALL, "external call"}, \
172 {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \
173 {SIGP_STOP, "stop"}, \
174 {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \
175 {SIGP_SET_ARCHITECTURE, "set architecture"}, \
176 {SIGP_SET_PREFIX, "set prefix"}, \
177 {SIGP_SENSE_RUNNING, "sense running"}, \
178 {SIGP_RESTART, "restart"}
179
180TRACE_EVENT(kvm_s390_handle_sigp,
181 TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
182 __u32 parameter),
183 TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter),
184
185 TP_STRUCT__entry(
186 VCPU_FIELD_COMMON
187 __field(__u8, order_code)
188 __field(__u16, cpu_addr)
189 __field(__u32, parameter)
190 ),
191
192 TP_fast_assign(
193 VCPU_ASSIGN_COMMON
194 __entry->order_code = order_code;
195 __entry->cpu_addr = cpu_addr;
196 __entry->parameter = parameter;
197 ),
198
199 VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \
200 "parameter %08x", __entry->order_code,
201 __print_symbolic(__entry->order_code,
202 sigp_order_codes),
203 __entry->cpu_addr, __entry->parameter)
204 );
205
206#define diagnose_codes \
207 {0x10, "release pages"}, \
208 {0x44, "time slice end"}, \
209 {0x308, "ipl functions"}, \
210 {0x500, "kvm hypercall"}, \
211 {0x501, "kvm breakpoint"}
212
213TRACE_EVENT(kvm_s390_handle_diag,
214 TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
215 TP_ARGS(VCPU_ARGS_COMMON, code),
216
217 TP_STRUCT__entry(
218 VCPU_FIELD_COMMON
219 __field(__u16, code)
220 ),
221
222 TP_fast_assign(
223 VCPU_ASSIGN_COMMON
224 __entry->code = code;
225 ),
226
227 VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code,
228 __print_symbolic(__entry->code, diagnose_codes))
229 );
230
231TRACE_EVENT(kvm_s390_handle_lctl,
232 TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
233 TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
234
235 TP_STRUCT__entry(
236 VCPU_FIELD_COMMON
237 __field(int, g)
238 __field(int, reg1)
239 __field(int, reg3)
240 __field(u64, addr)
241 ),
242
243 TP_fast_assign(
244 VCPU_ASSIGN_COMMON
245 __entry->g = g;
246 __entry->reg1 = reg1;
247 __entry->reg3 = reg3;
248 __entry->addr = addr;
249 ),
250
251 VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx",
252 __entry->g ? "lctlg" : "lctl",
253 __entry->reg1, __entry->reg3, __entry->addr)
254 );
255
256TRACE_EVENT(kvm_s390_handle_prefix,
257 TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
258 TP_ARGS(VCPU_ARGS_COMMON, set, address),
259
260 TP_STRUCT__entry(
261 VCPU_FIELD_COMMON
262 __field(int, set)
263 __field(u32, address)
264 ),
265
266 TP_fast_assign(
267 VCPU_ASSIGN_COMMON
268 __entry->set = set;
269 __entry->address = address;
270 ),
271
272 VCPU_TP_PRINTK("%s prefix to %08x",
273 __entry->set ? "setting" : "storing",
274 __entry->address)
275 );
276
277TRACE_EVENT(kvm_s390_handle_stap,
278 TP_PROTO(VCPU_PROTO_COMMON, u64 address),
279 TP_ARGS(VCPU_ARGS_COMMON, address),
280
281 TP_STRUCT__entry(
282 VCPU_FIELD_COMMON
283 __field(u64, address)
284 ),
285
286 TP_fast_assign(
287 VCPU_ASSIGN_COMMON
288 __entry->address = address;
289 ),
290
291 VCPU_TP_PRINTK("storing cpu address to %016llx",
292 __entry->address)
293 );
294
295TRACE_EVENT(kvm_s390_handle_stfl,
296 TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list),
297 TP_ARGS(VCPU_ARGS_COMMON, facility_list),
298
299 TP_STRUCT__entry(
300 VCPU_FIELD_COMMON
301 __field(unsigned int, facility_list)
302 ),
303
304 TP_fast_assign(
305 VCPU_ASSIGN_COMMON
306 __entry->facility_list = facility_list;
307 ),
308
309 VCPU_TP_PRINTK("store facility list value %08x",
310 __entry->facility_list)
311 );
312
313TRACE_EVENT(kvm_s390_handle_stsi,
314 TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr),
315 TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr),
316
317 TP_STRUCT__entry(
318 VCPU_FIELD_COMMON
319 __field(int, fc)
320 __field(int, sel1)
321 __field(int, sel2)
322 __field(u64, addr)
323 ),
324
325 TP_fast_assign(
326 VCPU_ASSIGN_COMMON
327 __entry->fc = fc;
328 __entry->sel1 = sel1;
329 __entry->sel2 = sel2;
330 __entry->addr = addr;
331 ),
332
333 VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx",
334 __entry->fc, __entry->sel1, __entry->sel2,
335 __entry->addr)
336 );
337
338#endif /* _TRACE_KVM_H */
339
340/* This part must be outside protection */
341#include <trace/define_trace.h>
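
As a reading aid for the VCPU_* helper macros and TRACE_EVENTs defined above, the sketch below shows how a call site in the s390 KVM run loop might invoke them. The helper name, the sie64a() call form, and the sie_block field accesses are assumptions drawn from the surrounding s390 KVM code, not from this file.

static int sketch_vcpu_sie_round(struct kvm_vcpu *vcpu)
{
	int rc;

	/* cpuflags snapshot taken right before entering SIE */
	trace_kvm_s390_sie_enter(vcpu,
				 atomic_read(&vcpu->arch.sie_block->cpuflags));

	rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
	if (rc)
		/* non-zero rc treated here (simplified) as a fault in the
		 * SIE instruction itself */
		trace_kvm_s390_sie_fault(vcpu);

	/* icptcode is decoded for display via __print_symbolic() against
	 * the sie_intercept_code table above */
	trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
	return rc;
}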
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7f9a395c5254..b72777ff32a9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -586,23 +586,18 @@ config PARAVIRT_TIME_ACCOUNTING
586 586
587source "arch/x86/xen/Kconfig" 587source "arch/x86/xen/Kconfig"
588 588
589config KVM_CLOCK
590 bool "KVM paravirtualized clock"
591 select PARAVIRT
592 select PARAVIRT_CLOCK
593 ---help---
594 Turning on this option will allow you to run a paravirtualized clock
595 when running over the KVM hypervisor. Instead of relying on a PIT
596 (or probably other) emulation by the underlying device model, the host
597 provides the guest with timing infrastructure such as time of day, and
598 system time
599
600config KVM_GUEST 589config KVM_GUEST
601 bool "KVM Guest support" 590 bool "KVM Guest support (including kvmclock)"
591 select PARAVIRT
602 select PARAVIRT 592 select PARAVIRT
593 select PARAVIRT_CLOCK
594 default y if PARAVIRT_GUEST
603 ---help--- 595 ---help---
604 This option enables various optimizations for running under the KVM 596 This option enables various optimizations for running under the KVM
605 hypervisor. 597 hypervisor. It includes a paravirtualized clock, so that instead
598 of relying on a PIT (or probably other) emulation by the
599 underlying device model, the host provides the guest with
600 timing infrastructure such as time of day, and system time
606 601
607source "arch/x86/lguest/Kconfig" 602source "arch/x86/lguest/Kconfig"
608 603
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 41e08cb6a092..a65ec29e6ffb 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -41,6 +41,7 @@
41#define __KVM_HAVE_DEBUGREGS 41#define __KVM_HAVE_DEBUGREGS
42#define __KVM_HAVE_XSAVE 42#define __KVM_HAVE_XSAVE
43#define __KVM_HAVE_XCRS 43#define __KVM_HAVE_XCRS
44#define __KVM_HAVE_READONLY_MEM
44 45
45/* Architectural interrupt line count. */ 46/* Architectural interrupt line count. */
46#define KVM_NR_INTERRUPTS 256 47#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c764f43b71c5..15f960c06ff7 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -86,6 +86,19 @@ struct x86_instruction_info {
86 86
87struct x86_emulate_ops { 87struct x86_emulate_ops {
88 /* 88 /*
89 * read_gpr: read a general purpose register (rax - r15)
90 *
91 * @reg: gpr number.
92 */
93 ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
94 /*
95 * write_gpr: write a general purpose register (rax - r15)
96 *
97 * @reg: gpr number.
98 * @val: value to write.
99 */
100 void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
101 /*
89 * read_std: Read bytes of standard (non-emulated/special) memory. 102 * read_std: Read bytes of standard (non-emulated/special) memory.
90 * Used for descriptor reading. 103 * Used for descriptor reading.
91 * @addr: [IN ] Linear address from which to read. 104 * @addr: [IN ] Linear address from which to read.
@@ -200,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
200 213
201/* Type, address-of, and value of an instruction's operand. */ 214/* Type, address-of, and value of an instruction's operand. */
202struct operand { 215struct operand {
203 enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; 216 enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
204 unsigned int bytes; 217 unsigned int bytes;
218 unsigned int count;
205 union { 219 union {
206 unsigned long orig_val; 220 unsigned long orig_val;
207 u64 orig_val64; 221 u64 orig_val64;
@@ -221,6 +235,7 @@ struct operand {
221 char valptr[sizeof(unsigned long) + 2]; 235 char valptr[sizeof(unsigned long) + 2];
222 sse128_t vec_val; 236 sse128_t vec_val;
223 u64 mm_val; 237 u64 mm_val;
238 void *data;
224 }; 239 };
225}; 240};
226 241
@@ -236,14 +251,23 @@ struct read_cache {
236 unsigned long end; 251 unsigned long end;
237}; 252};
238 253
254/* Execution mode, passed to the emulator. */
255enum x86emul_mode {
256 X86EMUL_MODE_REAL, /* Real mode. */
257 X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
258 X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
259 X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
260 X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
261};
262
239struct x86_emulate_ctxt { 263struct x86_emulate_ctxt {
240 struct x86_emulate_ops *ops; 264 const struct x86_emulate_ops *ops;
241 265
242 /* Register state before/after emulation. */ 266 /* Register state before/after emulation. */
243 unsigned long eflags; 267 unsigned long eflags;
244 unsigned long eip; /* eip before instruction emulation */ 268 unsigned long eip; /* eip before instruction emulation */
245 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
246 int mode; 270 enum x86emul_mode mode;
247 271
248 /* interruptibility state, as a result of execution of STI or MOV SS */ 272 /* interruptibility state, as a result of execution of STI or MOV SS */
249 int interruptibility; 273 int interruptibility;
@@ -281,8 +305,10 @@ struct x86_emulate_ctxt {
281 bool rip_relative; 305 bool rip_relative;
282 unsigned long _eip; 306 unsigned long _eip;
283 struct operand memop; 307 struct operand memop;
308	u32 regs_valid;  /* bitmap of registers in _regs[] that can be read */
309	u32 regs_dirty;  /* bitmap of registers in _regs[] that have been written */
284 /* Fields above regs are cleared together. */ 310 /* Fields above regs are cleared together. */
285 unsigned long regs[NR_VCPU_REGS]; 311 unsigned long _regs[NR_VCPU_REGS];
286 struct operand *memopp; 312 struct operand *memopp;
287 struct fetch_cache fetch; 313 struct fetch_cache fetch;
288 struct read_cache io_read; 314 struct read_cache io_read;
@@ -293,17 +319,6 @@ struct x86_emulate_ctxt {
293#define REPE_PREFIX 0xf3 319#define REPE_PREFIX 0xf3
294#define REPNE_PREFIX 0xf2 320#define REPNE_PREFIX 0xf2
295 321
296/* Execution mode, passed to the emulator. */
297#define X86EMUL_MODE_REAL 0 /* Real mode. */
298#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
299#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
300#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
301#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
302
303/* any protected mode */
304#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
305 X86EMUL_MODE_PROT64)
306
307/* CPUID vendors */ 322/* CPUID vendors */
308#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 323#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
309#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 324#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
@@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
394 u16 tss_selector, int idt_index, int reason, 409 u16 tss_selector, int idt_index, int reason,
395 bool has_error_code, u32 error_code); 410 bool has_error_code, u32 error_code);
396int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); 411int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
412void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
413void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
414
397#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 415#endif /* _ASM_X86_KVM_X86_EMULATE_H */
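
The new read_gpr/write_gpr hooks documented above let the emulator core pull guest general purpose registers lazily and push back only what it changed, as tracked by the regs_valid/regs_dirty bitmaps added to the context. A minimal backend could look like the sketch below; emulator_read_gpr()/emulator_write_gpr(), emul_to_vcpu() and kvm_register_read()/kvm_register_write() are assumed helpers on the KVM x86 side, not declared in this header.

static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
	/* fetch one GPR from the vcpu the first time the emulator needs it */
	return kvm_register_read(emul_to_vcpu(ctxt), reg);
}

static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt,
			       unsigned reg, ulong val)
{
	/* called only for registers the emulator actually dirtied */
	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
}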
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1eaa6b056670..b2e11f452435 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -271,10 +271,24 @@ struct kvm_mmu {
271 union kvm_mmu_page_role base_role; 271 union kvm_mmu_page_role base_role;
272 bool direct_map; 272 bool direct_map;
273 273
274 /*
275 * Bitmap; bit set = permission fault
276 * Byte index: page fault error code [4:1]
277 * Bit index: pte permissions in ACC_* format
278 */
279 u8 permissions[16];
280
274 u64 *pae_root; 281 u64 *pae_root;
275 u64 *lm_root; 282 u64 *lm_root;
276 u64 rsvd_bits_mask[2][4]; 283 u64 rsvd_bits_mask[2][4];
277 284
285 /*
286 * Bitmap: bit set = last pte in walk
287 * index[0:1]: level (zero-based)
288 * index[2]: pte.ps
289 */
290 u8 last_pte_bitmap;
291
278 bool nx; 292 bool nx;
279 293
280 u64 pdptrs[4]; /* pae */ 294 u64 pdptrs[4]; /* pae */
@@ -398,12 +412,15 @@ struct kvm_vcpu_arch {
398 struct x86_emulate_ctxt emulate_ctxt; 412 struct x86_emulate_ctxt emulate_ctxt;
399 bool emulate_regs_need_sync_to_vcpu; 413 bool emulate_regs_need_sync_to_vcpu;
400 bool emulate_regs_need_sync_from_vcpu; 414 bool emulate_regs_need_sync_from_vcpu;
415 int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
401 416
402 gpa_t time; 417 gpa_t time;
403 struct pvclock_vcpu_time_info hv_clock; 418 struct pvclock_vcpu_time_info hv_clock;
404 unsigned int hw_tsc_khz; 419 unsigned int hw_tsc_khz;
405 unsigned int time_offset; 420 unsigned int time_offset;
406 struct page *time_page; 421 struct page *time_page;
422 /* set guest stopped flag in pvclock flags field */
423 bool pvclock_set_guest_stopped_request;
407 424
408 struct { 425 struct {
409 u64 msr_val; 426 u64 msr_val;
@@ -438,6 +455,7 @@ struct kvm_vcpu_arch {
438 unsigned long dr6; 455 unsigned long dr6;
439 unsigned long dr7; 456 unsigned long dr7;
440 unsigned long eff_db[KVM_NR_DB_REGS]; 457 unsigned long eff_db[KVM_NR_DB_REGS];
458 unsigned long guest_debug_dr7;
441 459
442 u64 mcg_cap; 460 u64 mcg_cap;
443 u64 mcg_status; 461 u64 mcg_status;
@@ -484,14 +502,24 @@ struct kvm_vcpu_arch {
484}; 502};
485 503
486struct kvm_lpage_info { 504struct kvm_lpage_info {
487 unsigned long rmap_pde;
488 int write_count; 505 int write_count;
489}; 506};
490 507
491struct kvm_arch_memory_slot { 508struct kvm_arch_memory_slot {
509 unsigned long *rmap[KVM_NR_PAGE_SIZES];
492 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; 510 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
493}; 511};
494 512
513struct kvm_apic_map {
514 struct rcu_head rcu;
515 u8 ldr_bits;
516	/* fields below are used to decode ldr values in different modes */
517 u32 cid_shift, cid_mask, lid_mask;
518 struct kvm_lapic *phys_map[256];
519 /* first index is cluster id second is cpu id in a cluster */
520 struct kvm_lapic *logical_map[16][16];
521};
522
495struct kvm_arch { 523struct kvm_arch {
496 unsigned int n_used_mmu_pages; 524 unsigned int n_used_mmu_pages;
497 unsigned int n_requested_mmu_pages; 525 unsigned int n_requested_mmu_pages;
@@ -509,6 +537,8 @@ struct kvm_arch {
509 struct kvm_ioapic *vioapic; 537 struct kvm_ioapic *vioapic;
510 struct kvm_pit *vpit; 538 struct kvm_pit *vpit;
511 int vapics_in_nmi_mode; 539 int vapics_in_nmi_mode;
540 struct mutex apic_map_lock;
541 struct kvm_apic_map *apic_map;
512 542
513 unsigned int tss_addr; 543 unsigned int tss_addr;
514 struct page *apic_access_page; 544 struct page *apic_access_page;
@@ -602,8 +632,7 @@ struct kvm_x86_ops {
602 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 632 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
603 void (*vcpu_put)(struct kvm_vcpu *vcpu); 633 void (*vcpu_put)(struct kvm_vcpu *vcpu);
604 634
605 void (*set_guest_debug)(struct kvm_vcpu *vcpu, 635 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
606 struct kvm_guest_debug *dbg);
607 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 636 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
608 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 637 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
609 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 638 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -941,6 +970,7 @@ extern bool kvm_rebooting;
941 970
942#define KVM_ARCH_WANT_MMU_NOTIFIER 971#define KVM_ARCH_WANT_MMU_NOTIFIER
943int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 972int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
973int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
944int kvm_age_hva(struct kvm *kvm, unsigned long hva); 974int kvm_age_hva(struct kvm *kvm, unsigned long hva);
945int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 975int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
946void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 976void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
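
The permissions[16] bitmap commented above packs one pre-computed fault decision per (error-code, pte-access) pair: the byte is selected by page-fault error-code bits [4:1] and the bit by the pte's permissions in ACC_* form. A hedged sketch of the lookup it enables follows; the helper name and the exact pfec layout are assumptions about the MMU code, not part of this header.

static bool sketch_permission_fault(struct kvm_mmu *mmu,
				    unsigned pte_access, unsigned pfec)
{
	/* pfec bits [4:1] pick the byte, the ACC_*-style pte_access the bit */
	return (mmu->permissions[(pfec >> 1) & 0xf] >> pte_access) & 1;
}

The kvm_apic_map added further down serves a similar lookup role for interrupt delivery: phys_map[] resolves a physical APIC ID directly, while logical_map[cluster][cpu] resolves logical destinations using the cid/lid masks derived from how LDR values are decoded in the current APIC mode.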
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2f7712e08b1e..eb3e9d85e1f1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -102,21 +102,21 @@ struct kvm_vcpu_pv_apf_data {
102extern void kvmclock_init(void); 102extern void kvmclock_init(void);
103extern int kvm_register_clock(char *txt); 103extern int kvm_register_clock(char *txt);
104 104
105#ifdef CONFIG_KVM_CLOCK 105#ifdef CONFIG_KVM_GUEST
106bool kvm_check_and_clear_guest_paused(void); 106bool kvm_check_and_clear_guest_paused(void);
107#else 107#else
108static inline bool kvm_check_and_clear_guest_paused(void) 108static inline bool kvm_check_and_clear_guest_paused(void)
109{ 109{
110 return false; 110 return false;
111} 111}
112#endif /* CONFIG_KVMCLOCK */ 112#endif /* CONFIG_KVM_GUEST */
113 113
114/* This instruction is vmcall. On non-VT architectures, it will generate a 114/* This instruction is vmcall. On non-VT architectures, it will generate a
115 * trap that we will then rewrite to the appropriate instruction. 115 * trap that we will then rewrite to the appropriate instruction.
116 */ 116 */
117#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" 117#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
118 118
119/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun 119/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
120 * instruction. The hypervisor may replace it with something else but only the 120 * instruction. The hypervisor may replace it with something else but only the
121 * instructions are guaranteed to be supported. 121 * instructions are guaranteed to be supported.
122 * 122 *
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8d7a619718b5..a48ea05157d3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o 82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 83
84obj-$(CONFIG_KVM_GUEST) += kvm.o 84obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
86obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
87obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o 86obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
88obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 87obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f1..b3e5e51bc907 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf(); 356 kvm_pv_disable_apf();
357 kvm_disable_steal_time();
357} 358}
358 359
359static int kvm_pv_reboot_notify(struct notifier_block *nb, 360static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
396#ifdef CONFIG_SMP 397#ifdef CONFIG_SMP
397static void __init kvm_smp_prepare_boot_cpu(void) 398static void __init kvm_smp_prepare_boot_cpu(void)
398{ 399{
399#ifdef CONFIG_KVM_CLOCK
400 WARN_ON(kvm_register_clock("primary cpu clock")); 400 WARN_ON(kvm_register_clock("primary cpu clock"));
401#endif
402 kvm_guest_cpu_init(); 401 kvm_guest_cpu_init();
403 native_smp_prepare_boot_cpu(); 402 native_smp_prepare_boot_cpu();
404} 403}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4f165479c453..d609be046b57 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p)
957 initmem_init(); 957 initmem_init();
958 memblock_find_dma_reserve(); 958 memblock_find_dma_reserve();
959 959
960#ifdef CONFIG_KVM_CLOCK 960#ifdef CONFIG_KVM_GUEST
961 kvmclock_init(); 961 kvmclock_init();
962#endif 962#endif
963 963
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843ea..586f00059805 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -20,6 +20,7 @@ if VIRTUALIZATION
20config KVM 20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 depends on HIGH_RES_TIMERS
23 # for device assignment: 24 # for device assignment:
24 depends on PCI 25 depends on PCI
25 # for TASKSTATS/TASK_DELAY_ACCT: 26 # for TASKSTATS/TASK_DELAY_ACCT:
@@ -37,6 +38,7 @@ config KVM
37 select TASK_DELAY_ACCT 38 select TASK_DELAY_ACCT
38 select PERF_EVENTS 39 select PERF_EVENTS
39 select HAVE_KVM_MSI 40 select HAVE_KVM_MSI
41 select HAVE_KVM_CPU_RELAX_INTERCEPT
40 ---help--- 42 ---help---
41 Support hosting fully virtualized guest machines using hardware 43 Support hosting fully virtualized guest machines using hardware
42 virtualization extensions. You will need a fairly recent 44 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcacf..04d30401c5cb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 13
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
15 i8254.o timer.o cpuid.o pmu.o 15 i8254.o cpuid.o pmu.o
16kvm-intel-y += vmx.o 16kvm-intel-y += vmx.o
17kvm-amd-y += svm.o 17kvm-amd-y += svm.o
18 18
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c..ec79e773342e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
316 } 316 }
317 case 7: { 317 case 7: {
318 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 318 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
319 /* Mask ebx against host capbability word 9 */ 319 /* Mask ebx against host capability word 9 */
320 if (index == 0) { 320 if (index == 0) {
321 entry->ebx &= kvm_supported_word9_x86_features; 321 entry->ebx &= kvm_supported_word9_x86_features;
322 cpuid_mask(&entry->ebx, 9); 322 cpuid_mask(&entry->ebx, 9);
@@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
397 break; 397 break;
398 } 398 }
399 case KVM_CPUID_SIGNATURE: { 399 case KVM_CPUID_SIGNATURE: {
400 char signature[12] = "KVMKVMKVM\0\0"; 400 static const char signature[12] = "KVMKVMKVM\0\0";
401 u32 *sigptr = (u32 *)signature; 401 const u32 *sigptr = (const u32 *)signature;
402 entry->eax = KVM_CPUID_FEATURES; 402 entry->eax = KVM_CPUID_FEATURES;
403 entry->ebx = sigptr[0]; 403 entry->ebx = sigptr[0];
404 entry->ecx = sigptr[1]; 404 entry->ecx = sigptr[1];
@@ -484,10 +484,10 @@ struct kvm_cpuid_param {
484 u32 func; 484 u32 func;
485 u32 idx; 485 u32 idx;
486 bool has_leaf_count; 486 bool has_leaf_count;
487 bool (*qualifier)(struct kvm_cpuid_param *param); 487 bool (*qualifier)(const struct kvm_cpuid_param *param);
488}; 488};
489 489
490static bool is_centaur_cpu(struct kvm_cpuid_param *param) 490static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
491{ 491{
492 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; 492 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
493} 493}
@@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
498 struct kvm_cpuid_entry2 *cpuid_entries; 498 struct kvm_cpuid_entry2 *cpuid_entries;
499 int limit, nent = 0, r = -E2BIG, i; 499 int limit, nent = 0, r = -E2BIG, i;
500 u32 func; 500 u32 func;
501 static struct kvm_cpuid_param param[] = { 501 static const struct kvm_cpuid_param param[] = {
502 { .func = 0, .has_leaf_count = true }, 502 { .func = 0, .has_leaf_count = true },
503 { .func = 0x80000000, .has_leaf_count = true }, 503 { .func = 0x80000000, .has_leaf_count = true },
504 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, 504 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
@@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
517 517
518 r = 0; 518 r = 0;
519 for (i = 0; i < ARRAY_SIZE(param); i++) { 519 for (i = 0; i < ARRAY_SIZE(param); i++) {
520 struct kvm_cpuid_param *ent = &param[i]; 520 const struct kvm_cpuid_param *ent = &param[i];
521 521
522 if (ent->qualifier && !ent->qualifier(ent)) 522 if (ent->qualifier && !ent->qualifier(ent))
523 continue; 523 continue;
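
Since the hunk above touches the KVM CPUID signature leaf, a short guest-side illustration of how that signature is consumed may help; the helper name is invented, and cpuid() is the usual x86 helper from <asm/processor.h>.

static bool sketch_running_on_kvm(void)
{
	unsigned int eax, ebx, ecx, edx;
	char sig[13];

	/* KVM_CPUID_SIGNATURE (0x40000000) returns "KVMKVMKVM\0\0\0"
	 * split across ebx/ecx/edx, matching signature[] above. */
	cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
	memcpy(sig + 0, &ebx, 4);
	memcpy(sig + 4, &ecx, 4);
	memcpy(sig + 8, &edx, 4);
	sig[12] = '\0';

	return strcmp(sig, "KVMKVMKVM") == 0;
}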
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be88..39171cb307ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,9 +161,9 @@ struct opcode {
161 u64 intercept : 8; 161 u64 intercept : 8;
162 union { 162 union {
163 int (*execute)(struct x86_emulate_ctxt *ctxt); 163 int (*execute)(struct x86_emulate_ctxt *ctxt);
164 struct opcode *group; 164 const struct opcode *group;
165 struct group_dual *gdual; 165 const struct group_dual *gdual;
166 struct gprefix *gprefix; 166 const struct gprefix *gprefix;
167 } u; 167 } u;
168 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 168 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
169}; 169};
@@ -202,6 +202,42 @@ struct gprefix {
202#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a 202#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
203#define EFLG_RESERVED_ONE_MASK 2 203#define EFLG_RESERVED_ONE_MASK 2
204 204
205static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
206{
207 if (!(ctxt->regs_valid & (1 << nr))) {
208 ctxt->regs_valid |= 1 << nr;
209 ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
210 }
211 return ctxt->_regs[nr];
212}
213
214static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
215{
216 ctxt->regs_valid |= 1 << nr;
217 ctxt->regs_dirty |= 1 << nr;
218 return &ctxt->_regs[nr];
219}
220
221static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
222{
223 reg_read(ctxt, nr);
224 return reg_write(ctxt, nr);
225}
226
227static void writeback_registers(struct x86_emulate_ctxt *ctxt)
228{
229 unsigned reg;
230
231 for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
232 ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
233}
234
235static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
236{
237 ctxt->regs_dirty = 0;
238 ctxt->regs_valid = 0;
239}
240
205/* 241/*
206 * Instruction emulation: 242 * Instruction emulation:
207 * Most instructions are emulated directly via a fragment of inline assembly 243 * Most instructions are emulated directly via a fragment of inline assembly
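
The reg_read()/reg_write()/reg_rmw() helpers added in this hunk turn direct ctxt->regs[] accesses into a lazy cache: a register is fetched from the vcpu only on first read, and only registers marked dirty are flushed back by writeback_registers(). A hypothetical caller (the function name is illustrative) would use them like this:

static void sketch_reg_cache_usage(struct x86_emulate_ctxt *ctxt)
{
	/* first read pulls the value in via ops->read_gpr() and sets the
	 * regs_valid bit; later reads hit ctxt->_regs[] directly */
	ulong ax = reg_read(ctxt, VCPU_REGS_RAX);

	/* writes only touch the cache and set the regs_dirty bit */
	*reg_write(ctxt, VCPU_REGS_RCX) = ax + 1;

	/* read-modify-write wants both behaviours */
	*reg_rmw(ctxt, VCPU_REGS_RSP) -= 8;

	/* dirty slots are pushed back through ops->write_gpr();
	 * invalidate_registers() instead drops the cache when the
	 * vcpu-side registers may have changed underneath us */
	writeback_registers(ctxt);
}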
@@ -374,8 +410,8 @@ struct gprefix {
374#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ 410#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
375 do { \ 411 do { \
376 unsigned long _tmp; \ 412 unsigned long _tmp; \
377 ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ 413 ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
378 ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ 414 ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
379 \ 415 \
380 __asm__ __volatile__ ( \ 416 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0", "5", "1") \ 417 _PRE_EFLAGS("0", "5", "1") \
@@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
494 530
495static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) 531static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
496{ 532{
497 masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); 533 masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
498} 534}
499 535
500static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 536static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -632,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
632 668
633 la = seg_base(ctxt, addr.seg) + addr.ea; 669 la = seg_base(ctxt, addr.seg) + addr.ea;
634 switch (ctxt->mode) { 670 switch (ctxt->mode) {
635 case X86EMUL_MODE_REAL:
636 break;
637 case X86EMUL_MODE_PROT64: 671 case X86EMUL_MODE_PROT64:
638 if (((signed long)la << 16) >> 16 != la) 672 if (((signed long)la << 16) >> 16 != la)
639 return emulate_gp(ctxt, 0); 673 return emulate_gp(ctxt, 0);
@@ -655,7 +689,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
655 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) 689 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
656 goto bad; 690 goto bad;
657 } else { 691 } else {
658 /* exapand-down segment */ 692 /* expand-down segment */
659 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) 693 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
660 goto bad; 694 goto bad;
661 lim = desc.d ? 0xffffffff : 0xffff; 695 lim = desc.d ? 0xffffffff : 0xffff;
@@ -663,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
663 goto bad; 697 goto bad;
664 } 698 }
665 cpl = ctxt->ops->cpl(ctxt); 699 cpl = ctxt->ops->cpl(ctxt);
666 rpl = sel & 3; 700 if (ctxt->mode == X86EMUL_MODE_REAL)
701 rpl = 0;
702 else
703 rpl = sel & 3;
667 cpl = max(cpl, rpl); 704 cpl = max(cpl, rpl);
668 if (!(desc.type & 8)) { 705 if (!(desc.type & 8)) {
669 /* data segment */ 706 /* data segment */
@@ -688,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
688 return X86EMUL_CONTINUE; 725 return X86EMUL_CONTINUE;
689bad: 726bad:
690 if (addr.seg == VCPU_SREG_SS) 727 if (addr.seg == VCPU_SREG_SS)
691 return emulate_ss(ctxt, addr.seg); 728 return emulate_ss(ctxt, sel);
692 else 729 else
693 return emulate_gp(ctxt, addr.seg); 730 return emulate_gp(ctxt, sel);
694} 731}
695 732
696static int linearize(struct x86_emulate_ctxt *ctxt, 733static int linearize(struct x86_emulate_ctxt *ctxt,
@@ -786,14 +823,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
786 * pointer into the block that addresses the relevant register. 823 * pointer into the block that addresses the relevant register.
787 * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 824 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
788 */ 825 */
789static void *decode_register(u8 modrm_reg, unsigned long *regs, 826static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
790 int highbyte_regs) 827 int highbyte_regs)
791{ 828{
792 void *p; 829 void *p;
793 830
794 p = &regs[modrm_reg];
795 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) 831 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
796 p = (unsigned char *)&regs[modrm_reg & 3] + 1; 832 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
833 else
834 p = reg_rmw(ctxt, modrm_reg);
797 return p; 835 return p;
798} 836}
799 837
@@ -871,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
871{ 909{
872 ctxt->ops->get_fpu(ctxt); 910 ctxt->ops->get_fpu(ctxt);
873 switch (reg) { 911 switch (reg) {
874 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; 912 case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
875 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; 913 case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
876 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; 914 case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
877 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; 915 case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
878 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; 916 case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
879 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; 917 case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
880 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; 918 case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
881 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; 919 case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
882#ifdef CONFIG_X86_64 920#ifdef CONFIG_X86_64
883 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; 921 case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
884 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; 922 case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
885 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; 923 case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
886 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; 924 case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
887 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; 925 case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
888 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; 926 case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
889 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; 927 case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
890 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; 928 case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
891#endif 929#endif
892 default: BUG(); 930 default: BUG();
893 } 931 }
@@ -899,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
899{ 937{
900 ctxt->ops->get_fpu(ctxt); 938 ctxt->ops->get_fpu(ctxt);
901 switch (reg) { 939 switch (reg) {
902 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; 940 case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
903 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; 941 case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
904 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; 942 case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
905 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; 943 case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
906 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; 944 case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
907 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; 945 case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
908 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; 946 case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
909 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; 947 case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
910#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
911 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; 949 case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
912 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; 950 case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
913 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; 951 case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
914 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; 952 case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
915 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; 953 case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
916 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; 954 case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
917 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; 955 case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
918 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; 956 case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
919#endif 957#endif
920 default: BUG(); 958 default: BUG();
921 } 959 }
@@ -982,10 +1020,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
982 1020
983 op->type = OP_REG; 1021 op->type = OP_REG;
984 if (ctxt->d & ByteOp) { 1022 if (ctxt->d & ByteOp) {
985 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); 1023 op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
986 op->bytes = 1; 1024 op->bytes = 1;
987 } else { 1025 } else {
988 op->addr.reg = decode_register(reg, ctxt->regs, 0); 1026 op->addr.reg = decode_register(ctxt, reg, 0);
989 op->bytes = ctxt->op_bytes; 1027 op->bytes = ctxt->op_bytes;
990 } 1028 }
991 fetch_register_operand(op); 1029 fetch_register_operand(op);
@@ -1020,8 +1058,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1020 if (ctxt->modrm_mod == 3) { 1058 if (ctxt->modrm_mod == 3) {
1021 op->type = OP_REG; 1059 op->type = OP_REG;
1022 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1060 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
1023 op->addr.reg = decode_register(ctxt->modrm_rm, 1061 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp);
1024 ctxt->regs, ctxt->d & ByteOp);
1025 if (ctxt->d & Sse) { 1062 if (ctxt->d & Sse) {
1026 op->type = OP_XMM; 1063 op->type = OP_XMM;
1027 op->bytes = 16; 1064 op->bytes = 16;
@@ -1042,10 +1079,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1042 op->type = OP_MEM; 1079 op->type = OP_MEM;
1043 1080
1044 if (ctxt->ad_bytes == 2) { 1081 if (ctxt->ad_bytes == 2) {
1045 unsigned bx = ctxt->regs[VCPU_REGS_RBX]; 1082 unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
1046 unsigned bp = ctxt->regs[VCPU_REGS_RBP]; 1083 unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
1047 unsigned si = ctxt->regs[VCPU_REGS_RSI]; 1084 unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
1048 unsigned di = ctxt->regs[VCPU_REGS_RDI]; 1085 unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
1049 1086
1050 /* 16-bit ModR/M decode. */ 1087 /* 16-bit ModR/M decode. */
1051 switch (ctxt->modrm_mod) { 1088 switch (ctxt->modrm_mod) {
@@ -1102,17 +1139,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1102 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1139 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
1103 modrm_ea += insn_fetch(s32, ctxt); 1140 modrm_ea += insn_fetch(s32, ctxt);
1104 else { 1141 else {
1105 modrm_ea += ctxt->regs[base_reg]; 1142 modrm_ea += reg_read(ctxt, base_reg);
1106 adjust_modrm_seg(ctxt, base_reg); 1143 adjust_modrm_seg(ctxt, base_reg);
1107 } 1144 }
1108 if (index_reg != 4) 1145 if (index_reg != 4)
1109 modrm_ea += ctxt->regs[index_reg] << scale; 1146 modrm_ea += reg_read(ctxt, index_reg) << scale;
1110 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1147 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
1111 if (ctxt->mode == X86EMUL_MODE_PROT64) 1148 if (ctxt->mode == X86EMUL_MODE_PROT64)
1112 ctxt->rip_relative = 1; 1149 ctxt->rip_relative = 1;
1113 } else { 1150 } else {
1114 base_reg = ctxt->modrm_rm; 1151 base_reg = ctxt->modrm_rm;
1115 modrm_ea += ctxt->regs[base_reg]; 1152 modrm_ea += reg_read(ctxt, base_reg);
1116 adjust_modrm_seg(ctxt, base_reg); 1153 adjust_modrm_seg(ctxt, base_reg);
1117 } 1154 }
1118 switch (ctxt->modrm_mod) { 1155 switch (ctxt->modrm_mod) {
@@ -1179,24 +1216,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1179 int rc; 1216 int rc;
1180 struct read_cache *mc = &ctxt->mem_read; 1217 struct read_cache *mc = &ctxt->mem_read;
1181 1218
1182 while (size) { 1219 if (mc->pos < mc->end)
1183 int n = min(size, 8u); 1220 goto read_cached;
1184 size -= n;
1185 if (mc->pos < mc->end)
1186 goto read_cached;
1187 1221
1188 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, 1222 WARN_ON((mc->end + size) >= sizeof(mc->data));
1189 &ctxt->exception);
1190 if (rc != X86EMUL_CONTINUE)
1191 return rc;
1192 mc->end += n;
1193 1223
1194 read_cached: 1224 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
1195 memcpy(dest, mc->data + mc->pos, n); 1225 &ctxt->exception);
1196 mc->pos += n; 1226 if (rc != X86EMUL_CONTINUE)
1197 dest += n; 1227 return rc;
1198 addr += n; 1228
1199 } 1229 mc->end += size;
1230
1231read_cached:
1232 memcpy(dest, mc->data + mc->pos, size);
1233 mc->pos += size;
1200 return X86EMUL_CONTINUE; 1234 return X86EMUL_CONTINUE;
1201} 1235}
1202 1236
@@ -1253,10 +1287,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1253 if (rc->pos == rc->end) { /* refill pio read ahead */ 1287 if (rc->pos == rc->end) { /* refill pio read ahead */
1254 unsigned int in_page, n; 1288 unsigned int in_page, n;
1255 unsigned int count = ctxt->rep_prefix ? 1289 unsigned int count = ctxt->rep_prefix ?
1256 address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; 1290 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
1257 in_page = (ctxt->eflags & EFLG_DF) ? 1291 in_page = (ctxt->eflags & EFLG_DF) ?
1258 offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : 1292 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
1259 PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); 1293 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
1260 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, 1294 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1261 count); 1295 count);
1262 if (n == 0) 1296 if (n == 0)
@@ -1267,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 rc->end = n * size; 1301 rc->end = n * size;
1268 } 1302 }
1269 1303
1270 memcpy(dest, rc->data + rc->pos, size); 1304 if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
1271 rc->pos += size; 1305 ctxt->dst.data = rc->data + rc->pos;
1306 ctxt->dst.type = OP_MEM_STR;
1307 ctxt->dst.count = (rc->end - rc->pos) / size;
1308 rc->pos = rc->end;
1309 } else {
1310 memcpy(dest, rc->data + rc->pos, size);
1311 rc->pos += size;
1312 }
1272 return 1; 1313 return 1;
1273} 1314}
1274 1315
@@ -1291,7 +1332,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
1291static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1332static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1292 u16 selector, struct desc_ptr *dt) 1333 u16 selector, struct desc_ptr *dt)
1293{ 1334{
1294 struct x86_emulate_ops *ops = ctxt->ops; 1335 const struct x86_emulate_ops *ops = ctxt->ops;
1295 1336
1296 if (selector & 1 << 2) { 1337 if (selector & 1 << 2) {
1297 struct desc_struct desc; 1338 struct desc_struct desc;
@@ -1355,19 +1396,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1396 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1356 ulong desc_addr; 1397 ulong desc_addr;
1357 int ret; 1398 int ret;
1399 u16 dummy;
1358 1400
1359 memset(&seg_desc, 0, sizeof seg_desc); 1401 memset(&seg_desc, 0, sizeof seg_desc);
1360 1402
1361 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) 1403 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1362 || ctxt->mode == X86EMUL_MODE_REAL) { 1404 || ctxt->mode == X86EMUL_MODE_REAL) {
1363 /* set real mode segment descriptor */ 1405 /* set real mode segment descriptor */
1406 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
1364 set_desc_base(&seg_desc, selector << 4); 1407 set_desc_base(&seg_desc, selector << 4);
1365 set_desc_limit(&seg_desc, 0xffff);
1366 seg_desc.type = 3;
1367 seg_desc.p = 1;
1368 seg_desc.s = 1;
1369 if (ctxt->mode == X86EMUL_MODE_VM86)
1370 seg_desc.dpl = 3;
1371 goto load; 1408 goto load;
1372 } 1409 }
1373 1410
@@ -1396,7 +1433,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1396 err_code = selector & 0xfffc; 1433 err_code = selector & 0xfffc;
1397 err_vec = GP_VECTOR; 1434 err_vec = GP_VECTOR;
1398 1435
1399 /* can't load system descriptor into segment selecor */ 1436 /* can't load system descriptor into segment selector */
1400 if (seg <= VCPU_SREG_GS && !seg_desc.s) 1437 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1401 goto exception; 1438 goto exception;
1402 1439
@@ -1516,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1516 if (rc != X86EMUL_CONTINUE) 1553 if (rc != X86EMUL_CONTINUE)
1517 return rc; 1554 return rc;
1518 break; 1555 break;
1556 case OP_MEM_STR:
1557 rc = segmented_write(ctxt,
1558 ctxt->dst.addr.mem,
1559 ctxt->dst.data,
1560 ctxt->dst.bytes * ctxt->dst.count);
1561 if (rc != X86EMUL_CONTINUE)
1562 return rc;
1563 break;
1519 case OP_XMM: 1564 case OP_XMM:
1520 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1565 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1521 break; 1566 break;
@@ -1536,7 +1581,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1536 struct segmented_address addr; 1581 struct segmented_address addr;
1537 1582
1538 rsp_increment(ctxt, -bytes); 1583 rsp_increment(ctxt, -bytes);
1539 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); 1584 addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
1540 addr.seg = VCPU_SREG_SS; 1585 addr.seg = VCPU_SREG_SS;
1541 1586
1542 return segmented_write(ctxt, addr, data, bytes); 1587 return segmented_write(ctxt, addr, data, bytes);
@@ -1555,7 +1600,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1555 int rc; 1600 int rc;
1556 struct segmented_address addr; 1601 struct segmented_address addr;
1557 1602
1558 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); 1603 addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
1559 addr.seg = VCPU_SREG_SS; 1604 addr.seg = VCPU_SREG_SS;
1560 rc = segmented_read(ctxt, addr, dest, len); 1605 rc = segmented_read(ctxt, addr, dest, len);
1561 if (rc != X86EMUL_CONTINUE) 1606 if (rc != X86EMUL_CONTINUE)
@@ -1623,26 +1668,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
1623 int rc; 1668 int rc;
1624 unsigned frame_size = ctxt->src.val; 1669 unsigned frame_size = ctxt->src.val;
1625 unsigned nesting_level = ctxt->src2.val & 31; 1670 unsigned nesting_level = ctxt->src2.val & 31;
1671 ulong rbp;
1626 1672
1627 if (nesting_level) 1673 if (nesting_level)
1628 return X86EMUL_UNHANDLEABLE; 1674 return X86EMUL_UNHANDLEABLE;
1629 1675
1630 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); 1676 rbp = reg_read(ctxt, VCPU_REGS_RBP);
1677 rc = push(ctxt, &rbp, stack_size(ctxt));
1631 if (rc != X86EMUL_CONTINUE) 1678 if (rc != X86EMUL_CONTINUE)
1632 return rc; 1679 return rc;
1633 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], 1680 assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
1634 stack_mask(ctxt)); 1681 stack_mask(ctxt));
1635 assign_masked(&ctxt->regs[VCPU_REGS_RSP], 1682 assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
1636 ctxt->regs[VCPU_REGS_RSP] - frame_size, 1683 reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
1637 stack_mask(ctxt)); 1684 stack_mask(ctxt));
1638 return X86EMUL_CONTINUE; 1685 return X86EMUL_CONTINUE;
1639} 1686}
1640 1687
1641static int em_leave(struct x86_emulate_ctxt *ctxt) 1688static int em_leave(struct x86_emulate_ctxt *ctxt)
1642{ 1689{
1643 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], 1690 assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
1644 stack_mask(ctxt)); 1691 stack_mask(ctxt));
1645 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); 1692 return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
1646} 1693}
1647 1694
1648static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1695static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
@@ -1670,13 +1717,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
1670 1717
1671static int em_pusha(struct x86_emulate_ctxt *ctxt) 1718static int em_pusha(struct x86_emulate_ctxt *ctxt)
1672{ 1719{
1673 unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; 1720 unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
1674 int rc = X86EMUL_CONTINUE; 1721 int rc = X86EMUL_CONTINUE;
1675 int reg = VCPU_REGS_RAX; 1722 int reg = VCPU_REGS_RAX;
1676 1723
1677 while (reg <= VCPU_REGS_RDI) { 1724 while (reg <= VCPU_REGS_RDI) {
1678 (reg == VCPU_REGS_RSP) ? 1725 (reg == VCPU_REGS_RSP) ?
1679 (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); 1726 (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
1680 1727
1681 rc = em_push(ctxt); 1728 rc = em_push(ctxt);
1682 if (rc != X86EMUL_CONTINUE) 1729 if (rc != X86EMUL_CONTINUE)
@@ -1705,7 +1752,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1705 --reg; 1752 --reg;
1706 } 1753 }
1707 1754
1708 rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); 1755 rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
1709 if (rc != X86EMUL_CONTINUE) 1756 if (rc != X86EMUL_CONTINUE)
1710 break; 1757 break;
1711 --reg; 1758 --reg;
@@ -1713,9 +1760,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1713 return rc; 1760 return rc;
1714} 1761}
1715 1762
1716int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) 1763static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1717{ 1764{
1718 struct x86_emulate_ops *ops = ctxt->ops; 1765 const struct x86_emulate_ops *ops = ctxt->ops;
1719 int rc; 1766 int rc;
1720 struct desc_ptr dt; 1767 struct desc_ptr dt;
1721 gva_t cs_addr; 1768 gva_t cs_addr;
@@ -1762,11 +1809,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1762 return rc; 1809 return rc;
1763} 1810}
1764 1811
1812int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1813{
1814 int rc;
1815
1816 invalidate_registers(ctxt);
1817 rc = __emulate_int_real(ctxt, irq);
1818 if (rc == X86EMUL_CONTINUE)
1819 writeback_registers(ctxt);
1820 return rc;
1821}
1822
1765static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) 1823static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
1766{ 1824{
1767 switch(ctxt->mode) { 1825 switch(ctxt->mode) {
1768 case X86EMUL_MODE_REAL: 1826 case X86EMUL_MODE_REAL:
1769 return emulate_int_real(ctxt, irq); 1827 return __emulate_int_real(ctxt, irq);
1770 case X86EMUL_MODE_VM86: 1828 case X86EMUL_MODE_VM86:
1771 case X86EMUL_MODE_PROT16: 1829 case X86EMUL_MODE_PROT16:
1772 case X86EMUL_MODE_PROT32: 1830 case X86EMUL_MODE_PROT32:
@@ -1973,14 +2031,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
1973{ 2031{
1974 u64 old = ctxt->dst.orig_val64; 2032 u64 old = ctxt->dst.orig_val64;
1975 2033
1976 if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || 2034 if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
1977 ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { 2035 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
1978 ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 2036 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
1979 ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 2037 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
1980 ctxt->eflags &= ~EFLG_ZF; 2038 ctxt->eflags &= ~EFLG_ZF;
1981 } else { 2039 } else {
1982 ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | 2040 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
1983 (u32) ctxt->regs[VCPU_REGS_RBX]; 2041 (u32) reg_read(ctxt, VCPU_REGS_RBX);
1984 2042
1985 ctxt->eflags |= EFLG_ZF; 2043 ctxt->eflags |= EFLG_ZF;
1986 } 2044 }
@@ -2016,7 +2074,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2016{ 2074{
2017 /* Save real source value, then compare EAX against destination. */ 2075 /* Save real source value, then compare EAX against destination. */
2018 ctxt->src.orig_val = ctxt->src.val; 2076 ctxt->src.orig_val = ctxt->src.val;
2019 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; 2077 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
2020 emulate_2op_SrcV(ctxt, "cmp"); 2078 emulate_2op_SrcV(ctxt, "cmp");
2021 2079
2022 if (ctxt->eflags & EFLG_ZF) { 2080 if (ctxt->eflags & EFLG_ZF) {
@@ -2025,7 +2083,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2025 } else { 2083 } else {
2026 /* Failure: write the value we saw to EAX. */ 2084 /* Failure: write the value we saw to EAX. */
2027 ctxt->dst.type = OP_REG; 2085 ctxt->dst.type = OP_REG;
2028 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; 2086 ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
2029 } 2087 }
2030 return X86EMUL_CONTINUE; 2088 return X86EMUL_CONTINUE;
2031} 2089}
@@ -2050,12 +2108,6 @@ static void
2050setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 2108setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2051 struct desc_struct *cs, struct desc_struct *ss) 2109 struct desc_struct *cs, struct desc_struct *ss)
2052{ 2110{
2053 u16 selector;
2054
2055 memset(cs, 0, sizeof(struct desc_struct));
2056 ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
2057 memset(ss, 0, sizeof(struct desc_struct));
2058
2059 cs->l = 0; /* will be adjusted later */ 2111 cs->l = 0; /* will be adjusted later */
2060 set_desc_base(cs, 0); /* flat segment */ 2112 set_desc_base(cs, 0); /* flat segment */
2061 cs->g = 1; /* 4kb granularity */ 2113 cs->g = 1; /* 4kb granularity */
@@ -2065,6 +2117,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2065 cs->dpl = 0; /* will be adjusted later */ 2117 cs->dpl = 0; /* will be adjusted later */
2066 cs->p = 1; 2118 cs->p = 1;
2067 cs->d = 1; 2119 cs->d = 1;
2120 cs->avl = 0;
2068 2121
2069 set_desc_base(ss, 0); /* flat segment */ 2122 set_desc_base(ss, 0); /* flat segment */
2070 set_desc_limit(ss, 0xfffff); /* 4GB limit */ 2123 set_desc_limit(ss, 0xfffff); /* 4GB limit */
@@ -2074,6 +2127,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2074 ss->d = 1; /* 32bit stack segment */ 2127 ss->d = 1; /* 32bit stack segment */
2075 ss->dpl = 0; 2128 ss->dpl = 0;
2076 ss->p = 1; 2129 ss->p = 1;
2130 ss->l = 0;
2131 ss->avl = 0;
2077} 2132}
2078 2133
2079static bool vendor_intel(struct x86_emulate_ctxt *ctxt) 2134static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2089,7 +2144,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
2089 2144
2090static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) 2145static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2091{ 2146{
2092 struct x86_emulate_ops *ops = ctxt->ops; 2147 const struct x86_emulate_ops *ops = ctxt->ops;
2093 u32 eax, ebx, ecx, edx; 2148 u32 eax, ebx, ecx, edx;
2094 2149
2095 /* 2150 /*
@@ -2133,7 +2188,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2133 2188
2134static int em_syscall(struct x86_emulate_ctxt *ctxt) 2189static int em_syscall(struct x86_emulate_ctxt *ctxt)
2135{ 2190{
2136 struct x86_emulate_ops *ops = ctxt->ops; 2191 const struct x86_emulate_ops *ops = ctxt->ops;
2137 struct desc_struct cs, ss; 2192 struct desc_struct cs, ss;
2138 u64 msr_data; 2193 u64 msr_data;
2139 u16 cs_sel, ss_sel; 2194 u16 cs_sel, ss_sel;
@@ -2165,10 +2220,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2165 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2220 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2166 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2221 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2167 2222
2168 ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; 2223 *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
2169 if (efer & EFER_LMA) { 2224 if (efer & EFER_LMA) {
2170#ifdef CONFIG_X86_64 2225#ifdef CONFIG_X86_64
2171 ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 2226 *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
2172 2227
2173 ops->get_msr(ctxt, 2228 ops->get_msr(ctxt,
2174 ctxt->mode == X86EMUL_MODE_PROT64 ? 2229 ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2191,7 +2246,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2191 2246
2192static int em_sysenter(struct x86_emulate_ctxt *ctxt) 2247static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2193{ 2248{
2194 struct x86_emulate_ops *ops = ctxt->ops; 2249 const struct x86_emulate_ops *ops = ctxt->ops;
2195 struct desc_struct cs, ss; 2250 struct desc_struct cs, ss;
2196 u64 msr_data; 2251 u64 msr_data;
2197 u16 cs_sel, ss_sel; 2252 u16 cs_sel, ss_sel;
@@ -2228,6 +2283,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2228 if (msr_data == 0x0) 2283 if (msr_data == 0x0)
2229 return emulate_gp(ctxt, 0); 2284 return emulate_gp(ctxt, 0);
2230 break; 2285 break;
2286 default:
2287 break;
2231 } 2288 }
2232 2289
2233 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2290 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -2247,14 +2304,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2247 ctxt->_eip = msr_data; 2304 ctxt->_eip = msr_data;
2248 2305
2249 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 2306 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2250 ctxt->regs[VCPU_REGS_RSP] = msr_data; 2307 *reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
2251 2308
2252 return X86EMUL_CONTINUE; 2309 return X86EMUL_CONTINUE;
2253} 2310}
2254 2311
2255static int em_sysexit(struct x86_emulate_ctxt *ctxt) 2312static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2256{ 2313{
2257 struct x86_emulate_ops *ops = ctxt->ops; 2314 const struct x86_emulate_ops *ops = ctxt->ops;
2258 struct desc_struct cs, ss; 2315 struct desc_struct cs, ss;
2259 u64 msr_data; 2316 u64 msr_data;
2260 int usermode; 2317 int usermode;
@@ -2297,8 +2354,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2297 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2354 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2298 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2355 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2299 2356
2300 ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; 2357 ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
2301 ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; 2358 *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
2302 2359
2303 return X86EMUL_CONTINUE; 2360 return X86EMUL_CONTINUE;
2304} 2361}
@@ -2317,7 +2374,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2317static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2374static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2318 u16 port, u16 len) 2375 u16 port, u16 len)
2319{ 2376{
2320 struct x86_emulate_ops *ops = ctxt->ops; 2377 const struct x86_emulate_ops *ops = ctxt->ops;
2321 struct desc_struct tr_seg; 2378 struct desc_struct tr_seg;
2322 u32 base3; 2379 u32 base3;
2323 int r; 2380 int r;
@@ -2367,14 +2424,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2367{ 2424{
2368 tss->ip = ctxt->_eip; 2425 tss->ip = ctxt->_eip;
2369 tss->flag = ctxt->eflags; 2426 tss->flag = ctxt->eflags;
2370 tss->ax = ctxt->regs[VCPU_REGS_RAX]; 2427 tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
2371 tss->cx = ctxt->regs[VCPU_REGS_RCX]; 2428 tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
2372 tss->dx = ctxt->regs[VCPU_REGS_RDX]; 2429 tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
2373 tss->bx = ctxt->regs[VCPU_REGS_RBX]; 2430 tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
2374 tss->sp = ctxt->regs[VCPU_REGS_RSP]; 2431 tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
2375 tss->bp = ctxt->regs[VCPU_REGS_RBP]; 2432 tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
2376 tss->si = ctxt->regs[VCPU_REGS_RSI]; 2433 tss->si = reg_read(ctxt, VCPU_REGS_RSI);
2377 tss->di = ctxt->regs[VCPU_REGS_RDI]; 2434 tss->di = reg_read(ctxt, VCPU_REGS_RDI);
2378 2435
2379 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2436 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2380 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2437 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2390,14 +2447,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2390 2447
2391 ctxt->_eip = tss->ip; 2448 ctxt->_eip = tss->ip;
2392 ctxt->eflags = tss->flag | 2; 2449 ctxt->eflags = tss->flag | 2;
2393 ctxt->regs[VCPU_REGS_RAX] = tss->ax; 2450 *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
2394 ctxt->regs[VCPU_REGS_RCX] = tss->cx; 2451 *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
2395 ctxt->regs[VCPU_REGS_RDX] = tss->dx; 2452 *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
2396 ctxt->regs[VCPU_REGS_RBX] = tss->bx; 2453 *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
2397 ctxt->regs[VCPU_REGS_RSP] = tss->sp; 2454 *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
2398 ctxt->regs[VCPU_REGS_RBP] = tss->bp; 2455 *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
2399 ctxt->regs[VCPU_REGS_RSI] = tss->si; 2456 *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
2400 ctxt->regs[VCPU_REGS_RDI] = tss->di; 2457 *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
2401 2458
2402 /* 2459 /*
2403 * SDM says that segment selectors are loaded before segment 2460 * SDM says that segment selectors are loaded before segment
@@ -2410,7 +2467,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2410 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); 2467 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2411 2468
2412 /* 2469 /*
2413 * Now load segment descriptors. If fault happenes at this stage 2470 * Now load segment descriptors. If fault happens at this stage
2414 * it is handled in a context of new task 2471 * it is handled in a context of new task
2415 */ 2472 */
2416 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); 2473 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2436,7 +2493,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2436 u16 tss_selector, u16 old_tss_sel, 2493 u16 tss_selector, u16 old_tss_sel,
2437 ulong old_tss_base, struct desc_struct *new_desc) 2494 ulong old_tss_base, struct desc_struct *new_desc)
2438{ 2495{
2439 struct x86_emulate_ops *ops = ctxt->ops; 2496 const struct x86_emulate_ops *ops = ctxt->ops;
2440 struct tss_segment_16 tss_seg; 2497 struct tss_segment_16 tss_seg;
2441 int ret; 2498 int ret;
2442 u32 new_tss_base = get_desc_base(new_desc); 2499 u32 new_tss_base = get_desc_base(new_desc);
@@ -2482,14 +2539,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2482 tss->cr3 = ctxt->ops->get_cr(ctxt, 3); 2539 tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
2483 tss->eip = ctxt->_eip; 2540 tss->eip = ctxt->_eip;
2484 tss->eflags = ctxt->eflags; 2541 tss->eflags = ctxt->eflags;
2485 tss->eax = ctxt->regs[VCPU_REGS_RAX]; 2542 tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
2486 tss->ecx = ctxt->regs[VCPU_REGS_RCX]; 2543 tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
2487 tss->edx = ctxt->regs[VCPU_REGS_RDX]; 2544 tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
2488 tss->ebx = ctxt->regs[VCPU_REGS_RBX]; 2545 tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
2489 tss->esp = ctxt->regs[VCPU_REGS_RSP]; 2546 tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
2490 tss->ebp = ctxt->regs[VCPU_REGS_RBP]; 2547 tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
2491 tss->esi = ctxt->regs[VCPU_REGS_RSI]; 2548 tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
2492 tss->edi = ctxt->regs[VCPU_REGS_RDI]; 2549 tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
2493 2550
2494 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2551 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2495 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2552 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2511,14 +2568,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2511 ctxt->eflags = tss->eflags | 2; 2568 ctxt->eflags = tss->eflags | 2;
2512 2569
2513 /* General purpose registers */ 2570 /* General purpose registers */
2514 ctxt->regs[VCPU_REGS_RAX] = tss->eax; 2571 *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
2515 ctxt->regs[VCPU_REGS_RCX] = tss->ecx; 2572 *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
2516 ctxt->regs[VCPU_REGS_RDX] = tss->edx; 2573 *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
2517 ctxt->regs[VCPU_REGS_RBX] = tss->ebx; 2574 *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
2518 ctxt->regs[VCPU_REGS_RSP] = tss->esp; 2575 *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
2519 ctxt->regs[VCPU_REGS_RBP] = tss->ebp; 2576 *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
2520 ctxt->regs[VCPU_REGS_RSI] = tss->esi; 2577 *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
2521 ctxt->regs[VCPU_REGS_RDI] = tss->edi; 2578 *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
2522 2579
2523 /* 2580 /*
2524 * SDM says that segment selectors are loaded before segment 2581 * SDM says that segment selectors are loaded before segment
@@ -2583,7 +2640,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2583 u16 tss_selector, u16 old_tss_sel, 2640 u16 tss_selector, u16 old_tss_sel,
2584 ulong old_tss_base, struct desc_struct *new_desc) 2641 ulong old_tss_base, struct desc_struct *new_desc)
2585{ 2642{
2586 struct x86_emulate_ops *ops = ctxt->ops; 2643 const struct x86_emulate_ops *ops = ctxt->ops;
2587 struct tss_segment_32 tss_seg; 2644 struct tss_segment_32 tss_seg;
2588 int ret; 2645 int ret;
2589 u32 new_tss_base = get_desc_base(new_desc); 2646 u32 new_tss_base = get_desc_base(new_desc);
@@ -2627,7 +2684,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2627 u16 tss_selector, int idt_index, int reason, 2684 u16 tss_selector, int idt_index, int reason,
2628 bool has_error_code, u32 error_code) 2685 bool has_error_code, u32 error_code)
2629{ 2686{
2630 struct x86_emulate_ops *ops = ctxt->ops; 2687 const struct x86_emulate_ops *ops = ctxt->ops;
2631 struct desc_struct curr_tss_desc, next_tss_desc; 2688 struct desc_struct curr_tss_desc, next_tss_desc;
2632 int ret; 2689 int ret;
2633 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); 2690 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2652,7 +2709,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2652 * 2709 *
2653 * 1. jmp/call/int to task gate: Check against DPL of the task gate 2710 * 1. jmp/call/int to task gate: Check against DPL of the task gate
2654 * 2. Exception/IRQ/iret: No check is performed 2711 * 2. Exception/IRQ/iret: No check is performed
2655 * 3. jmp/call to TSS: Check agains DPL of the TSS 2712 * 3. jmp/call to TSS: Check against DPL of the TSS
2656 */ 2713 */
2657 if (reason == TASK_SWITCH_GATE) { 2714 if (reason == TASK_SWITCH_GATE) {
2658 if (idt_index != -1) { 2715 if (idt_index != -1) {
@@ -2693,7 +2750,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2693 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; 2750 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2694 2751
2695 /* set back link to prev task only if NT bit is set in eflags 2752 /* set back link to prev task only if NT bit is set in eflags
2696 note that old_tss_sel is not used afetr this point */ 2753 note that old_tss_sel is not used after this point */
2697 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 2754 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2698 old_tss_sel = 0xffff; 2755 old_tss_sel = 0xffff;
2699 2756
@@ -2733,26 +2790,28 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2733{ 2790{
2734 int rc; 2791 int rc;
2735 2792
2793 invalidate_registers(ctxt);
2736 ctxt->_eip = ctxt->eip; 2794 ctxt->_eip = ctxt->eip;
2737 ctxt->dst.type = OP_NONE; 2795 ctxt->dst.type = OP_NONE;
2738 2796
2739 rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, 2797 rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
2740 has_error_code, error_code); 2798 has_error_code, error_code);
2741 2799
2742 if (rc == X86EMUL_CONTINUE) 2800 if (rc == X86EMUL_CONTINUE) {
2743 ctxt->eip = ctxt->_eip; 2801 ctxt->eip = ctxt->_eip;
2802 writeback_registers(ctxt);
2803 }
2744 2804
2745 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 2805 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2746} 2806}
2747 2807
2748static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2808static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
2749 int reg, struct operand *op) 2809 struct operand *op)
2750{ 2810{
2751 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2811 int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
2752 2812
2753 register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); 2813 register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
2754 op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); 2814 op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
2755 op->addr.mem.seg = seg;
2756} 2815}
2757 2816
2758static int em_das(struct x86_emulate_ctxt *ctxt) 2817static int em_das(struct x86_emulate_ctxt *ctxt)
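The string_addr_inc() rework above folds a whole REP batch into one register update: the step becomes op->count * op->bytes and EFLAGS.DF only chooses the sign. A standalone sketch of that arithmetic follows; advance_index() is a made-up helper name, not an emulator API.

#include <stdint.h>
#include <stdio.h>

/* Advance a string index register for `count` iterations of `bytes` each;
 * df_set mirrors EFLAGS.DF and only flips the direction. */
static uint64_t advance_index(uint64_t reg, int df_set, unsigned count, unsigned bytes)
{
	int64_t step = (int64_t)count * bytes;

	return df_set ? reg - step : reg + step;
}

int main(void)
{
	/* 16 dword iterations, DF clear: the index moves forward 64 bytes. */
	printf("0x%llx\n", (unsigned long long)advance_index(0x1000, 0, 16, 4));
	/* Same batch with DF set: it moves backward 64 bytes. */
	printf("0x%llx\n", (unsigned long long)advance_index(0x1000, 1, 16, 4));
	return 0;
}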
@@ -2927,7 +2986,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
2927{ 2986{
2928 ctxt->dst.type = OP_REG; 2987 ctxt->dst.type = OP_REG;
2929 ctxt->dst.bytes = ctxt->src.bytes; 2988 ctxt->dst.bytes = ctxt->src.bytes;
2930 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; 2989 ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
2931 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); 2990 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
2932 2991
2933 return X86EMUL_CONTINUE; 2992 return X86EMUL_CONTINUE;
@@ -2938,8 +2997,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2938 u64 tsc = 0; 2997 u64 tsc = 0;
2939 2998
2940 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); 2999 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2941 ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; 3000 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
2942 ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; 3001 *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
2943 return X86EMUL_CONTINUE; 3002 return X86EMUL_CONTINUE;
2944} 3003}
2945 3004
@@ -2947,10 +3006,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2947{ 3006{
2948 u64 pmc; 3007 u64 pmc;
2949 3008
2950 if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) 3009 if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
2951 return emulate_gp(ctxt, 0); 3010 return emulate_gp(ctxt, 0);
2952 ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; 3011 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
2953 ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; 3012 *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
2954 return X86EMUL_CONTINUE; 3013 return X86EMUL_CONTINUE;
2955} 3014}
2956 3015
@@ -2992,9 +3051,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
2992{ 3051{
2993 u64 msr_data; 3052 u64 msr_data;
2994 3053
2995 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] 3054 msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
2996 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); 3055 | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
2997 if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) 3056 if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
2998 return emulate_gp(ctxt, 0); 3057 return emulate_gp(ctxt, 0);
2999 3058
3000 return X86EMUL_CONTINUE; 3059 return X86EMUL_CONTINUE;
@@ -3004,11 +3063,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
3004{ 3063{
3005 u64 msr_data; 3064 u64 msr_data;
3006 3065
3007 if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) 3066 if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
3008 return emulate_gp(ctxt, 0); 3067 return emulate_gp(ctxt, 0);
3009 3068
3010 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; 3069 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
3011 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; 3070 *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
3012 return X86EMUL_CONTINUE; 3071 return X86EMUL_CONTINUE;
3013} 3072}
3014 3073
@@ -3188,8 +3247,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
3188 3247
3189static int em_loop(struct x86_emulate_ctxt *ctxt) 3248static int em_loop(struct x86_emulate_ctxt *ctxt)
3190{ 3249{
3191 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); 3250 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
3192 if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && 3251 if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
3193 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) 3252 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
3194 jmp_rel(ctxt, ctxt->src.val); 3253 jmp_rel(ctxt, ctxt->src.val);
3195 3254
@@ -3198,7 +3257,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
3198 3257
3199static int em_jcxz(struct x86_emulate_ctxt *ctxt) 3258static int em_jcxz(struct x86_emulate_ctxt *ctxt)
3200{ 3259{
3201 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) 3260 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
3202 jmp_rel(ctxt, ctxt->src.val); 3261 jmp_rel(ctxt, ctxt->src.val);
3203 3262
3204 return X86EMUL_CONTINUE; 3263 return X86EMUL_CONTINUE;
@@ -3286,20 +3345,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3286{ 3345{
3287 u32 eax, ebx, ecx, edx; 3346 u32 eax, ebx, ecx, edx;
3288 3347
3289 eax = ctxt->regs[VCPU_REGS_RAX]; 3348 eax = reg_read(ctxt, VCPU_REGS_RAX);
3290 ecx = ctxt->regs[VCPU_REGS_RCX]; 3349 ecx = reg_read(ctxt, VCPU_REGS_RCX);
3291 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3350 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3292 ctxt->regs[VCPU_REGS_RAX] = eax; 3351 *reg_write(ctxt, VCPU_REGS_RAX) = eax;
3293 ctxt->regs[VCPU_REGS_RBX] = ebx; 3352 *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
3294 ctxt->regs[VCPU_REGS_RCX] = ecx; 3353 *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
3295 ctxt->regs[VCPU_REGS_RDX] = edx; 3354 *reg_write(ctxt, VCPU_REGS_RDX) = edx;
3296 return X86EMUL_CONTINUE; 3355 return X86EMUL_CONTINUE;
3297} 3356}
3298 3357
3299static int em_lahf(struct x86_emulate_ctxt *ctxt) 3358static int em_lahf(struct x86_emulate_ctxt *ctxt)
3300{ 3359{
3301 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; 3360 *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
3302 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; 3361 *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
3303 return X86EMUL_CONTINUE; 3362 return X86EMUL_CONTINUE;
3304} 3363}
3305 3364
@@ -3456,7 +3515,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
3456 3515
3457static int check_svme_pa(struct x86_emulate_ctxt *ctxt) 3516static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
3458{ 3517{
3459 u64 rax = ctxt->regs[VCPU_REGS_RAX]; 3518 u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
3460 3519
3461 /* Valid physical address? */ 3520 /* Valid physical address? */
3462 if (rax & 0xffff000000000000ULL) 3521 if (rax & 0xffff000000000000ULL)
@@ -3478,7 +3537,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
3478static int check_rdpmc(struct x86_emulate_ctxt *ctxt) 3537static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
3479{ 3538{
3480 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 3539 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
3481 u64 rcx = ctxt->regs[VCPU_REGS_RCX]; 3540 u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
3482 3541
3483 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 3542 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
3484 (rcx > 3)) 3543 (rcx > 3))
@@ -3531,13 +3590,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3531 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3590 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3532 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3591 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3533 3592
3534static struct opcode group7_rm1[] = { 3593static const struct opcode group7_rm1[] = {
3535 DI(SrcNone | Priv, monitor), 3594 DI(SrcNone | Priv, monitor),
3536 DI(SrcNone | Priv, mwait), 3595 DI(SrcNone | Priv, mwait),
3537 N, N, N, N, N, N, 3596 N, N, N, N, N, N,
3538}; 3597};
3539 3598
3540static struct opcode group7_rm3[] = { 3599static const struct opcode group7_rm3[] = {
3541 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 3600 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3542 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), 3601 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
3543 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), 3602 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
@@ -3548,13 +3607,13 @@ static struct opcode group7_rm3[] = {
3548 DIP(SrcNone | Prot | Priv, invlpga, check_svme), 3607 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3549}; 3608};
3550 3609
3551static struct opcode group7_rm7[] = { 3610static const struct opcode group7_rm7[] = {
3552 N, 3611 N,
3553 DIP(SrcNone, rdtscp, check_rdtsc), 3612 DIP(SrcNone, rdtscp, check_rdtsc),
3554 N, N, N, N, N, N, 3613 N, N, N, N, N, N,
3555}; 3614};
3556 3615
3557static struct opcode group1[] = { 3616static const struct opcode group1[] = {
3558 I(Lock, em_add), 3617 I(Lock, em_add),
3559 I(Lock | PageTable, em_or), 3618 I(Lock | PageTable, em_or),
3560 I(Lock, em_adc), 3619 I(Lock, em_adc),
@@ -3565,11 +3624,11 @@ static struct opcode group1[] = {
3565 I(0, em_cmp), 3624 I(0, em_cmp),
3566}; 3625};
3567 3626
3568static struct opcode group1A[] = { 3627static const struct opcode group1A[] = {
3569 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3570}; 3629};
3571 3630
3572static struct opcode group3[] = { 3631static const struct opcode group3[] = {
3573 I(DstMem | SrcImm, em_test), 3632 I(DstMem | SrcImm, em_test),
3574 I(DstMem | SrcImm, em_test), 3633 I(DstMem | SrcImm, em_test),
3575 I(DstMem | SrcNone | Lock, em_not), 3634 I(DstMem | SrcNone | Lock, em_not),
@@ -3580,13 +3639,13 @@ static struct opcode group3[] = {
3580 I(SrcMem, em_idiv_ex), 3639 I(SrcMem, em_idiv_ex),
3581}; 3640};
3582 3641
3583static struct opcode group4[] = { 3642static const struct opcode group4[] = {
3584 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3585 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3586 N, N, N, N, N, N, 3645 N, N, N, N, N, N,
3587}; 3646};
3588 3647
3589static struct opcode group5[] = { 3648static const struct opcode group5[] = {
3590 I(DstMem | SrcNone | Lock, em_grp45), 3649 I(DstMem | SrcNone | Lock, em_grp45),
3591 I(DstMem | SrcNone | Lock, em_grp45), 3650 I(DstMem | SrcNone | Lock, em_grp45),
3592 I(SrcMem | Stack, em_grp45), 3651 I(SrcMem | Stack, em_grp45),
@@ -3596,7 +3655,7 @@ static struct opcode group5[] = {
3596 I(SrcMem | Stack, em_grp45), N, 3655 I(SrcMem | Stack, em_grp45), N,
3597}; 3656};
3598 3657
3599static struct opcode group6[] = { 3658static const struct opcode group6[] = {
3600 DI(Prot, sldt), 3659 DI(Prot, sldt),
3601 DI(Prot, str), 3660 DI(Prot, str),
3602 II(Prot | Priv | SrcMem16, em_lldt, lldt), 3661 II(Prot | Priv | SrcMem16, em_lldt, lldt),
@@ -3604,7 +3663,7 @@ static struct opcode group6[] = {
3604 N, N, N, N, 3663 N, N, N, N,
3605}; 3664};
3606 3665
3607static struct group_dual group7 = { { 3666static const struct group_dual group7 = { {
3608 II(Mov | DstMem | Priv, em_sgdt, sgdt), 3667 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3609 II(Mov | DstMem | Priv, em_sidt, sidt), 3668 II(Mov | DstMem | Priv, em_sidt, sidt),
3610 II(SrcMem | Priv, em_lgdt, lgdt), 3669 II(SrcMem | Priv, em_lgdt, lgdt),
@@ -3621,7 +3680,7 @@ static struct group_dual group7 = { {
3621 EXT(0, group7_rm7), 3680 EXT(0, group7_rm7),
3622} }; 3681} };
3623 3682
3624static struct opcode group8[] = { 3683static const struct opcode group8[] = {
3625 N, N, N, N, 3684 N, N, N, N,
3626 I(DstMem | SrcImmByte, em_bt), 3685 I(DstMem | SrcImmByte, em_bt),
3627 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
@@ -3629,26 +3688,26 @@ static struct opcode group8[] = {
3629 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3630}; 3689};
3631 3690
3632static struct group_dual group9 = { { 3691static const struct group_dual group9 = { {
3633 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3692 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3634}, { 3693}, {
3635 N, N, N, N, N, N, N, N, 3694 N, N, N, N, N, N, N, N,
3636} }; 3695} };
3637 3696
3638static struct opcode group11[] = { 3697static const struct opcode group11[] = {
3639 I(DstMem | SrcImm | Mov | PageTable, em_mov), 3698 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3640 X7(D(Undefined)), 3699 X7(D(Undefined)),
3641}; 3700};
3642 3701
3643static struct gprefix pfx_0f_6f_0f_7f = { 3702static const struct gprefix pfx_0f_6f_0f_7f = {
3644 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 3703 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3645}; 3704};
3646 3705
3647static struct gprefix pfx_vmovntpx = { 3706static const struct gprefix pfx_vmovntpx = {
3648 I(0, em_mov), N, N, N, 3707 I(0, em_mov), N, N, N,
3649}; 3708};
3650 3709
3651static struct opcode opcode_table[256] = { 3710static const struct opcode opcode_table[256] = {
3652 /* 0x00 - 0x07 */ 3711 /* 0x00 - 0x07 */
3653 I6ALU(Lock, em_add), 3712 I6ALU(Lock, em_add),
3654 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
@@ -3689,7 +3748,7 @@ static struct opcode opcode_table[256] = {
3689 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3748 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3690 I(SrcImmByte | Mov | Stack, em_push), 3749 I(SrcImmByte | Mov | Stack, em_push),
3691 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3750 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3692 I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ 3751 I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
3693 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ 3752 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
3694 /* 0x70 - 0x7F */ 3753 /* 0x70 - 0x7F */
3695 X16(D(SrcImmByte)), 3754 X16(D(SrcImmByte)),
@@ -3765,7 +3824,7 @@ static struct opcode opcode_table[256] = {
3765 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3824 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3766}; 3825};
3767 3826
3768static struct opcode twobyte_table[256] = { 3827static const struct opcode twobyte_table[256] = {
3769 /* 0x00 - 0x0F */ 3828 /* 0x00 - 0x0F */
3770 G(0, group6), GD(0, &group7), N, N, 3829 G(0, group6), GD(0, &group7), N, N,
3771 N, I(ImplicitOps | VendorSpecific, em_syscall), 3830 N, I(ImplicitOps | VendorSpecific, em_syscall),
@@ -3936,7 +3995,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3936 case OpAcc: 3995 case OpAcc:
3937 op->type = OP_REG; 3996 op->type = OP_REG;
3938 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 3997 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3939 op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; 3998 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
3940 fetch_register_operand(op); 3999 fetch_register_operand(op);
3941 op->orig_val = op->val; 4000 op->orig_val = op->val;
3942 break; 4001 break;
@@ -3944,19 +4003,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3944 op->type = OP_MEM; 4003 op->type = OP_MEM;
3945 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4004 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3946 op->addr.mem.ea = 4005 op->addr.mem.ea =
3947 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); 4006 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
3948 op->addr.mem.seg = VCPU_SREG_ES; 4007 op->addr.mem.seg = VCPU_SREG_ES;
3949 op->val = 0; 4008 op->val = 0;
4009 op->count = 1;
3950 break; 4010 break;
3951 case OpDX: 4011 case OpDX:
3952 op->type = OP_REG; 4012 op->type = OP_REG;
3953 op->bytes = 2; 4013 op->bytes = 2;
3954 op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; 4014 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
3955 fetch_register_operand(op); 4015 fetch_register_operand(op);
3956 break; 4016 break;
3957 case OpCL: 4017 case OpCL:
3958 op->bytes = 1; 4018 op->bytes = 1;
3959 op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; 4019 op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
3960 break; 4020 break;
3961 case OpImmByte: 4021 case OpImmByte:
3962 rc = decode_imm(ctxt, op, 1, true); 4022 rc = decode_imm(ctxt, op, 1, true);
@@ -3987,9 +4047,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3987 op->type = OP_MEM; 4047 op->type = OP_MEM;
3988 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4048 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3989 op->addr.mem.ea = 4049 op->addr.mem.ea =
3990 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); 4050 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
3991 op->addr.mem.seg = seg_override(ctxt); 4051 op->addr.mem.seg = seg_override(ctxt);
3992 op->val = 0; 4052 op->val = 0;
4053 op->count = 1;
3993 break; 4054 break;
3994 case OpImmFAddr: 4055 case OpImmFAddr:
3995 op->type = OP_IMM; 4056 op->type = OP_IMM;
@@ -4293,9 +4354,10 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4293 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4294} 4355}
4295 4356
4357
4296int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4297{ 4359{
4298 struct x86_emulate_ops *ops = ctxt->ops; 4360 const struct x86_emulate_ops *ops = ctxt->ops;
4299 int rc = X86EMUL_CONTINUE; 4361 int rc = X86EMUL_CONTINUE;
4300 int saved_dst_type = ctxt->dst.type; 4362 int saved_dst_type = ctxt->dst.type;
4301 4363
@@ -4356,7 +4418,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4356 } 4418 }
4357 4419
4358 /* Instruction can only be executed in protected mode */ 4420 /* Instruction can only be executed in protected mode */
4359 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 4421 if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
4360 rc = emulate_ud(ctxt); 4422 rc = emulate_ud(ctxt);
4361 goto done; 4423 goto done;
4362 } 4424 }
@@ -4377,7 +4439,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4377 4439
4378 if (ctxt->rep_prefix && (ctxt->d & String)) { 4440 if (ctxt->rep_prefix && (ctxt->d & String)) {
4379 /* All REP prefixes have the same first termination condition */ 4441 /* All REP prefixes have the same first termination condition */
4380 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { 4442 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
4381 ctxt->eip = ctxt->_eip; 4443 ctxt->eip = ctxt->_eip;
4382 goto done; 4444 goto done;
4383 } 4445 }
@@ -4450,7 +4512,7 @@ special_insn:
4450 ctxt->dst.val = ctxt->src.addr.mem.ea; 4512 ctxt->dst.val = ctxt->src.addr.mem.ea;
4451 break; 4513 break;
4452 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 4514 case 0x90 ... 0x97: /* nop / xchg reg, rax */
4453 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) 4515 if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
4454 break; 4516 break;
4455 rc = em_xchg(ctxt); 4517 rc = em_xchg(ctxt);
4456 break; 4518 break;
@@ -4478,7 +4540,7 @@ special_insn:
4478 rc = em_grp2(ctxt); 4540 rc = em_grp2(ctxt);
4479 break; 4541 break;
4480 case 0xd2 ... 0xd3: /* Grp2 */ 4542 case 0xd2 ... 0xd3: /* Grp2 */
4481 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; 4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4482 rc = em_grp2(ctxt); 4544 rc = em_grp2(ctxt);
4483 break; 4545 break;
4484 case 0xe9: /* jmp rel */ 4546 case 0xe9: /* jmp rel */
@@ -4524,23 +4586,27 @@ writeback:
4524 ctxt->dst.type = saved_dst_type; 4586 ctxt->dst.type = saved_dst_type;
4525 4587
4526 if ((ctxt->d & SrcMask) == SrcSI) 4588 if ((ctxt->d & SrcMask) == SrcSI)
4527 string_addr_inc(ctxt, seg_override(ctxt), 4589 string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
4528 VCPU_REGS_RSI, &ctxt->src);
4529 4590
4530 if ((ctxt->d & DstMask) == DstDI) 4591 if ((ctxt->d & DstMask) == DstDI)
4531 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4592 string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
4532 &ctxt->dst);
4533 4593
4534 if (ctxt->rep_prefix && (ctxt->d & String)) { 4594 if (ctxt->rep_prefix && (ctxt->d & String)) {
4595 unsigned int count;
4535 struct read_cache *r = &ctxt->io_read; 4596 struct read_cache *r = &ctxt->io_read;
4536 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); 4597 if ((ctxt->d & SrcMask) == SrcSI)
4598 count = ctxt->src.count;
4599 else
4600 count = ctxt->dst.count;
4601 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
4602 -count);
4537 4603
4538 if (!string_insn_completed(ctxt)) { 4604 if (!string_insn_completed(ctxt)) {
4539 /* 4605 /*
4540 * Re-enter guest when pio read ahead buffer is empty 4606 * Re-enter guest when pio read ahead buffer is empty
4541 * or, if it is not used, after each 1024 iteration. 4607 * or, if it is not used, after each 1024 iteration.
4542 */ 4608 */
4543 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && 4609 if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
4544 (r->end == 0 || r->end != r->pos)) { 4610 (r->end == 0 || r->end != r->pos)) {
4545 /* 4611 /*
4546 * Reset read cache. Usually happens before 4612 * Reset read cache. Usually happens before
@@ -4548,6 +4614,7 @@ writeback:
4548 * we have to do it here. 4614 * we have to do it here.
4549 */ 4615 */
4550 ctxt->mem_read.end = 0; 4616 ctxt->mem_read.end = 0;
4617 writeback_registers(ctxt);
4551 return EMULATION_RESTART; 4618 return EMULATION_RESTART;
4552 } 4619 }
4553 goto done; /* skip rip writeback */ 4620 goto done; /* skip rip writeback */
@@ -4562,6 +4629,9 @@ done:
4562 if (rc == X86EMUL_INTERCEPTED) 4629 if (rc == X86EMUL_INTERCEPTED)
4563 return EMULATION_INTERCEPTED; 4630 return EMULATION_INTERCEPTED;
4564 4631
4632 if (rc == X86EMUL_CONTINUE)
4633 writeback_registers(ctxt);
4634
4565 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4635 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
4566 4636
4567twobyte_insn: 4637twobyte_insn:
@@ -4634,3 +4704,13 @@ twobyte_insn:
4634cannot_emulate: 4704cannot_emulate:
4635 return EMULATION_FAILED; 4705 return EMULATION_FAILED;
4636} 4706}
4707
4708void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
4709{
4710 invalidate_registers(ctxt);
4711}
4712
4713void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
4714{
4715 writeback_registers(ctxt);
4716}
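The emulate.c changes above replace direct ctxt->regs[] accesses with reg_read()/reg_write()/reg_rmw(), bracketed by invalidate_registers() on entry and writeback_registers() on success. Below is a minimal userspace sketch of that fill-on-read, dirty-tracked writeback pattern; struct regcache and its helpers are stand-ins, not the real x86_emulate_ctxt interface.

#include <stdint.h>
#include <stdio.h>

#define NR_REGS 16

struct regcache {
	uint64_t *backing;		/* authoritative register file */
	uint64_t cache[NR_REGS];	/* working copies */
	uint32_t valid;			/* bit n: cache[n] mirrors backing[n] */
	uint32_t dirty;			/* bit n: cache[n] must be written back */
};

/* Pull a register into the cache on first use and return its value. */
static uint64_t reg_read(struct regcache *c, unsigned n)
{
	if (!(c->valid & (1u << n))) {
		c->cache[n] = c->backing[n];
		c->valid |= 1u << n;
	}
	return c->cache[n];
}

/* Full overwrite: no need to fetch the old value, just mark it dirty. */
static uint64_t *reg_write(struct regcache *c, unsigned n)
{
	c->valid |= 1u << n;
	c->dirty |= 1u << n;
	return &c->cache[n];
}

/* Read-modify-write: make the cached copy current, then dirty it. */
static uint64_t *reg_rmw(struct regcache *c, unsigned n)
{
	reg_read(c, n);
	c->dirty |= 1u << n;
	return &c->cache[n];
}

/* Copy back only the registers the "instruction" actually touched. */
static void writeback_registers(struct regcache *c)
{
	for (unsigned n = 0; n < NR_REGS; n++)
		if (c->dirty & (1u << n))
			c->backing[n] = c->cache[n];
	c->valid = c->dirty = 0;
}

int main(void)
{
	uint64_t regs[NR_REGS] = { 41, 0 };
	struct regcache c = { .backing = regs };

	*reg_rmw(&c, 0) += 1;		/* increment reg 0 through the cache */
	*reg_write(&c, 1) = 7;		/* plain overwrite of reg 1 */
	writeback_registers(&c);	/* only regs 0 and 1 are copied back */
	printf("%llu %llu\n", (unsigned long long)regs[0], (unsigned long long)regs[1]);
	return 0;
}

The untouched registers never leave the backing store, which is the point of the scheme: a failed or restarted emulation can simply drop the cache instead of undoing stray writes.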
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index adba28f88d1a..11300d2fa714 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
108 ktime_t remaining; 108 ktime_t remaining;
109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
110 110
111 if (!ps->pit_timer.period) 111 if (!ps->period)
112 return 0; 112 return 0;
113 113
114 /* 114 /*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
120 * itself with the initial count and continues counting 120 * itself with the initial count and continues counting
121 * from there. 121 * from there.
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->pit_timer.timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->pit_timer.period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->pit_timer.period); 125 elapsed = mod_64(elapsed, ps->period);
126 126
127 return elapsed; 127 return elapsed;
128} 128}
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
238 int value; 238 int value;
239 239
240 spin_lock(&ps->inject_lock); 240 spin_lock(&ps->inject_lock);
241 value = atomic_dec_return(&ps->pit_timer.pending); 241 value = atomic_dec_return(&ps->pending);
242 if (value < 0) 242 if (value < 0)
243 /* spurious acks can be generated if, for example, the 243 /* spurious acks can be generated if, for example, the
244 * PIC is being reset. Handle it gracefully here 244 * PIC is being reset. Handle it gracefully here
245 */ 245 */
246 atomic_inc(&ps->pit_timer.pending); 246 atomic_inc(&ps->pending);
247 else if (value > 0) 247 else if (value > 0)
248 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
249 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
261 if (!kvm_vcpu_is_bsp(vcpu) || !pit) 261 if (!kvm_vcpu_is_bsp(vcpu) || !pit)
262 return; 262 return;
263 263
264 timer = &pit->pit_state.pit_timer.timer; 264 timer = &pit->pit_state.timer;
265 if (hrtimer_cancel(timer)) 265 if (hrtimer_cancel(timer))
266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
267} 267}
268 268
269static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
270{ 270{
271 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.timer);
272 flush_kthread_work(&pit->expired); 272 flush_kthread_work(&pit->expired);
273} 273}
274 274
275static bool kpit_is_periodic(struct kvm_timer *ktimer)
276{
277 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
278 pit_timer);
279 return ps->is_periodic;
280}
281
282static struct kvm_timer_ops kpit_ops = {
283 .is_periodic = kpit_is_periodic,
284};
285
286static void pit_do_work(struct kthread_work *work) 275static void pit_do_work(struct kthread_work *work)
287{ 276{
288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 277 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
322 311
323static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 312static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
324{ 313{
325 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 314 struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
326 struct kvm_pit *pt = ktimer->kvm->arch.vpit; 315 struct kvm_pit *pt = ps->kvm->arch.vpit;
327 316
328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 317 if (ps->reinject || !atomic_read(&ps->pending)) {
329 atomic_inc(&ktimer->pending); 318 atomic_inc(&ps->pending);
330 queue_kthread_work(&pt->worker, &pt->expired); 319 queue_kthread_work(&pt->worker, &pt->expired);
331 } 320 }
332 321
333 if (ktimer->t_ops->is_periodic(ktimer)) { 322 if (ps->is_periodic) {
334 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 323 hrtimer_add_expires_ns(&ps->timer, ps->period);
335 return HRTIMER_RESTART; 324 return HRTIMER_RESTART;
336 } else 325 } else
337 return HRTIMER_NORESTART; 326 return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
340static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) 329static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
341{ 330{
342 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 331 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
343 struct kvm_timer *pt = &ps->pit_timer;
344 s64 interval; 332 s64 interval;
345 333
346 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) 334 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
351 pr_debug("create pit timer, interval is %llu nsec\n", interval); 339 pr_debug("create pit timer, interval is %llu nsec\n", interval);
352 340
353 /* TODO The new value only affected after the retriggered */ 341 /* TODO The new value only affected after the retriggered */
354 hrtimer_cancel(&pt->timer); 342 hrtimer_cancel(&ps->timer);
355 flush_kthread_work(&ps->pit->expired); 343 flush_kthread_work(&ps->pit->expired);
356 pt->period = interval; 344 ps->period = interval;
357 ps->is_periodic = is_period; 345 ps->is_periodic = is_period;
358 346
359 pt->timer.function = pit_timer_fn; 347 ps->timer.function = pit_timer_fn;
360 pt->t_ops = &kpit_ops; 348 ps->kvm = ps->pit->kvm;
361 pt->kvm = ps->pit->kvm;
362 349
363 atomic_set(&pt->pending, 0); 350 atomic_set(&ps->pending, 0);
364 ps->irq_ack = 1; 351 ps->irq_ack = 1;
365 352
366 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 353 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
367 HRTIMER_MODE_ABS); 354 HRTIMER_MODE_ABS);
368} 355}
369 356
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
639 } 626 }
640 mutex_unlock(&pit->pit_state.lock); 627 mutex_unlock(&pit->pit_state.lock);
641 628
642 atomic_set(&pit->pit_state.pit_timer.pending, 0); 629 atomic_set(&pit->pit_state.pending, 0);
643 pit->pit_state.irq_ack = 1; 630 pit->pit_state.irq_ack = 1;
644} 631}
645 632
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
648 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); 635 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
649 636
650 if (!mask) { 637 if (!mask) {
651 atomic_set(&pit->pit_state.pit_timer.pending, 0); 638 atomic_set(&pit->pit_state.pending, 0);
652 pit->pit_state.irq_ack = 1; 639 pit->pit_state.irq_ack = 1;
653 } 640 }
654} 641}
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
706 693
707 pit_state = &pit->pit_state; 694 pit_state = &pit->pit_state;
708 pit_state->pit = pit; 695 pit_state->pit = pit;
709 hrtimer_init(&pit_state->pit_timer.timer, 696 hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
710 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
711 pit_state->irq_ack_notifier.gsi = 0; 697 pit_state->irq_ack_notifier.gsi = 0;
712 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; 698 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
713 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 699 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
714 pit_state->pit_timer.reinject = true; 700 pit_state->reinject = true;
715 mutex_unlock(&pit->pit_state.lock); 701 mutex_unlock(&pit->pit_state.lock);
716 702
717 kvm_pit_reset(pit); 703 kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
761 kvm_unregister_irq_ack_notifier(kvm, 747 kvm_unregister_irq_ack_notifier(kvm,
762 &kvm->arch.vpit->pit_state.irq_ack_notifier); 748 &kvm->arch.vpit->pit_state.irq_ack_notifier);
763 mutex_lock(&kvm->arch.vpit->pit_state.lock); 749 mutex_lock(&kvm->arch.vpit->pit_state.lock);
764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 750 timer = &kvm->arch.vpit->pit_state.timer;
765 hrtimer_cancel(timer); 751 hrtimer_cancel(timer);
766 flush_kthread_work(&kvm->arch.vpit->expired); 752 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task); 753 kthread_stop(kvm->arch.vpit->worker_task);
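With struct kvm_timer gone, the hrtimer and its bookkeeping live directly in kvm_kpit_state, and pit_timer_fn() recovers the state from the timer pointer with container_of(). A standalone sketch of that embedded-member pattern, with struct pit_state and tick() as illustrative stand-ins:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer { long expires_ns; };

struct pit_state {
	int pending;		/* accumulated, not yet injected ticks */
	int is_periodic;
	struct timer timer;	/* embedded, no separate allocation */
};

/* The callback only gets the embedded member, as hrtimer callbacks do,
 * and recovers the enclosing state from it. */
static int tick(struct timer *t)
{
	struct pit_state *ps = container_of(t, struct pit_state, timer);

	ps->pending++;
	return ps->is_periodic;	/* a periodic timer asks to be re-armed */
}

int main(void)
{
	struct pit_state ps = { .is_periodic = 1 };
	int rearm;

	tick(&ps.timer);
	rearm = tick(&ps.timer);
	printf("pending=%d rearm=%d\n", ps.pending, rearm);
	return 0;
}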
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fdf40425ea1d..dd1b16b611b0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
24struct kvm_kpit_state { 24struct kvm_kpit_state {
25 struct kvm_kpit_channel_state channels[3]; 25 struct kvm_kpit_channel_state channels[3];
26 u32 flags; 26 u32 flags;
27 struct kvm_timer pit_timer;
28 bool is_periodic; 27 bool is_periodic;
28 s64 period; /* unit: ns */
29 struct hrtimer timer;
30 atomic_t pending; /* accumulated triggered timers */
31 bool reinject;
32 struct kvm *kvm;
29 u32 speaker_data_on; 33 u32 speaker_data_on;
30 struct mutex lock; 34 struct mutex lock;
31 struct kvm_pit *pit; 35 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9fc9aa7ac703..848206df0967 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
190 190
191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 int ret = -1; 193 int ret, irq_level;
194
195 BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
194 196
195 pic_lock(s); 197 pic_lock(s);
196 if (irq >= 0 && irq < PIC_NUM_PINS) { 198 irq_level = __kvm_irq_line_state(&s->irq_states[irq],
197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq], 199 irq_source_id, level);
198 irq_source_id, level); 200 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); 201 pic_update_irq(s);
200 pic_update_irq(s); 202 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 203 s->pics[irq >> 3].imr, ret == 0);
202 s->pics[irq >> 3].imr, ret == 0);
203 }
204 pic_unlock(s); 204 pic_unlock(s);
205 205
206 return ret; 206 return ret;
@@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
275{ 275{
276 int irq, i; 276 int irq, i;
277 struct kvm_vcpu *vcpu; 277 struct kvm_vcpu *vcpu;
278 u8 irr = s->irr, isr = s->imr; 278 u8 edge_irr = s->irr & ~s->elcr;
279 bool found = false; 279 bool found = false;
280 280
281 s->last_irr = 0; 281 s->last_irr = 0;
282 s->irr = 0; 282 s->irr &= s->elcr;
283 s->imr = 0; 283 s->imr = 0;
284 s->isr = 0;
285 s->priority_add = 0; 284 s->priority_add = 0;
286 s->irq_base = 0;
287 s->read_reg_select = 0;
288 s->poll = 0;
289 s->special_mask = 0; 285 s->special_mask = 0;
290 s->init_state = 0; 286 s->read_reg_select = 0;
291 s->auto_eoi = 0; 287 if (!s->init4) {
292 s->rotate_on_auto_eoi = 0; 288 s->special_fully_nested_mode = 0;
293 s->special_fully_nested_mode = 0; 289 s->auto_eoi = 0;
294 s->init4 = 0; 290 }
291 s->init_state = 1;
295 292
296 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) 293 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
297 if (kvm_apic_accept_pic_intr(vcpu)) { 294 if (kvm_apic_accept_pic_intr(vcpu)) {
@@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
304 return; 301 return;
305 302
306 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) 303 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
307 if (irr & (1 << irq) || isr & (1 << irq)) 304 if (edge_irr & (1 << irq))
308 pic_clear_isr(s, irq); 305 pic_clear_isr(s, irq);
309} 306}
310 307
@@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
316 addr &= 1; 313 addr &= 1;
317 if (addr == 0) { 314 if (addr == 0) {
318 if (val & 0x10) { 315 if (val & 0x10) {
319 u8 edge_irr = s->irr & ~s->elcr;
320 int i;
321 bool found = false;
322 struct kvm_vcpu *vcpu;
323
324 s->init4 = val & 1; 316 s->init4 = val & 1;
325 s->last_irr = 0;
326 s->irr &= s->elcr;
327 s->imr = 0;
328 s->priority_add = 0;
329 s->special_mask = 0;
330 s->read_reg_select = 0;
331 if (!s->init4) {
332 s->special_fully_nested_mode = 0;
333 s->auto_eoi = 0;
334 }
335 s->init_state = 1;
336 if (val & 0x02) 317 if (val & 0x02)
337 pr_pic_unimpl("single mode not supported"); 318 pr_pic_unimpl("single mode not supported");
338 if (val & 0x08) 319 if (val & 0x08)
339 pr_pic_unimpl( 320 pr_pic_unimpl(
340 "level sensitive irq not supported"); 321 "level sensitive irq not supported");
341 322 kvm_pic_reset(s);
342 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
343 if (kvm_apic_accept_pic_intr(vcpu)) {
344 found = true;
345 break;
346 }
347
348
349 if (found)
350 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
351 if (edge_irr & (1 << irq))
352 pic_clear_isr(s, irq);
353 } else if (val & 0x08) { 323 } else if (val & 0x08) {
354 if (val & 0x04) 324 if (val & 0x04)
355 s->poll = 1; 325 s->poll = 1;
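The kvm_pic_reset() rework above keeps level-triggered requests (the ELCR bits) latched across an ICW1 reset and only drops the edge-triggered ones, whose ISR bits the caller then clears. A small sketch of just that masking, with a cut-down struct in place of kvm_kpic_state:

#include <stdint.h>
#include <stdio.h>

struct pic {
	uint8_t irr;	/* latched interrupt requests */
	uint8_t elcr;	/* 1 = level triggered, 0 = edge triggered */
};

/* Mirror of the reset rule: level lines stay pending, edge latches drop.
 * Returns the edge requests so the caller can clear their ISR bits. */
static uint8_t reset_irr(struct pic *s)
{
	uint8_t edge_irr = s->irr & ~s->elcr;

	s->irr &= s->elcr;
	return edge_irr;
}

int main(void)
{
	/* IRQ0 pending as edge, IRQ5 pending as level (bit 5 set in ELCR). */
	struct pic s = { .irr = 0x21, .elcr = 0x20 };
	uint8_t edge = reset_irr(&s);

	printf("edge_irr=%#x irr=%#x\n", edge, s.irr);	/* 0x1 and 0x20 */
	return 0;
}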
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba33..2d03568e9498 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
70 struct kvm_io_device dev_slave; 70 struct kvm_io_device dev_slave;
71 struct kvm_io_device dev_eclr; 71 struct kvm_io_device dev_eclr;
72 void (*ack_notifier)(void *opaque, int irq); 72 void (*ack_notifier)(void *opaque, int irq);
73 unsigned long irq_states[16]; 73 unsigned long irq_states[PIC_NUM_PINS];
74}; 74};
75 75
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 76struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa366d4..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 u32 timer_mode_mask;
6 u64 tscdeadline;
7 atomic_t pending; /* accumulated triggered timers */
8 bool reinject;
9 struct kvm_timer_ops *t_ops;
10 struct kvm *kvm;
11 struct kvm_vcpu *vcpu;
12};
13
14struct kvm_timer_ops {
15 bool (*is_periodic)(struct kvm_timer *);
16};
17
18enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39f..c6e6b721b6ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
34#include <asm/current.h> 34#include <asm/current.h>
35#include <asm/apicdef.h> 35#include <asm/apicdef.h>
36#include <linux/atomic.h> 36#include <linux/atomic.h>
37#include <linux/jump_label.h>
37#include "kvm_cache_regs.h" 38#include "kvm_cache_regs.h"
38#include "irq.h" 39#include "irq.h"
39#include "trace.h" 40#include "trace.h"
@@ -65,6 +66,7 @@
65#define APIC_DEST_NOSHORT 0x0 66#define APIC_DEST_NOSHORT 0x0
66#define APIC_DEST_MASK 0x800 67#define APIC_DEST_MASK 0x800
67#define MAX_APIC_VECTOR 256 68#define MAX_APIC_VECTOR 256
69#define APIC_VECTORS_PER_REG 32
68 70
69#define VEC_POS(v) ((v) & (32 - 1)) 71#define VEC_POS(v) ((v) & (32 - 1))
70#define REG_POS(v) (((v) >> 5) << 4) 72#define REG_POS(v) (((v) >> 5) << 4)
@@ -72,11 +74,6 @@
72static unsigned int min_timer_period_us = 500; 74static unsigned int min_timer_period_us = 500;
73module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); 75module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
74 76
75static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
76{
77 return *((u32 *) (apic->regs + reg_off));
78}
79
80static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) 77static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
81{ 78{
82 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +114,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 114 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118} 115}
119 116
120static inline int apic_hw_enabled(struct kvm_lapic *apic) 117struct static_key_deferred apic_hw_disabled __read_mostly;
121{ 118struct static_key_deferred apic_sw_disabled __read_mostly;
122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
123}
124 119
125static inline int apic_sw_enabled(struct kvm_lapic *apic) 120static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
126{ 121{
127 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; 122 if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
123 if (val & APIC_SPIV_APIC_ENABLED)
124 static_key_slow_dec_deferred(&apic_sw_disabled);
125 else
126 static_key_slow_inc(&apic_sw_disabled.key);
127 }
128 apic_set_reg(apic, APIC_SPIV, val);
128} 129}
129 130
130static inline int apic_enabled(struct kvm_lapic *apic) 131static inline int apic_enabled(struct kvm_lapic *apic)
131{ 132{
132 return apic_sw_enabled(apic) && apic_hw_enabled(apic); 133 return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
133} 134}
134 135
135#define LVT_MASK \ 136#define LVT_MASK \
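apic_set_spiv() above toggles the apic_sw_disabled static key only when the software-enable bit actually changes, which keeps the global count of disabled APICs balanced. A sketch of that edge detection, with a plain counter standing in for the deferred static key (not the kernel API):

#include <stdint.h>
#include <stdio.h>

#define SPIV_ENABLED 0x100u	/* APIC_SPIV_APIC_ENABLED, bit 8 */

static int sw_disabled_count = 1;	/* the APIC starts software-disabled */
static uint32_t spiv;

static void set_spiv(uint32_t val)
{
	if ((spiv ^ val) & SPIV_ENABLED) {	/* enable bit changed state */
		if (val & SPIV_ENABLED)
			sw_disabled_count--;	/* one less disabled APIC */
		else
			sw_disabled_count++;
	}
	spiv = val;
}

int main(void)
{
	set_spiv(SPIV_ENABLED | 0xff);	/* enable: count drops to 0 */
	set_spiv(SPIV_ENABLED | 0xef);	/* vector change only: no toggle */
	set_spiv(0xff);			/* disable again: back to 1 */
	printf("sw_disabled_count=%d\n", sw_disabled_count);
	return 0;
}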
@@ -139,36 +140,135 @@ static inline int apic_enabled(struct kvm_lapic *apic)
139 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
140 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
141 142
143static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144{
145 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
146}
147
142static inline int kvm_apic_id(struct kvm_lapic *apic) 148static inline int kvm_apic_id(struct kvm_lapic *apic)
143{ 149{
144 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 150 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
151}
152
153static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
154{
155 u16 cid;
156 ldr >>= 32 - map->ldr_bits;
157 cid = (ldr >> map->cid_shift) & map->cid_mask;
158
159 BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
160
161 return cid;
162}
163
164static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
165{
166 ldr >>= (32 - map->ldr_bits);
167 return ldr & map->lid_mask;
168}
169
170static void recalculate_apic_map(struct kvm *kvm)
171{
172 struct kvm_apic_map *new, *old = NULL;
173 struct kvm_vcpu *vcpu;
174 int i;
175
176 new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
177
178 mutex_lock(&kvm->arch.apic_map_lock);
179
180 if (!new)
181 goto out;
182
183 new->ldr_bits = 8;
184 /* flat mode is default */
185 new->cid_shift = 8;
186 new->cid_mask = 0;
187 new->lid_mask = 0xff;
188
189 kvm_for_each_vcpu(i, vcpu, kvm) {
190 struct kvm_lapic *apic = vcpu->arch.apic;
191 u16 cid, lid;
192 u32 ldr;
193
194 if (!kvm_apic_present(vcpu))
195 continue;
196
197 /*
198 * All APICs have to be configured in the same mode by an OS.
199 * We take advatage of this while building logical id loockup
200 * table. After reset APICs are in xapic/flat mode, so if we
201 * find apic with different setting we assume this is the mode
202 * OS wants all apics to be in; build lookup table accordingly.
203 */
204 if (apic_x2apic_mode(apic)) {
205 new->ldr_bits = 32;
206 new->cid_shift = 16;
207 new->cid_mask = new->lid_mask = 0xffff;
208 } else if (kvm_apic_sw_enabled(apic) &&
209 !new->cid_mask /* flat mode */ &&
210 kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
211 new->cid_shift = 4;
212 new->cid_mask = 0xf;
213 new->lid_mask = 0xf;
214 }
215
216 new->phys_map[kvm_apic_id(apic)] = apic;
217
218 ldr = kvm_apic_get_reg(apic, APIC_LDR);
219 cid = apic_cluster_id(new, ldr);
220 lid = apic_logical_id(new, ldr);
221
222 if (lid)
223 new->logical_map[cid][ffs(lid) - 1] = apic;
224 }
225out:
226 old = rcu_dereference_protected(kvm->arch.apic_map,
227 lockdep_is_held(&kvm->arch.apic_map_lock));
228 rcu_assign_pointer(kvm->arch.apic_map, new);
229 mutex_unlock(&kvm->arch.apic_map_lock);
230
231 if (old)
232 kfree_rcu(old, rcu);
233}
234
235static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
236{
237 apic_set_reg(apic, APIC_ID, id << 24);
238 recalculate_apic_map(apic->vcpu->kvm);
239}
240
241static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
242{
243 apic_set_reg(apic, APIC_LDR, id);
244 recalculate_apic_map(apic->vcpu->kvm);
145} 245}
146 246
147static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) 247static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
148{ 248{
149 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); 249 return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
150} 250}
151 251
152static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) 252static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
153{ 253{
154 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; 254 return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
155} 255}
156 256
157static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) 257static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
158{ 258{
159 return ((apic_get_reg(apic, APIC_LVTT) & 259 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
160 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); 260 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
161} 261}
162 262
163static inline int apic_lvtt_period(struct kvm_lapic *apic) 263static inline int apic_lvtt_period(struct kvm_lapic *apic)
164{ 264{
165 return ((apic_get_reg(apic, APIC_LVTT) & 265 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
166 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); 266 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
167} 267}
168 268
169static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) 269static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
170{ 270{
171 return ((apic_get_reg(apic, APIC_LVTT) & 271 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
172 apic->lapic_timer.timer_mode_mask) == 272 apic->lapic_timer.timer_mode_mask) ==
173 APIC_LVT_TIMER_TSCDEADLINE); 273 APIC_LVT_TIMER_TSCDEADLINE);
174} 274}
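recalculate_apic_map() above picks ldr_bits/cid_shift/cid_mask/lid_mask per mode and then splits each LDR into a cluster id and a logical id. The following decode uses the same constants the patch chooses; the struct below is only a stand-in for kvm_apic_map.

#include <stdint.h>
#include <stdio.h>

struct map_layout {
	unsigned int ldr_bits, cid_shift;
	uint32_t cid_mask, lid_mask;
};

/* Split an LDR value into cluster id and logical id for a given mode. */
static void decode_ldr(const struct map_layout *m, uint32_t ldr,
		       uint16_t *cid, uint16_t *lid)
{
	ldr >>= 32 - m->ldr_bits;	/* xAPIC keeps the id in the top byte */
	*cid = (ldr >> m->cid_shift) & m->cid_mask;
	*lid = ldr & m->lid_mask;
}

int main(void)
{
	/* Same constants the patch picks for each mode. */
	struct map_layout flat    = { 8,  8,  0x0,    0xff };
	struct map_layout cluster = { 8,  4,  0xf,    0xf };
	struct map_layout x2apic  = { 32, 16, 0xffff, 0xffff };
	uint16_t cid, lid;

	decode_ldr(&cluster, 0x53000000, &cid, &lid);
	printf("cluster xAPIC: cid=%u lid=%#x\n", cid, lid);	/* 5, 0x3 */
	decode_ldr(&x2apic, 0x00050003, &cid, &lid);
	printf("x2APIC:        cid=%u lid=%#x\n", cid, lid);	/* 5, 0x3 */
	decode_ldr(&flat, 0x0a000000, &cid, &lid);
	printf("flat xAPIC:    cid=%u lid=%#x\n", cid, lid);	/* 0, 0xa */
	return 0;
}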
@@ -184,7 +284,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
184 struct kvm_cpuid_entry2 *feat; 284 struct kvm_cpuid_entry2 *feat;
185 u32 v = APIC_VERSION; 285 u32 v = APIC_VERSION;
186 286
187 if (!irqchip_in_kernel(vcpu->kvm)) 287 if (!kvm_vcpu_has_lapic(vcpu))
188 return; 288 return;
189 289
190 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); 290 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -193,12 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
193 apic_set_reg(apic, APIC_LVR, v); 293 apic_set_reg(apic, APIC_LVR, v);
194} 294}
195 295
196static inline int apic_x2apic_mode(struct kvm_lapic *apic) 296static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
197{
198 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
199}
200
201static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
202 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ 297 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
203 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 298 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
204 LVT_MASK | APIC_MODE_MASK, /* LVTPC */ 299 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
@@ -208,25 +303,30 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
208 303
209static int find_highest_vector(void *bitmap) 304static int find_highest_vector(void *bitmap)
210{ 305{
211 u32 *word = bitmap; 306 int vec;
212 int word_offset = MAX_APIC_VECTOR >> 5; 307 u32 *reg;
213 308
214 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) 309 for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
215 continue; 310 vec >= 0; vec -= APIC_VECTORS_PER_REG) {
311 reg = bitmap + REG_POS(vec);
312 if (*reg)
313 return fls(*reg) - 1 + vec;
314 }
216 315
217 if (likely(!word_offset && !word[0])) 316 return -1;
218 return -1;
219 else
220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
221} 317}
222 318
223static u8 count_vectors(void *bitmap) 319static u8 count_vectors(void *bitmap)
224{ 320{
225 u32 *word = bitmap; 321 int vec;
226 int word_offset; 322 u32 *reg;
227 u8 count = 0; 323 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) 324
229 count += hweight32(word[word_offset << 2]); 325 for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
326 reg = bitmap + REG_POS(vec);
327 count += hweight32(*reg);
328 }
329
230 return count; 330 return count;
231} 331}
232 332
@@ -285,7 +385,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
285 385
286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 386int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
287{ 387{
288 struct kvm_lapic *apic = vcpu->arch.apic;
289 int highest_irr; 388 int highest_irr;
290 389
291 /* This may race with setting of irr in __apic_accept_irq() and 390 /* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +392,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
293 * will cause vmexit immediately and the value will be recalculated 392 * will cause vmexit immediately and the value will be recalculated
294 * on the next vmentry. 393 * on the next vmentry.
295 */ 394 */
296 if (!apic) 395 if (!kvm_vcpu_has_lapic(vcpu))
297 return 0; 396 return 0;
298 highest_irr = apic_find_highest_irr(apic); 397 highest_irr = apic_find_highest_irr(vcpu->arch.apic);
299 398
300 return highest_irr; 399 return highest_irr;
301} 400}
@@ -378,8 +477,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
378 u32 tpr, isrv, ppr, old_ppr; 477 u32 tpr, isrv, ppr, old_ppr;
379 int isr; 478 int isr;
380 479
381 old_ppr = apic_get_reg(apic, APIC_PROCPRI); 480 old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
382 tpr = apic_get_reg(apic, APIC_TASKPRI); 481 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
383 isr = apic_find_highest_isr(apic); 482 isr = apic_find_highest_isr(apic);
384 isrv = (isr != -1) ? isr : 0; 483 isrv = (isr != -1) ? isr : 0;
385 484
@@ -415,13 +514,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
415 u32 logical_id; 514 u32 logical_id;
416 515
417 if (apic_x2apic_mode(apic)) { 516 if (apic_x2apic_mode(apic)) {
418 logical_id = apic_get_reg(apic, APIC_LDR); 517 logical_id = kvm_apic_get_reg(apic, APIC_LDR);
419 return logical_id & mda; 518 return logical_id & mda;
420 } 519 }
421 520
422 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); 521 logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
423 522
424 switch (apic_get_reg(apic, APIC_DFR)) { 523 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
425 case APIC_DFR_FLAT: 524 case APIC_DFR_FLAT:
426 if (logical_id & mda) 525 if (logical_id & mda)
427 result = 1; 526 result = 1;
@@ -433,7 +532,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
433 break; 532 break;
434 default: 533 default:
435 apic_debug("Bad DFR vcpu %d: %08x\n", 534 apic_debug("Bad DFR vcpu %d: %08x\n",
436 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); 535 apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
437 break; 536 break;
438 } 537 }
439 538
@@ -478,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
478 return result; 577 return result;
479} 578}
480 579
580bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
581 struct kvm_lapic_irq *irq, int *r)
582{
583 struct kvm_apic_map *map;
584 unsigned long bitmap = 1;
585 struct kvm_lapic **dst;
586 int i;
587 bool ret = false;
588
589 *r = -1;
590
591 if (irq->shorthand == APIC_DEST_SELF) {
592 *r = kvm_apic_set_irq(src->vcpu, irq);
593 return true;
594 }
595
596 if (irq->shorthand)
597 return false;
598
599 rcu_read_lock();
600 map = rcu_dereference(kvm->arch.apic_map);
601
602 if (!map)
603 goto out;
604
605 if (irq->dest_mode == 0) { /* physical mode */
606 if (irq->delivery_mode == APIC_DM_LOWEST ||
607 irq->dest_id == 0xff)
608 goto out;
609 dst = &map->phys_map[irq->dest_id & 0xff];
610 } else {
611 u32 mda = irq->dest_id << (32 - map->ldr_bits);
612
613 dst = map->logical_map[apic_cluster_id(map, mda)];
614
615 bitmap = apic_logical_id(map, mda);
616
617 if (irq->delivery_mode == APIC_DM_LOWEST) {
618 int l = -1;
619 for_each_set_bit(i, &bitmap, 16) {
620 if (!dst[i])
621 continue;
622 if (l < 0)
623 l = i;
624 else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
625 l = i;
626 }
627
628 bitmap = (l >= 0) ? 1 << l : 0;
629 }
630 }
631
632 for_each_set_bit(i, &bitmap, 16) {
633 if (!dst[i])
634 continue;
635 if (*r < 0)
636 *r = 0;
637 *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
638 }
639
640 ret = true;
641out:
642 rcu_read_unlock();
643 return ret;
644}
645
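
In the logical-mode branch of kvm_irq_delivery_to_apic_fast() above, lowest-priority delivery narrows the candidate bitmap to the single destination whose vCPU wins kvm_apic_compare_prio(). The sketch below reproduces only that arbitration loop; the priority values and the compare function are illustrative stand-ins (in KVM the comparison is between per-vCPU arbitration priorities), not taken from this patch.

#include <stdio.h>

#define NR_SLOTS 16

/* Stand-in for a struct kvm_lapic pointer per slot: <0 means no APIC there. */
static int prio[NR_SLOTS] = { -1, 3, 7, -1, 2, -1, -1, -1,
			      -1, -1, -1, -1, -1, -1, -1, -1 };

/* Models kvm_apic_compare_prio(): negative means the first argument wins. */
static int compare_prio(int a, int b)
{
	return prio[a] - prio[b];
}

int main(void)
{
	unsigned long bitmap = (1ul << 1) | (1ul << 2) | (1ul << 4);  /* logical matches */
	int i, l = -1;

	for (i = 0; i < NR_SLOTS; i++) {
		if (!(bitmap & (1ul << i)) || prio[i] < 0)
			continue;
		if (l < 0 || compare_prio(i, l) < 0)
			l = i;                        /* keep the lowest-priority candidate */
	}
	bitmap = (l >= 0) ? 1ul << l : 0;             /* only the winner stays set */
	printf("lowest-priority destination: slot %d, bitmap %#lx\n", l, bitmap);
	return 0;
}
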
481/* 646/*
482 * Add a pending IRQ into lapic. 647 * Add a pending IRQ into lapic.
483 * Return 1 if successfully added and 0 if discarded. 648 * Return 1 if successfully added and 0 if discarded.
@@ -591,7 +756,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
591 apic_clear_isr(vector, apic); 756 apic_clear_isr(vector, apic);
592 apic_update_ppr(apic); 757 apic_update_ppr(apic);
593 758
594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 759 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
595 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 760 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
596 int trigger_mode; 761 int trigger_mode;
597 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 762 if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +771,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
606 771
607static void apic_send_ipi(struct kvm_lapic *apic) 772static void apic_send_ipi(struct kvm_lapic *apic)
608{ 773{
609 u32 icr_low = apic_get_reg(apic, APIC_ICR); 774 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
610 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 775 u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
611 struct kvm_lapic_irq irq; 776 struct kvm_lapic_irq irq;
612 777
613 irq.vector = icr_low & APIC_VECTOR_MASK; 778 irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +807,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
642 ASSERT(apic != NULL); 807 ASSERT(apic != NULL);
643 808
644 /* if initial count is 0, current count should also be 0 */ 809 /* if initial count is 0, current count should also be 0 */
645 if (apic_get_reg(apic, APIC_TMICT) == 0) 810 if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
646 return 0; 811 return 0;
647 812
648 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); 813 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +861,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
696 861
697 val = apic_get_tmcct(apic); 862 val = apic_get_tmcct(apic);
698 break; 863 break;
699 864 case APIC_PROCPRI:
865 apic_update_ppr(apic);
866 val = kvm_apic_get_reg(apic, offset);
867 break;
700 case APIC_TASKPRI: 868 case APIC_TASKPRI:
701 report_tpr_access(apic, false); 869 report_tpr_access(apic, false);
702 /* fall thru */ 870 /* fall thru */
703 default: 871 default:
704 apic_update_ppr(apic); 872 val = kvm_apic_get_reg(apic, offset);
705 val = apic_get_reg(apic, offset);
706 break; 873 break;
707 } 874 }
708 875
@@ -719,7 +886,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
719{ 886{
720 unsigned char alignment = offset & 0xf; 887 unsigned char alignment = offset & 0xf;
721 u32 result; 888 u32 result;
722 /* this bitmask has a bit cleared for each reserver register */ 889 /* this bitmask has a bit cleared for each reserved register */
723 static const u64 rmask = 0x43ff01ffffffe70cULL; 890 static const u64 rmask = 0x43ff01ffffffe70cULL;
724 891
725 if ((alignment + len) > 4) { 892 if ((alignment + len) > 4) {
@@ -754,7 +921,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
754 921
755static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) 922static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
756{ 923{
757 return apic_hw_enabled(apic) && 924 return kvm_apic_hw_enabled(apic) &&
758 addr >= apic->base_address && 925 addr >= apic->base_address &&
759 addr < apic->base_address + LAPIC_MMIO_LENGTH; 926 addr < apic->base_address + LAPIC_MMIO_LENGTH;
760} 927}
@@ -777,7 +944,7 @@ static void update_divide_count(struct kvm_lapic *apic)
777{ 944{
778 u32 tmp1, tmp2, tdcr; 945 u32 tmp1, tmp2, tdcr;
779 946
780 tdcr = apic_get_reg(apic, APIC_TDCR); 947 tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
781 tmp1 = tdcr & 0xf; 948 tmp1 = tdcr & 0xf;
782 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 949 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
783 apic->divide_count = 0x1 << (tmp2 & 0x7); 950 apic->divide_count = 0x1 << (tmp2 & 0x7);
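
update_divide_count() above decodes APIC_TDCR exactly as the architectural divide-configuration register: bits 0, 1 and 3 select a power-of-two divider, with the 0b1011 encoding meaning divide-by-1. A tiny program reproducing the same arithmetic for the eight defined encodings (a worked check of the expression, not kernel code):

#include <stdio.h>

static unsigned divide_count(unsigned tdcr)
{
	unsigned tmp1 = tdcr & 0xf;
	unsigned tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

	return 0x1u << (tmp2 & 0x7);     /* same expression as update_divide_count() */
}

int main(void)
{
	/* the eight architecturally defined TDCR encodings (bit 2 is reserved) */
	static const unsigned enc[] = { 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb };
	unsigned i;

	for (i = 0; i < sizeof(enc) / sizeof(enc[0]); i++)
		printf("TDCR %#3x -> divide by %u\n", enc[i], divide_count(enc[i]));
	return 0;
}
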
@@ -792,9 +959,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
792 atomic_set(&apic->lapic_timer.pending, 0); 959 atomic_set(&apic->lapic_timer.pending, 0);
793 960
794 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { 961 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
795 /* lapic timer in oneshot or peroidic mode */ 962 /* lapic timer in oneshot or periodic mode */
796 now = apic->lapic_timer.timer.base->get_time(); 963 now = apic->lapic_timer.timer.base->get_time();
797 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) 964 apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
798 * APIC_BUS_CYCLE_NS * apic->divide_count; 965 * APIC_BUS_CYCLE_NS * apic->divide_count;
799 966
800 if (!apic->lapic_timer.period) 967 if (!apic->lapic_timer.period)
@@ -826,7 +993,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
826 "timer initial count 0x%x, period %lldns, " 993 "timer initial count 0x%x, period %lldns, "
827 "expire @ 0x%016" PRIx64 ".\n", __func__, 994 "expire @ 0x%016" PRIx64 ".\n", __func__,
828 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 995 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
829 apic_get_reg(apic, APIC_TMICT), 996 kvm_apic_get_reg(apic, APIC_TMICT),
830 apic->lapic_timer.period, 997 apic->lapic_timer.period,
831 ktime_to_ns(ktime_add_ns(now, 998 ktime_to_ns(ktime_add_ns(now,
832 apic->lapic_timer.period))); 999 apic->lapic_timer.period)));
@@ -858,7 +1025,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
858 1025
859static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 1026static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
860{ 1027{
861 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); 1028 int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
862 1029
863 if (apic_lvt_nmi_mode(lvt0_val)) { 1030 if (apic_lvt_nmi_mode(lvt0_val)) {
864 if (!nmi_wd_enabled) { 1031 if (!nmi_wd_enabled) {
@@ -879,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
879 switch (reg) { 1046 switch (reg) {
880 case APIC_ID: /* Local APIC ID */ 1047 case APIC_ID: /* Local APIC ID */
881 if (!apic_x2apic_mode(apic)) 1048 if (!apic_x2apic_mode(apic))
882 apic_set_reg(apic, APIC_ID, val); 1049 kvm_apic_set_id(apic, val >> 24);
883 else 1050 else
884 ret = 1; 1051 ret = 1;
885 break; 1052 break;
@@ -895,29 +1062,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
895 1062
896 case APIC_LDR: 1063 case APIC_LDR:
897 if (!apic_x2apic_mode(apic)) 1064 if (!apic_x2apic_mode(apic))
898 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); 1065 kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
899 else 1066 else
900 ret = 1; 1067 ret = 1;
901 break; 1068 break;
902 1069
903 case APIC_DFR: 1070 case APIC_DFR:
904 if (!apic_x2apic_mode(apic)) 1071 if (!apic_x2apic_mode(apic)) {
905 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); 1072 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
906 else 1073 recalculate_apic_map(apic->vcpu->kvm);
1074 } else
907 ret = 1; 1075 ret = 1;
908 break; 1076 break;
909 1077
910 case APIC_SPIV: { 1078 case APIC_SPIV: {
911 u32 mask = 0x3ff; 1079 u32 mask = 0x3ff;
912 if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) 1080 if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
913 mask |= APIC_SPIV_DIRECTED_EOI; 1081 mask |= APIC_SPIV_DIRECTED_EOI;
914 apic_set_reg(apic, APIC_SPIV, val & mask); 1082 apic_set_spiv(apic, val & mask);
915 if (!(val & APIC_SPIV_APIC_ENABLED)) { 1083 if (!(val & APIC_SPIV_APIC_ENABLED)) {
916 int i; 1084 int i;
917 u32 lvt_val; 1085 u32 lvt_val;
918 1086
919 for (i = 0; i < APIC_LVT_NUM; i++) { 1087 for (i = 0; i < APIC_LVT_NUM; i++) {
920 lvt_val = apic_get_reg(apic, 1088 lvt_val = kvm_apic_get_reg(apic,
921 APIC_LVTT + 0x10 * i); 1089 APIC_LVTT + 0x10 * i);
922 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 1090 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
923 lvt_val | APIC_LVT_MASKED); 1091 lvt_val | APIC_LVT_MASKED);
@@ -946,7 +1114,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
946 case APIC_LVT1: 1114 case APIC_LVT1:
947 case APIC_LVTERR: 1115 case APIC_LVTERR:
948 /* TODO: Check vector */ 1116 /* TODO: Check vector */
949 if (!apic_sw_enabled(apic)) 1117 if (!kvm_apic_sw_enabled(apic))
950 val |= APIC_LVT_MASKED; 1118 val |= APIC_LVT_MASKED;
951 1119
952 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; 1120 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +1123,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
955 break; 1123 break;
956 1124
957 case APIC_LVTT: 1125 case APIC_LVTT:
958 if ((apic_get_reg(apic, APIC_LVTT) & 1126 if ((kvm_apic_get_reg(apic, APIC_LVTT) &
959 apic->lapic_timer.timer_mode_mask) != 1127 apic->lapic_timer.timer_mode_mask) !=
960 (val & apic->lapic_timer.timer_mode_mask)) 1128 (val & apic->lapic_timer.timer_mode_mask))
961 hrtimer_cancel(&apic->lapic_timer.timer); 1129 hrtimer_cancel(&apic->lapic_timer.timer);
962 1130
963 if (!apic_sw_enabled(apic)) 1131 if (!kvm_apic_sw_enabled(apic))
964 val |= APIC_LVT_MASKED; 1132 val |= APIC_LVT_MASKED;
965 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); 1133 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
966 apic_set_reg(apic, APIC_LVTT, val); 1134 apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1207,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
1039 1207
1040void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) 1208void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
1041{ 1209{
1042 struct kvm_lapic *apic = vcpu->arch.apic; 1210 if (kvm_vcpu_has_lapic(vcpu))
1043
1044 if (apic)
1045 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); 1211 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
1046} 1212}
1047EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1213EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
1048 1214
1049void kvm_free_lapic(struct kvm_vcpu *vcpu) 1215void kvm_free_lapic(struct kvm_vcpu *vcpu)
1050{ 1216{
1217 struct kvm_lapic *apic = vcpu->arch.apic;
1218
1051 if (!vcpu->arch.apic) 1219 if (!vcpu->arch.apic)
1052 return; 1220 return;
1053 1221
1054 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 1222 hrtimer_cancel(&apic->lapic_timer.timer);
1223
1224 if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
1225 static_key_slow_dec_deferred(&apic_hw_disabled);
1226
1227 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
1228 static_key_slow_dec_deferred(&apic_sw_disabled);
1055 1229
1056 if (vcpu->arch.apic->regs) 1230 if (apic->regs)
1057 free_page((unsigned long)vcpu->arch.apic->regs); 1231 free_page((unsigned long)apic->regs);
1058 1232
1059 kfree(vcpu->arch.apic); 1233 kfree(apic);
1060} 1234}
1061 1235
1062/* 1236/*
@@ -1068,10 +1242,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
1068u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) 1242u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1069{ 1243{
1070 struct kvm_lapic *apic = vcpu->arch.apic; 1244 struct kvm_lapic *apic = vcpu->arch.apic;
1071 if (!apic)
1072 return 0;
1073 1245
1074 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1246 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1247 apic_lvtt_period(apic))
1075 return 0; 1248 return 0;
1076 1249
1077 return apic->lapic_timer.tscdeadline; 1250 return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1253,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1080void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) 1253void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
1081{ 1254{
1082 struct kvm_lapic *apic = vcpu->arch.apic; 1255 struct kvm_lapic *apic = vcpu->arch.apic;
1083 if (!apic)
1084 return;
1085 1256
1086 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1257 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1258 apic_lvtt_period(apic))
1087 return; 1259 return;
1088 1260
1089 hrtimer_cancel(&apic->lapic_timer.timer); 1261 hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1267,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
1095{ 1267{
1096 struct kvm_lapic *apic = vcpu->arch.apic; 1268 struct kvm_lapic *apic = vcpu->arch.apic;
1097 1269
1098 if (!apic) 1270 if (!kvm_vcpu_has_lapic(vcpu))
1099 return; 1271 return;
1272
1100 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 1273 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
1101 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 1274 | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
1102} 1275}
1103 1276
1104u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 1277u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
1105{ 1278{
1106 struct kvm_lapic *apic = vcpu->arch.apic;
1107 u64 tpr; 1279 u64 tpr;
1108 1280
1109 if (!apic) 1281 if (!kvm_vcpu_has_lapic(vcpu))
1110 return 0; 1282 return 0;
1111 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); 1283
1284 tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
1112 1285
1113 return (tpr & 0xf0) >> 4; 1286 return (tpr & 0xf0) >> 4;
1114} 1287}
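
kvm_lapic_set_tpr() and kvm_lapic_get_cr8() above implement the architectural CR8/TPR mapping: CR8 holds the task-priority class (TASKPRI bits 7:4), and a CR8 write preserves bit 2 of the current TASKPRI value. A short round-trip sketch using the same expressions:

#include <stdio.h>

static unsigned taskpri;                /* models the APIC_TASKPRI register */

static void lapic_set_tpr(unsigned long cr8)
{
	/* same expression as kvm_lapic_set_tpr() */
	taskpri = ((cr8 & 0x0f) << 4) | (taskpri & 4);
}

static unsigned long lapic_get_cr8(void)
{
	return (taskpri & 0xf0) >> 4;   /* same as kvm_lapic_get_cr8() */
}

int main(void)
{
	taskpri = 0x26;                 /* pre-existing TPR with bit 2 set */
	lapic_set_tpr(0x9);
	printf("TASKPRI = %#x, CR8 read back = %#lx\n", taskpri, lapic_get_cr8());
	return 0;
}
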
@@ -1123,6 +1296,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1123 return; 1296 return;
1124 } 1297 }
1125 1298
1299 /* update jump label if enable bit changes */
1300 if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
1301 if (value & MSR_IA32_APICBASE_ENABLE)
1302 static_key_slow_dec_deferred(&apic_hw_disabled);
1303 else
1304 static_key_slow_inc(&apic_hw_disabled.key);
1305 recalculate_apic_map(vcpu->kvm);
1306 }
1307
1126 if (!kvm_vcpu_is_bsp(apic->vcpu)) 1308 if (!kvm_vcpu_is_bsp(apic->vcpu))
1127 value &= ~MSR_IA32_APICBASE_BSP; 1309 value &= ~MSR_IA32_APICBASE_BSP;
1128 1310
@@ -1130,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1130 if (apic_x2apic_mode(apic)) { 1312 if (apic_x2apic_mode(apic)) {
1131 u32 id = kvm_apic_id(apic); 1313 u32 id = kvm_apic_id(apic);
1132 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); 1314 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
1133 apic_set_reg(apic, APIC_LDR, ldr); 1315 kvm_apic_set_ldr(apic, ldr);
1134 } 1316 }
1135 apic->base_address = apic->vcpu->arch.apic_base & 1317 apic->base_address = apic->vcpu->arch.apic_base &
1136 MSR_IA32_APICBASE_BASE; 1318 MSR_IA32_APICBASE_BASE;
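
When x2APIC mode is entered, kvm_lapic_set_base() above derives the logical destination register from the APIC ID with ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)): the low 4 bits of the ID select a one-hot bit in the lower 16 bits, and the remaining ID bits are shifted into the upper half. A worked example of that expression (the decomposition printed here just mirrors the formula, no further architectural claim is made):

#include <stdio.h>

static unsigned x2apic_ldr(unsigned id)
{
	/* same expression as in kvm_lapic_set_base() for x2APIC mode */
	return ((id & ~0xfu) << 16) | (1u << (id & 0xf));
}

int main(void)
{
	static const unsigned ids[] = { 0, 5, 16, 35 };
	unsigned i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		unsigned ldr = x2apic_ldr(ids[i]);

		printf("x2APIC id %2u -> LDR %#010x (upper half %#x, one-hot bit %u)\n",
		       ids[i], ldr, ldr >> 16, ids[i] & 0xf);
	}
	return 0;
}
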
@@ -1155,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1155 /* Stop the timer in case it's a reset to an active apic */ 1337 /* Stop the timer in case it's a reset to an active apic */
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1338 hrtimer_cancel(&apic->lapic_timer.timer);
1157 1339
1158 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 1340 kvm_apic_set_id(apic, vcpu->vcpu_id);
1159 kvm_apic_set_version(apic->vcpu); 1341 kvm_apic_set_version(apic->vcpu);
1160 1342
1161 for (i = 0; i < APIC_LVT_NUM; i++) 1343 for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1164,9 +1346,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1164 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); 1346 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
1165 1347
1166 apic_set_reg(apic, APIC_DFR, 0xffffffffU); 1348 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
1167 apic_set_reg(apic, APIC_SPIV, 0xff); 1349 apic_set_spiv(apic, 0xff);
1168 apic_set_reg(apic, APIC_TASKPRI, 0); 1350 apic_set_reg(apic, APIC_TASKPRI, 0);
1169 apic_set_reg(apic, APIC_LDR, 0); 1351 kvm_apic_set_ldr(apic, 0);
1170 apic_set_reg(apic, APIC_ESR, 0); 1352 apic_set_reg(apic, APIC_ESR, 0);
1171 apic_set_reg(apic, APIC_ICR, 0); 1353 apic_set_reg(apic, APIC_ICR, 0);
1172 apic_set_reg(apic, APIC_ICR2, 0); 1354 apic_set_reg(apic, APIC_ICR2, 0);
@@ -1183,7 +1365,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1183 update_divide_count(apic); 1365 update_divide_count(apic);
1184 atomic_set(&apic->lapic_timer.pending, 0); 1366 atomic_set(&apic->lapic_timer.pending, 0);
1185 if (kvm_vcpu_is_bsp(vcpu)) 1367 if (kvm_vcpu_is_bsp(vcpu))
1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1368 kvm_lapic_set_base(vcpu,
1369 vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
1187 vcpu->arch.pv_eoi.msr_val = 0; 1370 vcpu->arch.pv_eoi.msr_val = 0;
1188 apic_update_ppr(apic); 1371 apic_update_ppr(apic);
1189 1372
@@ -1196,45 +1379,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1196 vcpu->arch.apic_base, apic->base_address); 1379 vcpu->arch.apic_base, apic->base_address);
1197} 1380}
1198 1381
1199bool kvm_apic_present(struct kvm_vcpu *vcpu)
1200{
1201 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
1202}
1203
1204int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
1205{
1206 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
1207}
1208
1209/* 1382/*
1210 *---------------------------------------------------------------------- 1383 *----------------------------------------------------------------------
1211 * timer interface 1384 * timer interface
1212 *---------------------------------------------------------------------- 1385 *----------------------------------------------------------------------
1213 */ 1386 */
1214 1387
1215static bool lapic_is_periodic(struct kvm_timer *ktimer) 1388static bool lapic_is_periodic(struct kvm_lapic *apic)
1216{ 1389{
1217 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
1218 lapic_timer);
1219 return apic_lvtt_period(apic); 1390 return apic_lvtt_period(apic);
1220} 1391}
1221 1392
1222int apic_has_pending_timer(struct kvm_vcpu *vcpu) 1393int apic_has_pending_timer(struct kvm_vcpu *vcpu)
1223{ 1394{
1224 struct kvm_lapic *lapic = vcpu->arch.apic; 1395 struct kvm_lapic *apic = vcpu->arch.apic;
1225 1396
1226 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 1397 if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
1227 return atomic_read(&lapic->lapic_timer.pending); 1398 apic_lvt_enabled(apic, APIC_LVTT))
1399 return atomic_read(&apic->lapic_timer.pending);
1228 1400
1229 return 0; 1401 return 0;
1230} 1402}
1231 1403
1232int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) 1404int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1233{ 1405{
1234 u32 reg = apic_get_reg(apic, lvt_type); 1406 u32 reg = kvm_apic_get_reg(apic, lvt_type);
1235 int vector, mode, trig_mode; 1407 int vector, mode, trig_mode;
1236 1408
1237 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { 1409 if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
1238 vector = reg & APIC_VECTOR_MASK; 1410 vector = reg & APIC_VECTOR_MASK;
1239 mode = reg & APIC_MODE_MASK; 1411 mode = reg & APIC_MODE_MASK;
1240 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1412 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1423,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
1251 kvm_apic_local_deliver(apic, APIC_LVT0); 1423 kvm_apic_local_deliver(apic, APIC_LVT0);
1252} 1424}
1253 1425
1254static struct kvm_timer_ops lapic_timer_ops = {
1255 .is_periodic = lapic_is_periodic,
1256};
1257
1258static const struct kvm_io_device_ops apic_mmio_ops = { 1426static const struct kvm_io_device_ops apic_mmio_ops = {
1259 .read = apic_mmio_read, 1427 .read = apic_mmio_read,
1260 .write = apic_mmio_write, 1428 .write = apic_mmio_write,
1261}; 1429};
1262 1430
1431static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
1432{
1433 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
1434 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
1435 struct kvm_vcpu *vcpu = apic->vcpu;
1436 wait_queue_head_t *q = &vcpu->wq;
1437
1438 /*
1439 * There is a race window between reading and incrementing, but we do
1440 * not care about potentially losing timer events in the !reinject
1441 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
1442 * in vcpu_enter_guest.
1443 */
1444 if (!atomic_read(&ktimer->pending)) {
1445 atomic_inc(&ktimer->pending);
1446 /* FIXME: this code should not know anything about vcpus */
1447 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1448 }
1449
1450 if (waitqueue_active(q))
1451 wake_up_interruptible(q);
1452
1453 if (lapic_is_periodic(apic)) {
1454 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
1455 return HRTIMER_RESTART;
1456 } else
1457 return HRTIMER_NORESTART;
1458}
1459
1263int kvm_create_lapic(struct kvm_vcpu *vcpu) 1460int kvm_create_lapic(struct kvm_vcpu *vcpu)
1264{ 1461{
1265 struct kvm_lapic *apic; 1462 struct kvm_lapic *apic;
@@ -1283,14 +1480,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1283 1480
1284 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1481 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1285 HRTIMER_MODE_ABS); 1482 HRTIMER_MODE_ABS);
1286 apic->lapic_timer.timer.function = kvm_timer_fn; 1483 apic->lapic_timer.timer.function = apic_timer_fn;
1287 apic->lapic_timer.t_ops = &lapic_timer_ops;
1288 apic->lapic_timer.kvm = vcpu->kvm;
1289 apic->lapic_timer.vcpu = vcpu;
1290 1484
1291 apic->base_address = APIC_DEFAULT_PHYS_BASE; 1485 /*
1292 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 1486 * APIC is created enabled. This will prevent kvm_lapic_set_base from
 1487	 * thinking that APIC state has changed.
1488 */
1489 vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
1490 kvm_lapic_set_base(vcpu,
1491 APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
1293 1492
1493 static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
1294 kvm_lapic_reset(vcpu); 1494 kvm_lapic_reset(vcpu);
1295 kvm_iodevice_init(&apic->dev, &apic_mmio_ops); 1495 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
1296 1496
@@ -1306,23 +1506,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1306 struct kvm_lapic *apic = vcpu->arch.apic; 1506 struct kvm_lapic *apic = vcpu->arch.apic;
1307 int highest_irr; 1507 int highest_irr;
1308 1508
1309 if (!apic || !apic_enabled(apic)) 1509 if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
1310 return -1; 1510 return -1;
1311 1511
1312 apic_update_ppr(apic); 1512 apic_update_ppr(apic);
1313 highest_irr = apic_find_highest_irr(apic); 1513 highest_irr = apic_find_highest_irr(apic);
1314 if ((highest_irr == -1) || 1514 if ((highest_irr == -1) ||
1315 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) 1515 ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
1316 return -1; 1516 return -1;
1317 return highest_irr; 1517 return highest_irr;
1318} 1518}
1319 1519
1320int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1520int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1321{ 1521{
1322 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1522 u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1323 int r = 0; 1523 int r = 0;
1324 1524
1325 if (!apic_hw_enabled(vcpu->arch.apic)) 1525 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
1326 r = 1; 1526 r = 1;
1327 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1527 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1328 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1528 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1534,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1334{ 1534{
1335 struct kvm_lapic *apic = vcpu->arch.apic; 1535 struct kvm_lapic *apic = vcpu->arch.apic;
1336 1536
1337 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { 1537 if (!kvm_vcpu_has_lapic(vcpu))
1538 return;
1539
1540 if (atomic_read(&apic->lapic_timer.pending) > 0) {
1338 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1541 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1339 atomic_dec(&apic->lapic_timer.pending); 1542 atomic_dec(&apic->lapic_timer.pending);
1340 } 1543 }
@@ -1354,12 +1557,17 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1354 return vector; 1557 return vector;
1355} 1558}
1356 1559
1357void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1560void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1561 struct kvm_lapic_state *s)
1358{ 1562{
1359 struct kvm_lapic *apic = vcpu->arch.apic; 1563 struct kvm_lapic *apic = vcpu->arch.apic;
1360 1564
1361 apic->base_address = vcpu->arch.apic_base & 1565 kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
1362 MSR_IA32_APICBASE_BASE; 1566 /* set SPIV separately to get count of SW disabled APICs right */
1567 apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
1568 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1569 /* call kvm_apic_set_id() to put apic into apic_map */
1570 kvm_apic_set_id(apic, kvm_apic_id(apic));
1363 kvm_apic_set_version(vcpu); 1571 kvm_apic_set_version(vcpu);
1364 1572
1365 apic_update_ppr(apic); 1573 apic_update_ppr(apic);
@@ -1374,13 +1582,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1374 1582
1375void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1583void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1376{ 1584{
1377 struct kvm_lapic *apic = vcpu->arch.apic;
1378 struct hrtimer *timer; 1585 struct hrtimer *timer;
1379 1586
1380 if (!apic) 1587 if (!kvm_vcpu_has_lapic(vcpu))
1381 return; 1588 return;
1382 1589
1383 timer = &apic->lapic_timer.timer; 1590 timer = &vcpu->arch.apic->lapic_timer.timer;
1384 if (hrtimer_cancel(timer)) 1591 if (hrtimer_cancel(timer))
1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1592 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1386} 1593}
@@ -1478,7 +1685,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1685 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1479 return; 1686 return;
1480 1687
1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1688 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1482 max_irr = apic_find_highest_irr(apic); 1689 max_irr = apic_find_highest_irr(apic);
1483 if (max_irr < 0) 1690 if (max_irr < 0)
1484 max_irr = 0; 1691 max_irr = 0;
@@ -1537,7 +1744,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1537{ 1744{
1538 struct kvm_lapic *apic = vcpu->arch.apic; 1745 struct kvm_lapic *apic = vcpu->arch.apic;
1539 1746
1540 if (!irqchip_in_kernel(vcpu->kvm)) 1747 if (!kvm_vcpu_has_lapic(vcpu))
1541 return 1; 1748 return 1;
1542 1749
1543 /* if this is ICR write vector before command */ 1750 /* if this is ICR write vector before command */
@@ -1551,7 +1758,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1551 struct kvm_lapic *apic = vcpu->arch.apic; 1758 struct kvm_lapic *apic = vcpu->arch.apic;
1552 u32 low, high = 0; 1759 u32 low, high = 0;
1553 1760
1554 if (!irqchip_in_kernel(vcpu->kvm)) 1761 if (!kvm_vcpu_has_lapic(vcpu))
1555 return 1; 1762 return 1;
1556 1763
1557 if (apic_reg_read(apic, reg, 4, &low)) 1764 if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1783,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 1783 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr); 1784 addr);
1578} 1785}
1786
1787void kvm_lapic_init(void)
1788{
1789 /* do not patch jump label more than once per second */
1790 jump_label_rate_limit(&apic_hw_disabled, HZ);
1791 jump_label_rate_limit(&apic_sw_disabled, HZ);
1792}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4af5405ae1e2..e5ebf9f3571f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,10 +2,17 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
6 5
7#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
8 7
8struct kvm_timer {
9 struct hrtimer timer;
10 s64 period; /* unit: ns */
11 u32 timer_mode_mask;
12 u64 tscdeadline;
13 atomic_t pending; /* accumulated triggered timers */
14};
15
9struct kvm_lapic { 16struct kvm_lapic {
10 unsigned long base_address; 17 unsigned long base_address;
11 struct kvm_io_device dev; 18 struct kvm_io_device dev;
@@ -45,11 +52,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
45int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 52int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
46int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 53int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
47 54
55bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
56 struct kvm_lapic_irq *irq, int *r);
57
48u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 58u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
49void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 59void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
50void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 60void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
51int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 61 struct kvm_lapic_state *s);
52bool kvm_apic_present(struct kvm_vcpu *vcpu);
53int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 62int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
54 63
55u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +80,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
71} 80}
72 81
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); 82int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
83void kvm_lapic_init(void);
84
85static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
86{
87 return *((u32 *) (apic->regs + reg_off));
88}
89
90extern struct static_key kvm_no_apic_vcpu;
91
92static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
93{
94 if (static_key_false(&kvm_no_apic_vcpu))
95 return vcpu->arch.apic;
96 return true;
97}
98
99extern struct static_key_deferred apic_hw_disabled;
100
101static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
102{
103 if (static_key_false(&apic_hw_disabled.key))
104 return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
105 return MSR_IA32_APICBASE_ENABLE;
106}
107
108extern struct static_key_deferred apic_sw_disabled;
109
110static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
111{
112 if (static_key_false(&apic_sw_disabled.key))
113 return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
114 return APIC_SPIV_APIC_ENABLED;
115}
116
117static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
118{
119 return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
120}
121
122static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
123{
124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
125}
126
74#endif 127#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d273ea8..d289fee1ffb8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
556 return 0; 556 return 0;
557 557
558 pfn = spte_to_pfn(old_spte); 558 pfn = spte_to_pfn(old_spte);
559
560 /*
561 * KVM does not hold the refcount of the page used by
562 * kvm mmu, before reclaiming the page, we should
563 * unmap it from mmu first.
564 */
565 WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
566
559 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 567 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
560 kvm_set_pfn_accessed(pfn); 568 kvm_set_pfn_accessed(pfn);
561 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 569 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
960static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, 968static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
961 struct kvm_memory_slot *slot) 969 struct kvm_memory_slot *slot)
962{ 970{
963 struct kvm_lpage_info *linfo; 971 unsigned long idx;
964
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967 972
968 linfo = lpage_info_slot(gfn, slot, level); 973 idx = gfn_to_index(gfn, slot->base_gfn, level);
969 return &linfo->rmap_pde; 974 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
970} 975}
971 976
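
__gfn_to_rmap() now indexes a per-level rmap array instead of special-casing 4K pages; the index is just the gfn's position within the slot at that page-size granularity. A user-space sketch of the index arithmetic, assuming the x86 definition of gfn_to_index() (the gfn offset within the slot, shifted right by 9 bits per level above 4K):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t gfn_t;

#define PT_PAGE_TABLE_LEVEL 1
#define KVM_HPAGE_GFN_SHIFT(level) (((level) - PT_PAGE_TABLE_LEVEL) * 9)  /* x86 assumption */

/* assumed to match gfn_to_index() in kvm_host.h */
static gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	gfn_t base_gfn = 0x100000;      /* slot starts at 4 GiB */
	gfn_t gfn = 0x100a73;           /* some gfn inside the slot */
	int level;

	for (level = 1; level <= 3; level++)
		printf("level %d (granularity %u gfns): rmap index %llu\n",
		       level, 1u << KVM_HPAGE_GFN_SHIFT(level),
		       (unsigned long long)gfn_to_index(gfn, base_gfn, level));
	return 0;
}
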
972/* 977/*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1173 unsigned long *rmapp; 1178 unsigned long *rmapp;
1174 1179
1175 while (mask) { 1180 while (mask) {
1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182 PT_PAGE_TABLE_LEVEL, slot);
1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1183 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1178 1184
1179 /* clear the first set bit */ 1185 /* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1200} 1206}
1201 1207
1202static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1208static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1203 unsigned long data) 1209 struct kvm_memory_slot *slot, unsigned long data)
1204{ 1210{
1205 u64 *sptep; 1211 u64 *sptep;
1206 struct rmap_iterator iter; 1212 struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1218} 1224}
1219 1225
1220static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1226static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1221 unsigned long data) 1227 struct kvm_memory_slot *slot, unsigned long data)
1222{ 1228{
1223 u64 *sptep; 1229 u64 *sptep;
1224 struct rmap_iterator iter; 1230 struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1259 return 0; 1265 return 0;
1260} 1266}
1261 1267
1262static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 1268static int kvm_handle_hva_range(struct kvm *kvm,
1263 unsigned long data, 1269 unsigned long start,
1264 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 1270 unsigned long end,
1265 unsigned long data)) 1271 unsigned long data,
1272 int (*handler)(struct kvm *kvm,
1273 unsigned long *rmapp,
1274 struct kvm_memory_slot *slot,
1275 unsigned long data))
1266{ 1276{
1267 int j; 1277 int j;
1268 int ret; 1278 int ret = 0;
1269 int retval = 0;
1270 struct kvm_memslots *slots; 1279 struct kvm_memslots *slots;
1271 struct kvm_memory_slot *memslot; 1280 struct kvm_memory_slot *memslot;
1272 1281
1273 slots = kvm_memslots(kvm); 1282 slots = kvm_memslots(kvm);
1274 1283
1275 kvm_for_each_memslot(memslot, slots) { 1284 kvm_for_each_memslot(memslot, slots) {
1276 unsigned long start = memslot->userspace_addr; 1285 unsigned long hva_start, hva_end;
1277 unsigned long end; 1286 gfn_t gfn_start, gfn_end;
1278 1287
1279 end = start + (memslot->npages << PAGE_SHIFT); 1288 hva_start = max(start, memslot->userspace_addr);
1280 if (hva >= start && hva < end) { 1289 hva_end = min(end, memslot->userspace_addr +
1281 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 1290 (memslot->npages << PAGE_SHIFT));
1282 gfn_t gfn = memslot->base_gfn + gfn_offset; 1291 if (hva_start >= hva_end)
1292 continue;
1293 /*
1294 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1295 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1296 */
1297 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1298 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1283 1299
1284 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 1300 for (j = PT_PAGE_TABLE_LEVEL;
1301 j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
1302 unsigned long idx, idx_end;
1303 unsigned long *rmapp;
1285 1304
1286 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 1305 /*
1287 struct kvm_lpage_info *linfo; 1306 * {idx(page_j) | page_j intersects with
1307 * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
1308 */
1309 idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
1310 idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
1288 1311
1289 linfo = lpage_info_slot(gfn, memslot, 1312 rmapp = __gfn_to_rmap(gfn_start, j, memslot);
1290 PT_DIRECTORY_LEVEL + j); 1313
1291 ret |= handler(kvm, &linfo->rmap_pde, data); 1314 for (; idx <= idx_end; ++idx)
1292 } 1315 ret |= handler(kvm, rmapp++, memslot, data);
1293 trace_kvm_age_page(hva, memslot, ret);
1294 retval |= ret;
1295 } 1316 }
1296 } 1317 }
1297 1318
1298 return retval; 1319 return ret;
1320}
1321
1322static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1323 unsigned long data,
1324 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
1325 struct kvm_memory_slot *slot,
1326 unsigned long data))
1327{
1328 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1299} 1329}
1300 1330
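
kvm_handle_hva_range() above clips [start, end) against each memslot's host-virtual range and converts the clipped interval to a gfn interval; gfn_end is computed from hva_end + PAGE_SIZE - 1 so a partially covered last page is still included. A stand-alone sketch of that clipping, assuming hva_to_gfn_memslot() is base_gfn + ((hva - userspace_addr) >> PAGE_SHIFT):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

typedef uint64_t gfn_t;

struct memslot {
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long userspace_addr;
};

static gfn_t hva_to_gfn_memslot(unsigned long hva, const struct memslot *slot)
{
	/* assumed definition, matching the helper used above */
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot slot = { .base_gfn = 0x1000, .npages = 64,
				.userspace_addr = 0x7f0000000000UL };
	unsigned long start = 0x7f0000003800UL;          /* mid-page start */
	unsigned long end   = 0x7f0000009000UL;          /* exclusive end  */
	unsigned long hva_start, hva_end;
	gfn_t gfn_start, gfn_end;

	hva_start = start > slot.userspace_addr ? start : slot.userspace_addr;
	hva_end = slot.userspace_addr + (slot.npages << PAGE_SHIFT);
	if (end < hva_end)
		hva_end = end;
	if (hva_start >= hva_end)
		return 0;                                /* no overlap with this slot */

	gfn_start = hva_to_gfn_memslot(hva_start, &slot);
	gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, &slot);
	printf("gfns touched: [%#llx, %#llx)\n",
	       (unsigned long long)gfn_start, (unsigned long long)gfn_end);
	return 0;
}
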
1301int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1331int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1303 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 1333 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
1304} 1334}
1305 1335
1336int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1337{
1338 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1339}
1340
1306void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1341void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1307{ 1342{
1308 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 1343 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1309} 1344}
1310 1345
1311static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1346static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1312 unsigned long data) 1347 struct kvm_memory_slot *slot, unsigned long data)
1313{ 1348{
1314 u64 *sptep; 1349 u64 *sptep;
1315 struct rmap_iterator uninitialized_var(iter); 1350 struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1323 * This has some overhead, but not as much as the cost of swapping 1358 * This has some overhead, but not as much as the cost of swapping
1324 * out actively used pages or breaking up actively used hugepages. 1359 * out actively used pages or breaking up actively used hugepages.
1325 */ 1360 */
1326 if (!shadow_accessed_mask) 1361 if (!shadow_accessed_mask) {
1327 return kvm_unmap_rmapp(kvm, rmapp, data); 1362 young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
1363 goto out;
1364 }
1328 1365
1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1366 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1330 sptep = rmap_get_next(&iter)) { 1367 sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1336 (unsigned long *)sptep); 1373 (unsigned long *)sptep);
1337 } 1374 }
1338 } 1375 }
1339 1376out:
1377 /* @data has hva passed to kvm_age_hva(). */
1378 trace_kvm_age_page(data, slot, young);
1340 return young; 1379 return young;
1341} 1380}
1342 1381
1343static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1382static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1344 unsigned long data) 1383 struct kvm_memory_slot *slot, unsigned long data)
1345{ 1384{
1346 u64 *sptep; 1385 u64 *sptep;
1347 struct rmap_iterator iter; 1386 struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1379 1418
1380 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 1419 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1381 1420
1382 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 1421 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
1383 kvm_flush_remote_tlbs(vcpu->kvm); 1422 kvm_flush_remote_tlbs(vcpu->kvm);
1384} 1423}
1385 1424
1386int kvm_age_hva(struct kvm *kvm, unsigned long hva) 1425int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1387{ 1426{
1388 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 1427 return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
1389} 1428}
1390 1429
1391int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1430int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2457 rmap_recycle(vcpu, sptep, gfn); 2496 rmap_recycle(vcpu, sptep, gfn);
2458 } 2497 }
2459 } 2498 }
2460 kvm_release_pfn_clean(pfn); 2499
2500 if (!is_error_pfn(pfn))
2501 kvm_release_pfn_clean(pfn);
2461} 2502}
2462 2503
2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2504static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2469 bool no_dirty_log) 2510 bool no_dirty_log)
2470{ 2511{
2471 struct kvm_memory_slot *slot; 2512 struct kvm_memory_slot *slot;
2472 unsigned long hva;
2473 2513
2474 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2514 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2475 if (!slot) { 2515 if (!slot)
2476 get_page(fault_page); 2516 return KVM_PFN_ERR_FAULT;
2477 return page_to_pfn(fault_page);
2478 }
2479 2517
2480 hva = gfn_to_hva_memslot(slot, gfn); 2518 return gfn_to_pfn_memslot_atomic(slot, gfn);
2481
2482 return hva_to_pfn_atomic(vcpu->kvm, hva);
2483} 2519}
2484 2520
2485static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2521static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2580,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2580 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2616 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2581 iterator.level - 1, 2617 iterator.level - 1,
2582 1, ACC_ALL, iterator.sptep); 2618 1, ACC_ALL, iterator.sptep);
2583 if (!sp) {
2584 pgprintk("nonpaging_map: ENOMEM\n");
2585 kvm_release_pfn_clean(pfn);
2586 return -ENOMEM;
2587 }
2588 2619
2589 mmu_spte_set(iterator.sptep, 2620 mmu_spte_set(iterator.sptep,
2590 __pa(sp->spt) 2621 __pa(sp->spt)
@@ -2611,8 +2642,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2611 2642
2612static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2643static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2613{ 2644{
2614 kvm_release_pfn_clean(pfn); 2645 /*
2615 if (is_hwpoison_pfn(pfn)) { 2646 * Do not cache the mmio info caused by writing the readonly gfn
 2647	 * into the spte; otherwise a read access on the readonly gfn can
 2648	 * also cause an mmio page fault and be treated as an mmio access.
2649 * Return 1 to tell kvm to emulate it.
2650 */
2651 if (pfn == KVM_PFN_ERR_RO_FAULT)
2652 return 1;
2653
2654 if (pfn == KVM_PFN_ERR_HWPOISON) {
2616 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); 2655 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2617 return 0; 2656 return 0;
2618 } 2657 }
@@ -3236,8 +3275,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3236 if (!async) 3275 if (!async)
3237 return false; /* *pfn has correct page already */ 3276 return false; /* *pfn has correct page already */
3238 3277
3239 put_page(pfn_to_page(*pfn));
3240
3241 if (!prefault && can_do_async_pf(vcpu)) { 3278 if (!prefault && can_do_async_pf(vcpu)) {
3242 trace_kvm_try_async_get_page(gva, gfn); 3279 trace_kvm_try_async_get_page(gva, gfn);
3243 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3280 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
@@ -3371,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3371 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3408 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3372} 3409}
3373 3410
3411static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3412{
3413 unsigned mask;
3414
3415 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3416
3417 mask = (unsigned)~ACC_WRITE_MASK;
3418 /* Allow write access to dirty gptes */
3419 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3420 *access &= mask;
3421}
3422
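
protect_clean_gpte() above drops ACC_WRITE_MASK from the access rights unless the guest PTE already has its dirty bit set; the dirty bit (bit 6 in an x86 PTE) is shifted down onto the writable bit (bit 1) to build the mask. The sketch below pairs it with the gpte_access() helper added in the next hunk; the PT_* shift values are the usual x86 ones, restated here as assumptions.

#include <stdio.h>
#include <stdint.h>

#define ACC_EXEC_MASK   1u
#define ACC_WRITE_MASK  2u                /* == PT_WRITABLE_MASK */
#define PT_WRITABLE_SHIFT 1
#define PT_WRITABLE_MASK  (1ull << PT_WRITABLE_SHIFT)
#define PT_USER_MASK      (1ull << 2)
#define PT_DIRTY_SHIFT    6
#define PT64_NX_SHIFT     63

/* mirrors gpte_access(): W/U come from the gpte, X unless the NX bit is set */
static unsigned gpte_access(uint64_t gpte)
{
	unsigned access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;

	access &= ~(unsigned)(gpte >> PT64_NX_SHIFT);
	return access;
}

/* mirrors protect_clean_gpte(): a clean gpte loses write permission */
static void protect_clean_gpte(unsigned *access, uint64_t gpte)
{
	unsigned mask = ~ACC_WRITE_MASK;

	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
	*access &= mask;
}

int main(void)
{
	uint64_t clean = 0x7;                          /* present, writable, user, not dirty */
	uint64_t dirty = 0x7 | (1ull << PT_DIRTY_SHIFT);
	unsigned a1 = gpte_access(clean), a2 = gpte_access(dirty);

	protect_clean_gpte(&a1, clean);
	protect_clean_gpte(&a2, dirty);
	printf("clean gpte access = %#x, dirty gpte access = %#x\n", a1, a2);
	return 0;
}
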
3374static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, 3423static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3375 int *nr_present) 3424 int *nr_present)
3376{ 3425{
@@ -3388,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3388 return false; 3437 return false;
3389} 3438}
3390 3439
3440static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3441{
3442 unsigned access;
3443
3444 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3445 access &= ~(gpte >> PT64_NX_SHIFT);
3446
3447 return access;
3448}
3449
3450static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3451{
3452 unsigned index;
3453
3454 index = level - 1;
3455 index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
3456 return mmu->last_pte_bitmap & (1 << index);
3457}
3458
3391#define PTTYPE 64 3459#define PTTYPE 64
3392#include "paging_tmpl.h" 3460#include "paging_tmpl.h"
3393#undef PTTYPE 3461#undef PTTYPE
@@ -3457,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3457 } 3525 }
3458} 3526}
3459 3527
3528static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3529{
3530 unsigned bit, byte, pfec;
3531 u8 map;
3532 bool fault, x, w, u, wf, uf, ff, smep;
3533
3534 smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3535 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
3536 pfec = byte << 1;
3537 map = 0;
3538 wf = pfec & PFERR_WRITE_MASK;
3539 uf = pfec & PFERR_USER_MASK;
3540 ff = pfec & PFERR_FETCH_MASK;
3541 for (bit = 0; bit < 8; ++bit) {
3542 x = bit & ACC_EXEC_MASK;
3543 w = bit & ACC_WRITE_MASK;
3544 u = bit & ACC_USER_MASK;
3545
3546 /* Not really needed: !nx will cause pte.nx to fault */
3547 x |= !mmu->nx;
3548 /* Allow supervisor writes if !cr0.wp */
3549 w |= !is_write_protection(vcpu) && !uf;
3550 /* Disallow supervisor fetches of user code if cr4.smep */
3551 x &= !(smep && u && !uf);
3552
3553 fault = (ff && !x) || (uf && !u) || (wf && !w);
3554 map |= fault << bit;
3555 }
3556 mmu->permissions[byte] = map;
3557 }
3558}
3559
3560static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3561{
3562 u8 map;
3563 unsigned level, root_level = mmu->root_level;
3564 const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
3565
3566 if (root_level == PT32E_ROOT_LEVEL)
3567 --root_level;
3568 /* PT_PAGE_TABLE_LEVEL always terminates */
3569 map = 1 | (1 << ps_set_index);
3570 for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
3571 if (level <= PT_PDPE_LEVEL
3572 && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
3573 map |= 1 << (ps_set_index | (level - 1));
3574 }
3575 mmu->last_pte_bitmap = map;
3576}
3577
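
update_last_pte_bitmap() above and is_last_gpte() added earlier in this file turn the "is this the final, leaf gpte?" test into a table lookup: a 3-bit index is formed from level-1 plus the PS bit, and the bitmap records which combinations terminate the walk. A user-space sketch for the 64-bit case only (root_level = 4; the PT32E/PSE adjustments in the real function are left out):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PT_PAGE_TABLE_LEVEL 1
#define PT_DIRECTORY_LEVEL  2
#define PT_PDPE_LEVEL       3
#define PT_PAGE_SIZE_SHIFT  7
#define PT_PAGE_SIZE_MASK   (1ull << PT_PAGE_SIZE_SHIFT)

/* mirrors update_last_pte_bitmap() for a 4-level (long mode) guest */
static uint8_t build_last_pte_bitmap(unsigned root_level)
{
	const unsigned ps_set_index = 1 << 2;     /* bit 2 of the index means "PS set" */
	uint8_t map = 1 | (1 << ps_set_index);    /* level 1 always terminates */
	unsigned level;

	for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level)
		if (level <= PT_PDPE_LEVEL)       /* 2 MiB and 1 GiB pages */
			map |= 1 << (ps_set_index | (level - 1));
	return map;
}

/* mirrors is_last_gpte() */
static bool is_last_gpte(uint8_t map, unsigned level, uint64_t gpte)
{
	unsigned index = level - 1;

	index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
	return map & (1 << index);
}

int main(void)
{
	uint8_t map = build_last_pte_bitmap(4);
	unsigned level;

	printf("last_pte_bitmap = %#x\n", (unsigned)map);
	for (level = 1; level <= 4; level++)
		printf("level %u: PS clear -> %d, PS set -> %d\n", level,
		       is_last_gpte(map, level, 0),
		       is_last_gpte(map, level, PT_PAGE_SIZE_MASK));
	return 0;
}
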
3460static int paging64_init_context_common(struct kvm_vcpu *vcpu, 3578static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3461 struct kvm_mmu *context, 3579 struct kvm_mmu *context,
3462 int level) 3580 int level)
@@ -3465,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3465 context->root_level = level; 3583 context->root_level = level;
3466 3584
3467 reset_rsvds_bits_mask(vcpu, context); 3585 reset_rsvds_bits_mask(vcpu, context);
3586 update_permission_bitmask(vcpu, context);
3587 update_last_pte_bitmap(vcpu, context);
3468 3588
3469 ASSERT(is_pae(vcpu)); 3589 ASSERT(is_pae(vcpu));
3470 context->new_cr3 = paging_new_cr3; 3590 context->new_cr3 = paging_new_cr3;
@@ -3493,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3493 context->root_level = PT32_ROOT_LEVEL; 3613 context->root_level = PT32_ROOT_LEVEL;
3494 3614
3495 reset_rsvds_bits_mask(vcpu, context); 3615 reset_rsvds_bits_mask(vcpu, context);
3616 update_permission_bitmask(vcpu, context);
3617 update_last_pte_bitmap(vcpu, context);
3496 3618
3497 context->new_cr3 = paging_new_cr3; 3619 context->new_cr3 = paging_new_cr3;
3498 context->page_fault = paging32_page_fault; 3620 context->page_fault = paging32_page_fault;
@@ -3553,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3553 context->gva_to_gpa = paging32_gva_to_gpa; 3675 context->gva_to_gpa = paging32_gva_to_gpa;
3554 } 3676 }
3555 3677
3678 update_permission_bitmask(vcpu, context);
3679 update_last_pte_bitmap(vcpu, context);
3680
3556 return 0; 3681 return 0;
3557} 3682}
3558 3683
@@ -3628,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3628 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3753 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3629 } 3754 }
3630 3755
3756 update_permission_bitmask(vcpu, g_context);
3757 update_last_pte_bitmap(vcpu, g_context);
3758
3631 return 0; 3759 return 0;
3632} 3760}
3633 3761
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e374db9af021..69871080e866 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,8 +18,10 @@
18#define PT_PCD_MASK (1ULL << 4) 18#define PT_PCD_MASK (1ULL << 4)
19#define PT_ACCESSED_SHIFT 5 19#define PT_ACCESSED_SHIFT 5
20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) 20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
21#define PT_DIRTY_MASK (1ULL << 6) 21#define PT_DIRTY_SHIFT 6
22#define PT_PAGE_SIZE_MASK (1ULL << 7) 22#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
23#define PT_PAGE_SIZE_SHIFT 7
24#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
23#define PT_PAT_MASK (1ULL << 7) 25#define PT_PAT_MASK (1ULL << 7)
24#define PT_GLOBAL_MASK (1ULL << 8) 26#define PT_GLOBAL_MASK (1ULL << 8)
25#define PT64_NX_SHIFT 63 27#define PT64_NX_SHIFT 63
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 90 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89} 91}
90 92
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu, 93/*
92 bool write_fault, bool user_fault, 94 * Will a fault with a given page-fault error code (pfec) cause a permission
93 unsigned long pte) 95 * fault with the given access (in ACC_* format)?
96 */
97static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
98 unsigned pfec)
94{ 99{
95 if (unlikely(write_fault && !is_writable_pte(pte) 100 return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103} 101}
102
104#endif 103#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 7d7d0b9e23eb..daff69e21150 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); 117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
118 118
119 if (is_error_pfn(pfn)) { 119 if (is_error_pfn(pfn))
120 kvm_release_pfn_clean(pfn);
121 return; 120 return;
122 }
123 121
124 hpa = pfn << PAGE_SHIFT; 122 hpa = pfn << PAGE_SHIFT;
125 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 123 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
190 188
191static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) 189static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 190{
193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 191 unsigned long *rmapp;
195 u64 *sptep; 192 u64 *sptep;
196 struct rmap_iterator iter; 193 struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
198 if (sp->role.direct || sp->unsync || sp->role.invalid) 195 if (sp->role.direct || sp->unsync || sp->role.invalid)
199 return; 196 return;
200 197
201 slot = gfn_to_memslot(kvm, sp->gfn); 198 rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
203 199
204 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 200 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
205 sptep = rmap_get_next(&iter)) { 201 sptep = rmap_get_next(&iter)) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae76..714e2c01a6fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -63,10 +63,12 @@
63 */ 63 */
64struct guest_walker { 64struct guest_walker {
65 int level; 65 int level;
66 unsigned max_level;
66 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
67 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
68 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; 69 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
70 unsigned pt_access; 72 unsigned pt_access;
71 unsigned pte_access; 73 unsigned pte_access;
72 gfn_t gfn; 74 gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 103 return (ret != orig_pte);
102} 104}
103 105
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, 106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
105 bool last) 107 struct kvm_mmu *mmu,
108 struct guest_walker *walker,
109 int write_fault)
106{ 110{
107 unsigned access; 111 unsigned level, index;
108 112 pt_element_t pte, orig_pte;
109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 113 pt_element_t __user *ptep_user;
110 if (last && !is_dirty_gpte(gpte)) 114 gfn_t table_gfn;
111 access &= ~ACC_WRITE_MASK; 115 int ret;
112 116
113#if PTTYPE == 64 117 for (level = walker->max_level; level >= walker->level; --level) {
114 if (vcpu->arch.mmu.nx) 118 pte = orig_pte = walker->ptes[level - 1];
115 access &= ~(gpte >> PT64_NX_SHIFT); 119 table_gfn = walker->table_gfn[level - 1];
116#endif 120 ptep_user = walker->ptep_user[level - 1];
117 return access; 121 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
118} 122 if (!(pte & PT_ACCESSED_MASK)) {
119 123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
120static bool FNAME(is_last_gpte)(struct guest_walker *walker, 124 pte |= PT_ACCESSED_MASK;
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 125 }
122 pt_element_t gpte) 126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
123{ 127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
124 if (walker->level == PT_PAGE_TABLE_LEVEL) 128 pte |= PT_DIRTY_MASK;
125 return true; 129 }
126 130 if (pte == orig_pte)
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && 131 continue;
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130 132
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && 133 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
132 (mmu->root_level == PT64_ROOT_LEVEL)) 134 if (ret)
133 return true; 135 return ret;
134 136
135 return false; 137 mark_page_dirty(vcpu->kvm, table_gfn);
138 walker->ptes[level] = pte;
139 }
140 return 0;
136} 141}
137 142
138/* 143/*
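
The FNAME(update_accessed_dirty_bits) helper added above walks back over the levels saved during the walk and sets the accessed (and, on a write fault at the leaf, dirty) bit with a compare-and-swap against guest memory, asking the caller to retry the walk if a gpte changed underneath it. A single-entry, user-space sketch of that read-modify-write, using a GCC builtin where the kernel uses cmpxchg_gpte() (toy code, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_MASK (1ull << 5)
#define PT_DIRTY_MASK    (1ull << 6)

/* Returns nonzero if the entry changed under us; the real walker then
 * retries the whole walk (goto retry_walk). */
static int set_accessed_dirty(uint64_t *ptep, int write_fault)
{
	uint64_t orig = *ptep;
	uint64_t pte  = orig | PT_ACCESSED_MASK;

	if (write_fault)
		pte |= PT_DIRTY_MASK;
	if (pte == orig)
		return 0;	/* nothing to update at this level */

	return __sync_val_compare_and_swap(ptep, orig, pte) != orig;
}

int main(void)
{
	uint64_t pte = 0x1000;	/* A=0, D=0 */

	printf("raced: %d, pte now %#llx\n",
	       set_accessed_dirty(&pte, 1), (unsigned long long)pte);
	return 0;
}
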
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
142 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
143 gva_t addr, u32 access) 148 gva_t addr, u32 access)
144{ 149{
150 int ret;
145 pt_element_t pte; 151 pt_element_t pte;
146 pt_element_t __user *uninitialized_var(ptep_user); 152 pt_element_t __user *uninitialized_var(ptep_user);
147 gfn_t table_gfn; 153 gfn_t table_gfn;
148 unsigned index, pt_access, uninitialized_var(pte_access); 154 unsigned index, pt_access, pte_access, accessed_dirty, shift;
149 gpa_t pte_gpa; 155 gpa_t pte_gpa;
150 bool eperm, last_gpte;
151 int offset; 156 int offset;
152 const int write_fault = access & PFERR_WRITE_MASK; 157 const int write_fault = access & PFERR_WRITE_MASK;
153 const int user_fault = access & PFERR_USER_MASK; 158 const int user_fault = access & PFERR_USER_MASK;
154 const int fetch_fault = access & PFERR_FETCH_MASK; 159 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 160 u16 errcode = 0;
161 gpa_t real_gpa;
162 gfn_t gfn;
156 163
157 trace_kvm_mmu_pagetable_walk(addr, access); 164 trace_kvm_mmu_pagetable_walk(addr, access);
158retry_walk: 165retry_walk:
159 eperm = false;
160 walker->level = mmu->root_level; 166 walker->level = mmu->root_level;
161 pte = mmu->get_cr3(vcpu); 167 pte = mmu->get_cr3(vcpu);
162 168
@@ -169,15 +175,21 @@ retry_walk:
169 --walker->level; 175 --walker->level;
170 } 176 }
171#endif 177#endif
178 walker->max_level = walker->level;
172 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
173 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
174 181
175 pt_access = ACC_ALL; 182 accessed_dirty = PT_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL;
184 ++walker->level;
176 185
177 for (;;) { 186 do {
178 gfn_t real_gfn; 187 gfn_t real_gfn;
179 unsigned long host_addr; 188 unsigned long host_addr;
180 189
190 pt_access &= pte_access;
191 --walker->level;
192
181 index = PT_INDEX(addr, walker->level); 193 index = PT_INDEX(addr, walker->level);
182 194
183 table_gfn = gpte_to_gfn(pte); 195 table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
199 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 211 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
200 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) 212 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
201 goto error; 213 goto error;
214 walker->ptep_user[walker->level - 1] = ptep_user;
202 215
203 trace_kvm_mmu_paging_element(pte, walker->level); 216 trace_kvm_mmu_paging_element(pte, walker->level);
204 217
@@ -211,92 +224,48 @@ retry_walk:
211 goto error; 224 goto error;
212 } 225 }
213 226
214 if (!check_write_user_access(vcpu, write_fault, user_fault, 227 accessed_dirty &= pte;
215 pte)) 228 pte_access = pt_access & gpte_access(vcpu, pte);
216 eperm = true;
217
218#if PTTYPE == 64
219 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
220 eperm = true;
221#endif
222
223 last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
224 if (last_gpte) {
225 pte_access = pt_access &
226 FNAME(gpte_access)(vcpu, pte, true);
227 /* check if the kernel is fetching from user page */
228 if (unlikely(pte_access & PT_USER_MASK) &&
229 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
230 if (fetch_fault && !user_fault)
231 eperm = true;
232 }
233
234 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
235 int ret;
236 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
237 sizeof(pte));
238 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
239 pte, pte|PT_ACCESSED_MASK);
240 if (unlikely(ret < 0))
241 goto error;
242 else if (ret)
243 goto retry_walk;
244
245 mark_page_dirty(vcpu->kvm, table_gfn);
246 pte |= PT_ACCESSED_MASK;
247 }
248 229
249 walker->ptes[walker->level - 1] = pte; 230 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte));
250 232
251 if (last_gpte) { 233 if (unlikely(permission_fault(mmu, pte_access, access))) {
252 int lvl = walker->level; 234 errcode |= PFERR_PRESENT_MASK;
253 gpa_t real_gpa; 235 goto error;
254 gfn_t gfn; 236 }
255 u32 ac;
256
257 gfn = gpte_to_gfn_lvl(pte, lvl);
258 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
259
260 if (PTTYPE == 32 &&
261 walker->level == PT_DIRECTORY_LEVEL &&
262 is_cpuid_PSE36())
263 gfn += pse36_gfn_delta(pte);
264
265 ac = write_fault | fetch_fault | user_fault;
266 237
267 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), 238 gfn = gpte_to_gfn_lvl(pte, walker->level);
268 ac); 239 gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
269 if (real_gpa == UNMAPPED_GVA)
270 return 0;
271 240
272 walker->gfn = real_gpa >> PAGE_SHIFT; 241 if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
242 gfn += pse36_gfn_delta(pte);
273 243
274 break; 244 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
275 } 245 if (real_gpa == UNMAPPED_GVA)
246 return 0;
276 247
277 pt_access &= FNAME(gpte_access)(vcpu, pte, false); 248 walker->gfn = real_gpa >> PAGE_SHIFT;
278 --walker->level;
279 }
280 249
281 if (unlikely(eperm)) { 250 if (!write_fault)
282 errcode |= PFERR_PRESENT_MASK; 251 protect_clean_gpte(&pte_access, pte);
283 goto error;
284 }
285 252
286 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 253 /*
287 int ret; 254 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
255 * place right.
256 *
257 * On a read fault, do nothing.
258 */
259 shift = write_fault >> ilog2(PFERR_WRITE_MASK);
260 shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
261 accessed_dirty &= pte >> shift;
288 262
289 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 263 if (unlikely(!accessed_dirty)) {
290 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 264 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
291 pte, pte|PT_DIRTY_MASK);
292 if (unlikely(ret < 0)) 265 if (unlikely(ret < 0))
293 goto error; 266 goto error;
294 else if (ret) 267 else if (ret)
295 goto retry_walk; 268 goto retry_walk;
296
297 mark_page_dirty(vcpu->kvm, table_gfn);
298 pte |= PT_DIRTY_MASK;
299 walker->ptes[walker->level - 1] = pte;
300 } 269 }
301 270
302 walker->pt_access = pt_access; 271 walker->pt_access = pt_access;
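
The three-line shift arithmetic above is the dense part of the rewrite: write_fault is either 0 or PFERR_WRITE_MASK (2), shifting it right by ilog2(PFERR_WRITE_MASK) normalizes it to 0 or 1, and multiplying by PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT turns that into the distance needed to slide the leaf's dirty bit into the accessed bit's position on write faults. One mask then answers "are all the A bits set, and D too if we are writing", and only when it comes up empty does the walker take the update_accessed_dirty_bits() slow path. A worked example with the constants hard-coded and a single level, assuming this reading of the code:

#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6
#define PT_ACCESSED_MASK (1ull << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK    (1ull << PT_DIRTY_SHIFT)
#define PFERR_WRITE_MASK  2u

static int needs_ad_update(uint64_t leaf_pte, unsigned access)
{
	unsigned write_fault = access & PFERR_WRITE_MASK;	/* 0 or 2 */
	/* ilog2(PFERR_WRITE_MASK) == 1, hard-coded here */
	unsigned shift = (write_fault >> 1) *
			 (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);	/* 0 or 1 */
	uint64_t accessed_dirty = PT_ACCESSED_MASK;

	/* The real walker ANDs once per level; one level shows the trick. */
	accessed_dirty &= leaf_pte;
	/* On a write fault the dirty bit is shifted into the accessed
	 * bit's slot, so the same mask tests both. */
	accessed_dirty &= leaf_pte >> shift;

	return accessed_dirty == 0;	/* empty => must go set A/D bits */
}

int main(void)
{
	uint64_t a_only = PT_ACCESSED_MASK;			/* A=1, D=0 */
	uint64_t a_d    = PT_ACCESSED_MASK | PT_DIRTY_MASK;	/* A=1, D=1 */

	printf("read,  A set:       update? %d\n", needs_ad_update(a_only, 0));
	printf("write, A set, D=0:  update? %d\n", needs_ad_update(a_only, PFERR_WRITE_MASK));
	printf("write, A and D set: update? %d\n", needs_ad_update(a_d, PFERR_WRITE_MASK));
	return 0;
}
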
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
368 return; 337 return;
369 338
370 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 339 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
371 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); 340 pte_access = sp->role.access & gpte_access(vcpu, gpte);
341 protect_clean_gpte(&pte_access, gpte);
372 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 342 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
373 if (mmu_invalid_pfn(pfn)) { 343 if (mmu_invalid_pfn(pfn))
374 kvm_release_pfn_clean(pfn);
375 return; 344 return;
376 }
377 345
378 /* 346 /*
379 * we call mmu_set_spte() with host_writable = true because that 347 * we call mmu_set_spte() with host_writable = true because that
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
443 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 411 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
444 continue; 412 continue;
445 413
446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, 414 pte_access = sp->role.access & gpte_access(vcpu, gpte);
447 true); 415 protect_clean_gpte(&pte_access, gpte);
448 gfn = gpte_to_gfn(gpte); 416 gfn = gpte_to_gfn(gpte);
449 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 417 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
450 pte_access & ACC_WRITE_MASK); 418 pte_access & ACC_WRITE_MASK);
451 if (mmu_invalid_pfn(pfn)) { 419 if (mmu_invalid_pfn(pfn))
452 kvm_release_pfn_clean(pfn);
453 break; 420 break;
454 }
455 421
456 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 422 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
457 NULL, PT_PAGE_TABLE_LEVEL, gfn, 423 NULL, PT_PAGE_TABLE_LEVEL, gfn,
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
798 764
799 gfn = gpte_to_gfn(gpte); 765 gfn = gpte_to_gfn(gpte);
800 pte_access = sp->role.access; 766 pte_access = sp->role.access;
801 pte_access &= FNAME(gpte_access)(vcpu, gpte, true); 767 pte_access &= gpte_access(vcpu, gpte);
768 protect_clean_gpte(&pte_access, gpte);
802 769
803 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) 770 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
804 continue; 771 continue;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab0..cfc258a6bf97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Kernel-based Virtual Machine -- Performane Monitoring Unit support 2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 * 3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates. 4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 * 5 *
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c8..d017df3899ef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
163 163
164#define MSR_INVALID 0xffffffffU 164#define MSR_INVALID 0xffffffffU
165 165
166static struct svm_direct_access_msrs { 166static const struct svm_direct_access_msrs {
167 u32 index; /* Index of the MSR */ 167 u32 index; /* Index of the MSR */
168 bool always; /* True if intercept is always on */ 168 bool always; /* True if intercept is always on */
169} direct_access_msrs[] = { 169} direct_access_msrs[] = {
@@ -400,7 +400,7 @@ struct svm_init_data {
400 int r; 400 int r;
401}; 401};
402 402
403static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 403static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
404 404
405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
406#define MSRS_RANGE_SIZE 2048 406#define MSRS_RANGE_SIZE 2048
@@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1146 1146
1147 svm_set_efer(&svm->vcpu, 0); 1147 svm_set_efer(&svm->vcpu, 0);
1148 save->dr6 = 0xffff0ff0; 1148 save->dr6 = 0xffff0ff0;
1149 save->dr7 = 0x400;
1150 kvm_set_rflags(&svm->vcpu, 2); 1149 kvm_set_rflags(&svm->vcpu, 2);
1151 save->rip = 0x0000fff0; 1150 save->rip = 0x0000fff0;
1152 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1151 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1643 mark_dirty(svm->vmcb, VMCB_SEG); 1642 mark_dirty(svm->vmcb, VMCB_SEG);
1644} 1643}
1645 1644
1646static void update_db_intercept(struct kvm_vcpu *vcpu) 1645static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
1647{ 1646{
1648 struct vcpu_svm *svm = to_svm(vcpu); 1647 struct vcpu_svm *svm = to_svm(vcpu);
1649 1648
@@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1663 vcpu->guest_debug = 0; 1662 vcpu->guest_debug = 0;
1664} 1663}
1665 1664
1666static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1667{
1668 struct vcpu_svm *svm = to_svm(vcpu);
1669
1670 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1671 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1672 else
1673 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1674
1675 mark_dirty(svm->vmcb, VMCB_DR);
1676
1677 update_db_intercept(vcpu);
1678}
1679
1680static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1665static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1681{ 1666{
1682 if (sd->next_asid > sd->max_asid) { 1667 if (sd->next_asid > sd->max_asid) {
@@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm)
1748 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1733 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1749 svm->vmcb->save.rflags &= 1734 svm->vmcb->save.rflags &=
1750 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1735 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1751 update_db_intercept(&svm->vcpu); 1736 update_db_bp_intercept(&svm->vcpu);
1752 } 1737 }
1753 1738
1754 if (svm->vcpu.guest_debug & 1739 if (svm->vcpu.guest_debug &
@@ -2063,7 +2048,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
2063 if (svm->nested.intercept & 1ULL) { 2048 if (svm->nested.intercept & 1ULL) {
2064 /* 2049 /*
2065 * The #vmexit can't be emulated here directly because this 2050 * The #vmexit can't be emulated here directly because this
2066 * code path runs with irqs and preemtion disabled. A 2051 * code path runs with irqs and preemption disabled. A
2067 * #vmexit emulation might sleep. Only signal request for 2052 * #vmexit emulation might sleep. Only signal request for
2068 * the #vmexit here. 2053 * the #vmexit here.
2069 */ 2054 */
@@ -2105,7 +2090,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2105 return kmap(page); 2090 return kmap(page);
2106 2091
2107error: 2092error:
2108 kvm_release_page_clean(page);
2109 kvm_inject_gp(&svm->vcpu, 0); 2093 kvm_inject_gp(&svm->vcpu, 0);
2110 2094
2111 return NULL; 2095 return NULL;
@@ -2409,7 +2393,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2409{ 2393{
2410 /* 2394 /*
2411 * This function merges the msr permission bitmaps of kvm and the 2395 * This function merges the msr permission bitmaps of kvm and the
2412 * nested vmcb. It is omptimized in that it only merges the parts where 2396 * nested vmcb. It is optimized in that it only merges the parts where
2413 * the kvm msr permission bitmap may contain zero bits 2397 * the kvm msr permission bitmap may contain zero bits
2414 */ 2398 */
2415 int i; 2399 int i;
@@ -3268,7 +3252,7 @@ static int pause_interception(struct vcpu_svm *svm)
3268 return 1; 3252 return 1;
3269} 3253}
3270 3254
3271static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3255static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3272 [SVM_EXIT_READ_CR0] = cr_interception, 3256 [SVM_EXIT_READ_CR0] = cr_interception,
3273 [SVM_EXIT_READ_CR3] = cr_interception, 3257 [SVM_EXIT_READ_CR3] = cr_interception,
3274 [SVM_EXIT_READ_CR4] = cr_interception, 3258 [SVM_EXIT_READ_CR4] = cr_interception,
@@ -3660,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3660 */ 3644 */
3661 svm->nmi_singlestep = true; 3645 svm->nmi_singlestep = true;
3662 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3646 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3663 update_db_intercept(vcpu); 3647 update_db_bp_intercept(vcpu);
3664} 3648}
3665 3649
3666static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3650static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3783,12 +3767,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3783 svm_complete_interrupts(svm); 3767 svm_complete_interrupts(svm);
3784} 3768}
3785 3769
3786#ifdef CONFIG_X86_64
3787#define R "r"
3788#else
3789#define R "e"
3790#endif
3791
3792static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3770static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3793{ 3771{
3794 struct vcpu_svm *svm = to_svm(vcpu); 3772 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3815,13 +3793,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3815 local_irq_enable(); 3793 local_irq_enable();
3816 3794
3817 asm volatile ( 3795 asm volatile (
3818 "push %%"R"bp; \n\t" 3796 "push %%" _ASM_BP "; \n\t"
3819 "mov %c[rbx](%[svm]), %%"R"bx \n\t" 3797 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
3820 "mov %c[rcx](%[svm]), %%"R"cx \n\t" 3798 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
3821 "mov %c[rdx](%[svm]), %%"R"dx \n\t" 3799 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
3822 "mov %c[rsi](%[svm]), %%"R"si \n\t" 3800 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
3823 "mov %c[rdi](%[svm]), %%"R"di \n\t" 3801 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
3824 "mov %c[rbp](%[svm]), %%"R"bp \n\t" 3802 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
3825#ifdef CONFIG_X86_64 3803#ifdef CONFIG_X86_64
3826 "mov %c[r8](%[svm]), %%r8 \n\t" 3804 "mov %c[r8](%[svm]), %%r8 \n\t"
3827 "mov %c[r9](%[svm]), %%r9 \n\t" 3805 "mov %c[r9](%[svm]), %%r9 \n\t"
@@ -3834,20 +3812,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3834#endif 3812#endif
3835 3813
3836 /* Enter guest mode */ 3814 /* Enter guest mode */
3837 "push %%"R"ax \n\t" 3815 "push %%" _ASM_AX " \n\t"
3838 "mov %c[vmcb](%[svm]), %%"R"ax \n\t" 3816 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
3839 __ex(SVM_VMLOAD) "\n\t" 3817 __ex(SVM_VMLOAD) "\n\t"
3840 __ex(SVM_VMRUN) "\n\t" 3818 __ex(SVM_VMRUN) "\n\t"
3841 __ex(SVM_VMSAVE) "\n\t" 3819 __ex(SVM_VMSAVE) "\n\t"
3842 "pop %%"R"ax \n\t" 3820 "pop %%" _ASM_AX " \n\t"
3843 3821
3844 /* Save guest registers, load host registers */ 3822 /* Save guest registers, load host registers */
3845 "mov %%"R"bx, %c[rbx](%[svm]) \n\t" 3823 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
3846 "mov %%"R"cx, %c[rcx](%[svm]) \n\t" 3824 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
3847 "mov %%"R"dx, %c[rdx](%[svm]) \n\t" 3825 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
3848 "mov %%"R"si, %c[rsi](%[svm]) \n\t" 3826 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
3849 "mov %%"R"di, %c[rdi](%[svm]) \n\t" 3827 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
3850 "mov %%"R"bp, %c[rbp](%[svm]) \n\t" 3828 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
3851#ifdef CONFIG_X86_64 3829#ifdef CONFIG_X86_64
3852 "mov %%r8, %c[r8](%[svm]) \n\t" 3830 "mov %%r8, %c[r8](%[svm]) \n\t"
3853 "mov %%r9, %c[r9](%[svm]) \n\t" 3831 "mov %%r9, %c[r9](%[svm]) \n\t"
@@ -3858,7 +3836,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3858 "mov %%r14, %c[r14](%[svm]) \n\t" 3836 "mov %%r14, %c[r14](%[svm]) \n\t"
3859 "mov %%r15, %c[r15](%[svm]) \n\t" 3837 "mov %%r15, %c[r15](%[svm]) \n\t"
3860#endif 3838#endif
3861 "pop %%"R"bp" 3839 "pop %%" _ASM_BP
3862 : 3840 :
3863 : [svm]"a"(svm), 3841 : [svm]"a"(svm),
3864 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 3842 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -3879,9 +3857,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3879 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 3857 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3880#endif 3858#endif
3881 : "cc", "memory" 3859 : "cc", "memory"
3882 , R"bx", R"cx", R"dx", R"si", R"di"
3883#ifdef CONFIG_X86_64 3860#ifdef CONFIG_X86_64
3861 , "rbx", "rcx", "rdx", "rsi", "rdi"
3884 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 3862 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3863#else
3864 , "ebx", "ecx", "edx", "esi", "edi"
3885#endif 3865#endif
3886 ); 3866 );
3887 3867
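
The open-coded R register-name macro is gone in favour of the kernel's _ASM_* helpers from asm/asm.h, which expand to the correct 32- or 64-bit register name and concatenate into the asm template string. A stripped-down user-space illustration of the same pattern; the macro definitions below are simplified stand-ins, not the kernel's, and the example is x86 only:

#include <stdio.h>

#ifdef __x86_64__
# define _ASM_BP "rbp"
#else
# define _ASM_BP "ebp"
#endif

int main(void)
{
	unsigned long bp;

	/* Adjacent string literals concatenate, so one asm body serves
	 * both 32- and 64-bit builds. */
	asm volatile("mov %%" _ASM_BP ", %0" : "=r"(bp));
	printf("current %s: %#lx\n", _ASM_BP, bp);
	return 0;
}
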
@@ -3941,8 +3921,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3941 mark_all_clean(svm->vmcb); 3921 mark_all_clean(svm->vmcb);
3942} 3922}
3943 3923
3944#undef R
3945
3946static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3924static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3947{ 3925{
3948 struct vcpu_svm *svm = to_svm(vcpu); 3926 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4069,7 +4047,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4069#define POST_MEM(exit) { .exit_code = (exit), \ 4047#define POST_MEM(exit) { .exit_code = (exit), \
4070 .stage = X86_ICPT_POST_MEMACCESS, } 4048 .stage = X86_ICPT_POST_MEMACCESS, }
4071 4049
4072static struct __x86_intercept { 4050static const struct __x86_intercept {
4073 u32 exit_code; 4051 u32 exit_code;
4074 enum x86_intercept_stage stage; 4052 enum x86_intercept_stage stage;
4075} x86_intercept_map[] = { 4053} x86_intercept_map[] = {
@@ -4260,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4260 .vcpu_load = svm_vcpu_load, 4238 .vcpu_load = svm_vcpu_load,
4261 .vcpu_put = svm_vcpu_put, 4239 .vcpu_put = svm_vcpu_put,
4262 4240
4263 .set_guest_debug = svm_guest_debug, 4241 .update_db_bp_intercept = update_db_bp_intercept,
4264 .get_msr = svm_get_msr, 4242 .get_msr = svm_get_msr,
4265 .set_msr = svm_set_msr, 4243 .set_msr = svm_set_msr,
4266 .get_segment_base = svm_get_segment_base, 4244 .get_segment_base = svm_get_segment_base,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc647f34..000000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
15#include <linux/kvm_host.h>
16#include <linux/kvm.h>
17#include <linux/hrtimer.h>
18#include <linux/atomic.h>
19#include "kvm_timer.h"
20
21enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
22{
23 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
24 struct kvm_vcpu *vcpu = ktimer->vcpu;
25 wait_queue_head_t *q = &vcpu->wq;
26
27 /*
28 * There is a race window between reading and incrementing, but we do
29 * not care about potentially losing timer events in the !reinject
30 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
31 * in vcpu_enter_guest.
32 */
33 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
34 atomic_inc(&ktimer->pending);
35 /* FIXME: this code should not know anything about vcpus */
36 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
37 }
38
39 if (waitqueue_active(q))
40 wake_up_interruptible(q);
41
42 if (ktimer->t_ops->is_periodic(ktimer)) {
43 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
44 return HRTIMER_RESTART;
45 } else
46 return HRTIMER_NORESTART;
47}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 851aa7c3b890..ad6b1dd06f8b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO);
127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
128module_param(ple_window, int, S_IRUGO); 128module_param(ple_window, int, S_IRUGO);
129 129
130extern const ulong vmx_return;
131
130#define NR_AUTOLOAD_MSRS 8 132#define NR_AUTOLOAD_MSRS 8
131#define VMCS02_POOL_SIZE 1 133#define VMCS02_POOL_SIZE 1
132 134
@@ -405,16 +407,16 @@ struct vcpu_vmx {
405 struct { 407 struct {
406 int vm86_active; 408 int vm86_active;
407 ulong save_rflags; 409 ulong save_rflags;
410 struct kvm_segment segs[8];
411 } rmode;
412 struct {
413 u32 bitmask; /* 4 bits per segment (1 bit per field) */
408 struct kvm_save_segment { 414 struct kvm_save_segment {
409 u16 selector; 415 u16 selector;
410 unsigned long base; 416 unsigned long base;
411 u32 limit; 417 u32 limit;
412 u32 ar; 418 u32 ar;
413 } tr, es, ds, fs, gs; 419 } seg[8];
414 } rmode;
415 struct {
416 u32 bitmask; /* 4 bits per segment (1 bit per field) */
417 struct kvm_save_segment seg[8];
418 } segment_cache; 420 } segment_cache;
419 int vpid; 421 int vpid;
420 bool emulation_required; 422 bool emulation_required;
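
The rmode bookkeeping loses its named per-segment fields (tr, es, ds, fs, gs) in favour of a single segs[] array indexed by the VCPU_SREG_* constants, which is what lets the later hunks replace switch statements with a plain array access. A hypothetical before/after sketch of that data-structure change (the types and names below are invented for illustration):

#include <stdio.h>

enum seg { SEG_ES, SEG_DS, SEG_FS, SEG_GS, SEG_TR, NR_SEGS };

struct seg_state { unsigned short selector; unsigned long base; };

struct rmode_old {
	struct seg_state tr, es, ds, fs, gs;	/* one named field per segment */
};

struct rmode_new {
	struct seg_state segs[NR_SEGS];		/* indexed by enum seg */
};

/* With the array, callers need no switch to pick the right field. */
static struct seg_state *get_seg(struct rmode_new *rm, enum seg s)
{
	return &rm->segs[s];
}

int main(void)
{
	struct rmode_new rm = { { { 0 } } };

	get_seg(&rm, SEG_TR)->selector = 0x18;
	printf("TR selector: %#x\n", get_seg(&rm, SEG_TR)->selector);
	return 0;
}
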
@@ -450,7 +452,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
450#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 452#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
451 [number##_HIGH] = VMCS12_OFFSET(name)+4 453 [number##_HIGH] = VMCS12_OFFSET(name)+4
452 454
453static unsigned short vmcs_field_to_offset_table[] = { 455static const unsigned short vmcs_field_to_offset_table[] = {
454 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 456 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
455 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 457 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
456 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 458 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
@@ -596,10 +598,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
596static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 598static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
597{ 599{
598 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); 600 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
599 if (is_error_page(page)) { 601 if (is_error_page(page))
600 kvm_release_page_clean(page);
601 return NULL; 602 return NULL;
602 } 603
603 return page; 604 return page;
604} 605}
605 606
@@ -667,7 +668,7 @@ static struct vmx_capability {
667 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 668 .ar_bytes = GUEST_##seg##_AR_BYTES, \
668 } 669 }
669 670
670static struct kvm_vmx_segment_field { 671static const struct kvm_vmx_segment_field {
671 unsigned selector; 672 unsigned selector;
672 unsigned base; 673 unsigned base;
673 unsigned limit; 674 unsigned limit;
@@ -1343,7 +1344,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1343 guest_efer = vmx->vcpu.arch.efer; 1344 guest_efer = vmx->vcpu.arch.efer;
1344 1345
1345 /* 1346 /*
1346 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 1347 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1347 * outside long mode 1348 * outside long mode
1348 */ 1349 */
1349 ignore_bits = EFER_NX | EFER_SCE; 1350 ignore_bits = EFER_NX | EFER_SCE;
@@ -1995,7 +1996,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
1995#endif 1996#endif
1996 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 1997 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1997 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 1998 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1998 CPU_BASED_RDPMC_EXITING | 1999 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
1999 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2000 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2000 /* 2001 /*
2001 * We can allow some features even when not supported by the 2002 * We can allow some features even when not supported by the
@@ -2291,16 +2292,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2291 } 2292 }
2292} 2293}
2293 2294
2294static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
2295{
2296 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
2297 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
2298 else
2299 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2300
2301 update_exception_bitmap(vcpu);
2302}
2303
2304static __init int cpu_has_kvm_support(void) 2295static __init int cpu_has_kvm_support(void)
2305{ 2296{
2306 return cpu_has_vmx(); 2297 return cpu_has_vmx();
@@ -2698,20 +2689,17 @@ static __exit void hardware_unsetup(void)
2698 free_kvm_area(); 2689 free_kvm_area();
2699} 2690}
2700 2691
2701static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) 2692static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
2702{ 2693{
2703 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2694 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2695 struct kvm_segment tmp = *save;
2704 2696
2705 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { 2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2706 vmcs_write16(sf->selector, save->selector); 2698 tmp.base = vmcs_readl(sf->base);
2707 vmcs_writel(sf->base, save->base); 2699 tmp.selector = vmcs_read16(sf->selector);
2708 vmcs_write32(sf->limit, save->limit); 2700 tmp.s = 1;
2709 vmcs_write32(sf->ar_bytes, save->ar);
2710 } else {
2711 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
2712 << AR_DPL_SHIFT;
2713 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
2714 } 2701 }
2702 vmx_set_segment(vcpu, &tmp, seg);
2715} 2703}
2716 2704
2717static void enter_pmode(struct kvm_vcpu *vcpu) 2705static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2724,10 +2712,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2724 2712
2725 vmx_segment_cache_clear(vmx); 2713 vmx_segment_cache_clear(vmx);
2726 2714
2727 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); 2715 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2728 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
2729 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
2730 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
2731 2716
2732 flags = vmcs_readl(GUEST_RFLAGS); 2717 flags = vmcs_readl(GUEST_RFLAGS);
2733 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2718 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2742,10 +2727,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2742 if (emulate_invalid_guest_state) 2727 if (emulate_invalid_guest_state)
2743 return; 2728 return;
2744 2729
2745 fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); 2730 fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2746 fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); 2731 fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2747 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 2732 fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2748 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 2733 fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2749 2734
2750 vmx_segment_cache_clear(vmx); 2735 vmx_segment_cache_clear(vmx);
2751 2736
@@ -2773,14 +2758,10 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2773 return kvm->arch.tss_addr; 2758 return kvm->arch.tss_addr;
2774} 2759}
2775 2760
2776static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 2761static void fix_rmode_seg(int seg, struct kvm_segment *save)
2777{ 2762{
2778 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2763 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2779 2764
2780 save->selector = vmcs_read16(sf->selector);
2781 save->base = vmcs_readl(sf->base);
2782 save->limit = vmcs_read32(sf->limit);
2783 save->ar = vmcs_read32(sf->ar_bytes);
2784 vmcs_write16(sf->selector, save->base >> 4); 2765 vmcs_write16(sf->selector, save->base >> 4);
2785 vmcs_write32(sf->base, save->base & 0xffff0); 2766 vmcs_write32(sf->base, save->base & 0xffff0);
2786 vmcs_write32(sf->limit, 0xffff); 2767 vmcs_write32(sf->limit, 0xffff);
@@ -2800,9 +2781,16 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2800 if (enable_unrestricted_guest) 2781 if (enable_unrestricted_guest)
2801 return; 2782 return;
2802 2783
2784 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2785 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2786 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2787 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2788 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2789
2803 vmx->emulation_required = 1; 2790 vmx->emulation_required = 1;
2804 vmx->rmode.vm86_active = 1; 2791 vmx->rmode.vm86_active = 1;
2805 2792
2793
2806 /* 2794 /*
2807 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2795 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2808 * vcpu. Call it here with phys address pointing 16M below 4G. 2796 * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2817,14 +2805,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2817 2805
2818 vmx_segment_cache_clear(vmx); 2806 vmx_segment_cache_clear(vmx);
2819 2807
2820 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
2821 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
2822 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 2808 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
2823
2824 vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
2825 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 2809 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2826
2827 vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
2828 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 2810 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2829 2811
2830 flags = vmcs_readl(GUEST_RFLAGS); 2812 flags = vmcs_readl(GUEST_RFLAGS);
@@ -3117,35 +3099,24 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3117 struct kvm_segment *var, int seg) 3099 struct kvm_segment *var, int seg)
3118{ 3100{
3119 struct vcpu_vmx *vmx = to_vmx(vcpu); 3101 struct vcpu_vmx *vmx = to_vmx(vcpu);
3120 struct kvm_save_segment *save;
3121 u32 ar; 3102 u32 ar;
3122 3103
3123 if (vmx->rmode.vm86_active 3104 if (vmx->rmode.vm86_active
3124 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES 3105 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3125 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS 3106 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3126 || seg == VCPU_SREG_GS) 3107 || seg == VCPU_SREG_GS)) {
3127 && !emulate_invalid_guest_state) { 3108 *var = vmx->rmode.segs[seg];
3128 switch (seg) {
3129 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
3130 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
3131 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
3132 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
3133 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
3134 default: BUG();
3135 }
3136 var->selector = save->selector;
3137 var->base = save->base;
3138 var->limit = save->limit;
3139 ar = save->ar;
3140 if (seg == VCPU_SREG_TR 3109 if (seg == VCPU_SREG_TR
3141 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3110 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3142 goto use_saved_rmode_seg; 3111 return;
3112 var->base = vmx_read_guest_seg_base(vmx, seg);
3113 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3114 return;
3143 } 3115 }
3144 var->base = vmx_read_guest_seg_base(vmx, seg); 3116 var->base = vmx_read_guest_seg_base(vmx, seg);
3145 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3117 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3146 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3118 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3147 ar = vmx_read_guest_seg_ar(vmx, seg); 3119 ar = vmx_read_guest_seg_ar(vmx, seg);
3148use_saved_rmode_seg:
3149 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 3120 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3150 ar = 0; 3121 ar = 0;
3151 var->type = ar & 15; 3122 var->type = ar & 15;
@@ -3227,23 +3198,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3227 struct kvm_segment *var, int seg) 3198 struct kvm_segment *var, int seg)
3228{ 3199{
3229 struct vcpu_vmx *vmx = to_vmx(vcpu); 3200 struct vcpu_vmx *vmx = to_vmx(vcpu);
3230 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3201 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3231 u32 ar; 3202 u32 ar;
3232 3203
3233 vmx_segment_cache_clear(vmx); 3204 vmx_segment_cache_clear(vmx);
3234 3205
3235 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3206 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
3236 vmcs_write16(sf->selector, var->selector); 3207 vmcs_write16(sf->selector, var->selector);
3237 vmx->rmode.tr.selector = var->selector; 3208 vmx->rmode.segs[VCPU_SREG_TR] = *var;
3238 vmx->rmode.tr.base = var->base;
3239 vmx->rmode.tr.limit = var->limit;
3240 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
3241 return; 3209 return;
3242 } 3210 }
3243 vmcs_writel(sf->base, var->base); 3211 vmcs_writel(sf->base, var->base);
3244 vmcs_write32(sf->limit, var->limit); 3212 vmcs_write32(sf->limit, var->limit);
3245 vmcs_write16(sf->selector, var->selector); 3213 vmcs_write16(sf->selector, var->selector);
3246 if (vmx->rmode.vm86_active && var->s) { 3214 if (vmx->rmode.vm86_active && var->s) {
3215 vmx->rmode.segs[seg] = *var;
3247 /* 3216 /*
3248 * Hack real-mode segments into vm86 compatibility. 3217 * Hack real-mode segments into vm86 compatibility.
3249 */ 3218 */
@@ -3258,7 +3227,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3258 * qemu binaries. 3227 * qemu binaries.
3259 * IA32 arch specifies that at the time of processor reset the 3228 * IA32 arch specifies that at the time of processor reset the
3260 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3229 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3261 * is setting it to 0 in the usedland code. This causes invalid guest 3230 * is setting it to 0 in the userland code. This causes invalid guest
3262 * state vmexit when "unrestricted guest" mode is turned on. 3231 * state vmexit when "unrestricted guest" mode is turned on.
3263 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3232 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3264 * tree. Newer qemu binaries with that qemu fix would not need this 3233 * tree. Newer qemu binaries with that qemu fix would not need this
@@ -3288,16 +3257,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3288 vmcs_readl(GUEST_CS_BASE) >> 4); 3257 vmcs_readl(GUEST_CS_BASE) >> 4);
3289 break; 3258 break;
3290 case VCPU_SREG_ES: 3259 case VCPU_SREG_ES:
3291 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3292 break;
3293 case VCPU_SREG_DS: 3260 case VCPU_SREG_DS:
3294 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3295 break;
3296 case VCPU_SREG_GS: 3261 case VCPU_SREG_GS:
3297 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3298 break;
3299 case VCPU_SREG_FS: 3262 case VCPU_SREG_FS:
3300 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 3263 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3301 break; 3264 break;
3302 case VCPU_SREG_SS: 3265 case VCPU_SREG_SS:
3303 vmcs_write16(GUEST_SS_SELECTOR, 3266 vmcs_write16(GUEST_SS_SELECTOR,
@@ -3351,9 +3314,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3351 3314
3352 if (var.base != (var.selector << 4)) 3315 if (var.base != (var.selector << 4))
3353 return false; 3316 return false;
3354 if (var.limit != 0xffff) 3317 if (var.limit < 0xffff)
3355 return false; 3318 return false;
3356 if (ar != 0xf3) 3319 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
3357 return false; 3320 return false;
3358 3321
3359 return true; 3322 return true;
@@ -3605,7 +3568,7 @@ out:
3605 3568
3606static void seg_setup(int seg) 3569static void seg_setup(int seg)
3607{ 3570{
3608 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3571 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3609 unsigned int ar; 3572 unsigned int ar;
3610 3573
3611 vmcs_write16(sf->selector, 0); 3574 vmcs_write16(sf->selector, 0);
@@ -3770,8 +3733,7 @@ static void vmx_set_constant_host_state(void)
3770 native_store_idt(&dt); 3733 native_store_idt(&dt);
3771 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 3734 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3772 3735
3773 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); 3736 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3774 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3775 3737
3776 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 3738 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3777 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 3739 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
@@ -4005,8 +3967,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4005 kvm_rip_write(vcpu, 0); 3967 kvm_rip_write(vcpu, 0);
4006 kvm_register_write(vcpu, VCPU_REGS_RSP, 0); 3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
4007 3969
4008 vmcs_writel(GUEST_DR7, 0x400);
4009
4010 vmcs_writel(GUEST_GDTR_BASE, 0); 3970 vmcs_writel(GUEST_GDTR_BASE, 0);
4011 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4012 3972
@@ -4456,7 +4416,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4456 hypercall[2] = 0xc1; 4416 hypercall[2] = 0xc1;
4457} 4417}
4458 4418
4459/* called to set cr0 as approriate for a mov-to-cr0 exit. */ 4419/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4460static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4420static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4461{ 4421{
4462 if (to_vmx(vcpu)->nested.vmxon && 4422 if (to_vmx(vcpu)->nested.vmxon &&
@@ -5701,7 +5661,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
5701 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5661 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5702 * to be done to userspace and return 0. 5662 * to be done to userspace and return 0.
5703 */ 5663 */
5704static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5664static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5705 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 5665 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
5706 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5666 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
5707 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5667 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -6229,17 +6189,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6229 msrs[i].host); 6189 msrs[i].host);
6230} 6190}
6231 6191
6232#ifdef CONFIG_X86_64
6233#define R "r"
6234#define Q "q"
6235#else
6236#define R "e"
6237#define Q "l"
6238#endif
6239
6240static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 6192static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6241{ 6193{
6242 struct vcpu_vmx *vmx = to_vmx(vcpu); 6194 struct vcpu_vmx *vmx = to_vmx(vcpu);
6195 unsigned long debugctlmsr;
6243 6196
6244 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { 6197 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6245 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6198 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6279,34 +6232,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6279 vmx_set_interrupt_shadow(vcpu, 0); 6232 vmx_set_interrupt_shadow(vcpu, 0);
6280 6233
6281 atomic_switch_perf_msrs(vmx); 6234 atomic_switch_perf_msrs(vmx);
6235 debugctlmsr = get_debugctlmsr();
6282 6236
6283 vmx->__launched = vmx->loaded_vmcs->launched; 6237 vmx->__launched = vmx->loaded_vmcs->launched;
6284 asm( 6238 asm(
6285 /* Store host registers */ 6239 /* Store host registers */
6286 "push %%"R"dx; push %%"R"bp;" 6240 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6287 "push %%"R"cx \n\t" /* placeholder for guest rcx */ 6241 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6288 "push %%"R"cx \n\t" 6242 "push %%" _ASM_CX " \n\t"
6289 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 6243 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6290 "je 1f \n\t" 6244 "je 1f \n\t"
6291 "mov %%"R"sp, %c[host_rsp](%0) \n\t" 6245 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6292 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 6246 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
6293 "1: \n\t" 6247 "1: \n\t"
6294 /* Reload cr2 if changed */ 6248 /* Reload cr2 if changed */
6295 "mov %c[cr2](%0), %%"R"ax \n\t" 6249 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
6296 "mov %%cr2, %%"R"dx \n\t" 6250 "mov %%cr2, %%" _ASM_DX " \n\t"
6297 "cmp %%"R"ax, %%"R"dx \n\t" 6251 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6298 "je 2f \n\t" 6252 "je 2f \n\t"
6299 "mov %%"R"ax, %%cr2 \n\t" 6253 "mov %%" _ASM_AX", %%cr2 \n\t"
6300 "2: \n\t" 6254 "2: \n\t"
6301 /* Check if vmlaunch of vmresume is needed */ 6255 /* Check if vmlaunch of vmresume is needed */
6302 "cmpl $0, %c[launched](%0) \n\t" 6256 "cmpl $0, %c[launched](%0) \n\t"
6303 /* Load guest registers. Don't clobber flags. */ 6257 /* Load guest registers. Don't clobber flags. */
6304 "mov %c[rax](%0), %%"R"ax \n\t" 6258 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
6305 "mov %c[rbx](%0), %%"R"bx \n\t" 6259 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
6306 "mov %c[rdx](%0), %%"R"dx \n\t" 6260 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
6307 "mov %c[rsi](%0), %%"R"si \n\t" 6261 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
6308 "mov %c[rdi](%0), %%"R"di \n\t" 6262 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
6309 "mov %c[rbp](%0), %%"R"bp \n\t" 6263 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
6310#ifdef CONFIG_X86_64 6264#ifdef CONFIG_X86_64
6311 "mov %c[r8](%0), %%r8 \n\t" 6265 "mov %c[r8](%0), %%r8 \n\t"
6312 "mov %c[r9](%0), %%r9 \n\t" 6266 "mov %c[r9](%0), %%r9 \n\t"
@@ -6317,24 +6271,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6317 "mov %c[r14](%0), %%r14 \n\t" 6271 "mov %c[r14](%0), %%r14 \n\t"
6318 "mov %c[r15](%0), %%r15 \n\t" 6272 "mov %c[r15](%0), %%r15 \n\t"
6319#endif 6273#endif
6320 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ 6274 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
6321 6275
6322 /* Enter guest mode */ 6276 /* Enter guest mode */
6323 "jne .Llaunched \n\t" 6277 "jne 1f \n\t"
6324 __ex(ASM_VMX_VMLAUNCH) "\n\t" 6278 __ex(ASM_VMX_VMLAUNCH) "\n\t"
6325 "jmp .Lkvm_vmx_return \n\t" 6279 "jmp 2f \n\t"
6326 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 6280 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
6327 ".Lkvm_vmx_return: " 6281 "2: "
6328 /* Save guest registers, load host registers, keep flags */ 6282 /* Save guest registers, load host registers, keep flags */
6329 "mov %0, %c[wordsize](%%"R"sp) \n\t" 6283 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
6330 "pop %0 \n\t" 6284 "pop %0 \n\t"
6331 "mov %%"R"ax, %c[rax](%0) \n\t" 6285 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
6332 "mov %%"R"bx, %c[rbx](%0) \n\t" 6286 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
6333 "pop"Q" %c[rcx](%0) \n\t" 6287 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
6334 "mov %%"R"dx, %c[rdx](%0) \n\t" 6288 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
6335 "mov %%"R"si, %c[rsi](%0) \n\t" 6289 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
6336 "mov %%"R"di, %c[rdi](%0) \n\t" 6290 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
6337 "mov %%"R"bp, %c[rbp](%0) \n\t" 6291 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
6338#ifdef CONFIG_X86_64 6292#ifdef CONFIG_X86_64
6339 "mov %%r8, %c[r8](%0) \n\t" 6293 "mov %%r8, %c[r8](%0) \n\t"
6340 "mov %%r9, %c[r9](%0) \n\t" 6294 "mov %%r9, %c[r9](%0) \n\t"
@@ -6345,11 +6299,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6345 "mov %%r14, %c[r14](%0) \n\t" 6299 "mov %%r14, %c[r14](%0) \n\t"
6346 "mov %%r15, %c[r15](%0) \n\t" 6300 "mov %%r15, %c[r15](%0) \n\t"
6347#endif 6301#endif
6348 "mov %%cr2, %%"R"ax \n\t" 6302 "mov %%cr2, %%" _ASM_AX " \n\t"
6349 "mov %%"R"ax, %c[cr2](%0) \n\t" 6303 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
6350 6304
6351 "pop %%"R"bp; pop %%"R"dx \n\t" 6305 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
6352 "setbe %c[fail](%0) \n\t" 6306 "setbe %c[fail](%0) \n\t"
6307 ".pushsection .rodata \n\t"
6308 ".global vmx_return \n\t"
6309 "vmx_return: " _ASM_PTR " 2b \n\t"
6310 ".popsection"
6353 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6311 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
6354 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 6312 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6355 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6313 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
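
The old code took the address of a compiler-local label (.Lkvm_vmx_return) with a separate mov in vmx_set_constant_host_state(); the rewrite instead publishes the exit point as a real symbol, vmx_return, emitted into .rodata from inside the asm block and then written straight into HOST_RIP. A minimal user-space sketch of the same .pushsection/.global technique, assuming x86-64 (hence .quad where the kernel uses _ASM_PTR); toolchains that default to PIE may need -no-pie to link it:

#include <stdio.h>

/* Defined by the inline asm below: a constant holding the address of a
 * label inside the asm body. */
extern const unsigned long my_return;

static unsigned long get_resume_point(void)
{
	asm volatile(
		"1:			\n\t"	/* the address we want to publish */
		".pushsection .rodata	\n\t"
		".global my_return	\n\t"
		"my_return: .quad 1b	\n\t"
		".popsection");
	return my_return;
}

int main(void)
{
	printf("resume point lives at %#lx\n", get_resume_point());
	return 0;
}
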
@@ -6374,12 +6332,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6374 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 6332 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6375 [wordsize]"i"(sizeof(ulong)) 6333 [wordsize]"i"(sizeof(ulong))
6376 : "cc", "memory" 6334 : "cc", "memory"
6377 , R"ax", R"bx", R"di", R"si"
6378#ifdef CONFIG_X86_64 6335#ifdef CONFIG_X86_64
6336 , "rax", "rbx", "rdi", "rsi"
6379 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 6337 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6338#else
6339 , "eax", "ebx", "edi", "esi"
6380#endif 6340#endif
6381 ); 6341 );
6382 6342
6343 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6344 if (debugctlmsr)
6345 update_debugctlmsr(debugctlmsr);
6346
6383#ifndef CONFIG_X86_64 6347#ifndef CONFIG_X86_64
6384 /* 6348 /*
6385 * The sysexit path does not restore ds/es, so we must set them to 6349 * The sysexit path does not restore ds/es, so we must set them to
@@ -6424,9 +6388,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6424 vmx_complete_interrupts(vmx); 6388 vmx_complete_interrupts(vmx);
6425} 6389}
6426 6390
6427#undef R
6428#undef Q
6429
6430static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6391static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6431{ 6392{
6432 struct vcpu_vmx *vmx = to_vmx(vcpu); 6393 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7281,7 +7242,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7281 .vcpu_load = vmx_vcpu_load, 7242 .vcpu_load = vmx_vcpu_load,
7282 .vcpu_put = vmx_vcpu_put, 7243 .vcpu_put = vmx_vcpu_put,
7283 7244
7284 .set_guest_debug = set_guest_debug, 7245 .update_db_bp_intercept = update_exception_bitmap,
7285 .get_msr = vmx_get_msr, 7246 .get_msr = vmx_get_msr,
7286 .set_msr = vmx_set_msr, 7247 .set_msr = vmx_set_msr,
7287 .get_segment_base = vmx_get_segment_base, 7248 .get_segment_base = vmx_get_segment_base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f09552572fa..1eefebe5d727 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
246 246
247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
248{ 248{
249 if (irqchip_in_kernel(vcpu->kvm)) 249 return vcpu->arch.apic_base;
250 return vcpu->arch.apic_base;
251 else
252 return vcpu->arch.apic_base;
253} 250}
254EXPORT_SYMBOL_GPL(kvm_get_apic_base); 251EXPORT_SYMBOL_GPL(kvm_get_apic_base);
255 252
256void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 253void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257{ 254{
258 /* TODO: reserve bits check */ 255 /* TODO: reserve bits check */
259 if (irqchip_in_kernel(vcpu->kvm)) 256 kvm_lapic_set_base(vcpu, data);
260 kvm_lapic_set_base(vcpu, data);
261 else
262 vcpu->arch.apic_base = data;
263} 257}
264EXPORT_SYMBOL_GPL(kvm_set_apic_base); 258EXPORT_SYMBOL_GPL(kvm_set_apic_base);
265 259
@@ -698,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
698} 692}
699EXPORT_SYMBOL_GPL(kvm_get_cr8); 693EXPORT_SYMBOL_GPL(kvm_get_cr8);
700 694
695static void kvm_update_dr7(struct kvm_vcpu *vcpu)
696{
697 unsigned long dr7;
698
699 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
700 dr7 = vcpu->arch.guest_debug_dr7;
701 else
702 dr7 = vcpu->arch.dr7;
703 kvm_x86_ops->set_dr7(vcpu, dr7);
704 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
705}
706
701static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 707static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
702{ 708{
703 switch (dr) { 709 switch (dr) {
@@ -723,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
723 if (val & 0xffffffff00000000ULL) 729 if (val & 0xffffffff00000000ULL)
724 return -1; /* #GP */ 730 return -1; /* #GP */
725 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 731 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
726 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 732 kvm_update_dr7(vcpu);
727 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
728 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
729 }
730 break; 733 break;
731 } 734 }
732 735
@@ -823,7 +826,7 @@ static u32 msrs_to_save[] = {
823 826
824static unsigned num_msrs_to_save; 827static unsigned num_msrs_to_save;
825 828
826static u32 emulated_msrs[] = { 829static const u32 emulated_msrs[] = {
827 MSR_IA32_TSCDEADLINE, 830 MSR_IA32_TSCDEADLINE,
828 MSR_IA32_MISC_ENABLE, 831 MSR_IA32_MISC_ENABLE,
829 MSR_IA32_MCG_STATUS, 832 MSR_IA32_MCG_STATUS,
@@ -1097,7 +1100,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1097 * For each generation, we track the original measured 1100 * For each generation, we track the original measured
1098 * nanosecond time, offset, and write, so if TSCs are in 1101 * nanosecond time, offset, and write, so if TSCs are in
1099 * sync, we can match exact offset, and if not, we can match 1102 * sync, we can match exact offset, and if not, we can match
1100 * exact software computaion in compute_guest_tsc() 1103 * exact software computation in compute_guest_tsc()
1101 * 1104 *
1102 * These values are tracked in kvm->arch.cur_xxx variables. 1105 * These values are tracked in kvm->arch.cur_xxx variables.
1103 */ 1106 */
@@ -1140,6 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1140 unsigned long this_tsc_khz; 1143 unsigned long this_tsc_khz;
1141 s64 kernel_ns, max_kernel_ns; 1144 s64 kernel_ns, max_kernel_ns;
1142 u64 tsc_timestamp; 1145 u64 tsc_timestamp;
1146 u8 pvclock_flags;
1143 1147
1144 /* Keep irq disabled to prevent changes to the clock */ 1148 /* Keep irq disabled to prevent changes to the clock */
1145 local_irq_save(flags); 1149 local_irq_save(flags);
@@ -1221,7 +1225,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1221 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1222 vcpu->last_kernel_ns = kernel_ns; 1226 vcpu->last_kernel_ns = kernel_ns;
1223 vcpu->last_guest_tsc = tsc_timestamp; 1227 vcpu->last_guest_tsc = tsc_timestamp;
1224 vcpu->hv_clock.flags = 0; 1228
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1225 1236
1226 /* 1237 /*
1227 * The interface expects us to write an even number signaling that the 1238 * The interface expects us to write an even number signaling that the
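
The KVMCLOCK_CTRL path further down no longer writes the flag itself: kvm_set_guest_paused() just records pvclock_set_guest_stopped_request, and the request is consumed here, when hv_clock.flags is next rebuilt, instead of being set directly and then lost to the old unconditional "flags = 0". A toy sketch of that defer-to-the-updater pattern (names and the flag value below are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define PVCLOCK_GUEST_STOPPED (1 << 0)	/* illustrative value */

struct toy_vcpu {
	bool pvclock_set_guest_stopped_request;
	unsigned char hv_clock_flags;	/* stands in for the guest-visible field */
};

/* The ioctl path only records a request... */
static void set_guest_paused(struct toy_vcpu *v)
{
	v->pvclock_set_guest_stopped_request = true;
}

/* ...and the time-update path folds it into the flags it publishes. */
static void guest_time_update(struct toy_vcpu *v)
{
	unsigned char flags = 0;

	if (v->pvclock_set_guest_stopped_request) {
		flags |= PVCLOCK_GUEST_STOPPED;
		v->pvclock_set_guest_stopped_request = false;
	}
	v->hv_clock_flags = flags;
}

int main(void)
{
	struct toy_vcpu v = { 0 };

	set_guest_paused(&v);
	guest_time_update(&v);
	printf("flags after update: %#x\n", v.hv_clock_flags);
	return 0;
}
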
@@ -1504,7 +1515,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1504{ 1515{
1505 gpa_t gpa = data & ~0x3f; 1516 gpa_t gpa = data & ~0x3f;
1506 1517
1507 /* Bits 2:5 are resrved, Should be zero */ 1518 /* Bits 2:5 are reserved, Should be zero */
1508 if (data & 0x3c) 1519 if (data & 0x3c)
1509 return 1; 1520 return 1;
1510 1521
@@ -1639,10 +1650,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1639 vcpu->arch.time_page = 1650 vcpu->arch.time_page =
1640 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1651 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1641 1652
1642 if (is_error_page(vcpu->arch.time_page)) { 1653 if (is_error_page(vcpu->arch.time_page))
1643 kvm_release_page_clean(vcpu->arch.time_page);
1644 vcpu->arch.time_page = NULL; 1654 vcpu->arch.time_page = NULL;
1645 } 1655
1646 break; 1656 break;
1647 } 1657 }
1648 case MSR_KVM_ASYNC_PF_EN: 1658 case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1737,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1727 * Ignore all writes to this no longer documented MSR. 1737 * Ignore all writes to this no longer documented MSR.
1728 * Writes are only relevant for old K7 processors, 1738 * Writes are only relevant for old K7 processors,
1729 * all pre-dating SVM, but a recommended workaround from 1739 * all pre-dating SVM, but a recommended workaround from
1730 * AMD for these chips. It is possible to speicify the 1740 * AMD for these chips. It is possible to specify the
1731 * affected processor models on the command line, hence 1741 * affected processor models on the command line, hence
1732 * the need to ignore the workaround. 1742 * the need to ignore the workaround.
1733 */ 1743 */
@@ -2177,6 +2187,8 @@ int kvm_dev_ioctl_check_extension(long ext)
2177 case KVM_CAP_GET_TSC_KHZ: 2187 case KVM_CAP_GET_TSC_KHZ:
2178 case KVM_CAP_PCI_2_3: 2188 case KVM_CAP_PCI_2_3:
2179 case KVM_CAP_KVMCLOCK_CTRL: 2189 case KVM_CAP_KVMCLOCK_CTRL:
2190 case KVM_CAP_READONLY_MEM:
2191 case KVM_CAP_IRQFD_RESAMPLE:
2180 r = 1; 2192 r = 1;
2181 break; 2193 break;
2182 case KVM_CAP_COALESCED_MMIO: 2194 case KVM_CAP_COALESCED_MMIO:
@@ -2358,8 +2370,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2358static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2370static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2359 struct kvm_lapic_state *s) 2371 struct kvm_lapic_state *s)
2360{ 2372{
2361 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2373 kvm_apic_post_state_restore(vcpu, s);
2362 kvm_apic_post_state_restore(vcpu);
2363 update_cr8_intercept(vcpu); 2374 update_cr8_intercept(vcpu);
2364 2375
2365 return 0; 2376 return 0;
@@ -2368,7 +2379,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2368static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2379static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2369 struct kvm_interrupt *irq) 2380 struct kvm_interrupt *irq)
2370{ 2381{
2371 if (irq->irq < 0 || irq->irq >= 256) 2382 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
2372 return -EINVAL; 2383 return -EINVAL;
2373 if (irqchip_in_kernel(vcpu->kvm)) 2384 if (irqchip_in_kernel(vcpu->kvm))
2374 return -ENXIO; 2385 return -ENXIO;
@@ -2635,11 +2646,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2635 */ 2646 */
2636static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 2647static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2637{ 2648{
2638 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2639 if (!vcpu->arch.time_page) 2649 if (!vcpu->arch.time_page)
2640 return -EINVAL; 2650 return -EINVAL;
2641 src->flags |= PVCLOCK_GUEST_STOPPED; 2651 vcpu->arch.pvclock_set_guest_stopped_request = true;
2642 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2643 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2652 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2644 return 0; 2653 return 0;
2645} 2654}
@@ -3090,7 +3099,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3090 if (!kvm->arch.vpit) 3099 if (!kvm->arch.vpit)
3091 return -ENXIO; 3100 return -ENXIO;
3092 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3101 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3093 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 3102 kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3094 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3103 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3095 return 0; 3104 return 0;
3096} 3105}
@@ -3173,6 +3182,16 @@ out:
3173 return r; 3182 return r;
3174} 3183}
3175 3184
3185int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
3186{
3187 if (!irqchip_in_kernel(kvm))
3188 return -ENXIO;
3189
3190 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3191 irq_event->irq, irq_event->level);
3192 return 0;
3193}
3194
3176long kvm_arch_vm_ioctl(struct file *filp, 3195long kvm_arch_vm_ioctl(struct file *filp,
3177 unsigned int ioctl, unsigned long arg) 3196 unsigned int ioctl, unsigned long arg)
3178{ 3197{
@@ -3279,29 +3298,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3279 create_pit_unlock: 3298 create_pit_unlock:
3280 mutex_unlock(&kvm->slots_lock); 3299 mutex_unlock(&kvm->slots_lock);
3281 break; 3300 break;
3282 case KVM_IRQ_LINE_STATUS:
3283 case KVM_IRQ_LINE: {
3284 struct kvm_irq_level irq_event;
3285
3286 r = -EFAULT;
3287 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3288 goto out;
3289 r = -ENXIO;
3290 if (irqchip_in_kernel(kvm)) {
3291 __s32 status;
3292 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3293 irq_event.irq, irq_event.level);
3294 if (ioctl == KVM_IRQ_LINE_STATUS) {
3295 r = -EFAULT;
3296 irq_event.status = status;
3297 if (copy_to_user(argp, &irq_event,
3298 sizeof irq_event))
3299 goto out;
3300 }
3301 r = 0;
3302 }
3303 break;
3304 }
3305 case KVM_GET_IRQCHIP: { 3301 case KVM_GET_IRQCHIP: {
3306 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3302 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3307 struct kvm_irqchip *chip; 3303 struct kvm_irqchip *chip;
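The two hunks above move the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS user-copy boilerplate out of the x86 ioctl switch and into the new kvm_vm_ioctl_irq_line(), which generic code can now call; the userspace-visible behaviour is unchanged. For reference, a minimal sketch of how a VMM pulses a GSI through this ioctl (it assumes vm_fd already went through KVM_CREATE_IRQCHIP; error handling is trimmed).

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Pulse a GSI on an in-kernel irqchip: assert, then de-assert. */
    static int pulse_irq(int vm_fd, unsigned int gsi)
    {
        struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

        if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0) {
            perror("KVM_IRQ_LINE (assert)");
            return -1;
        }
        irq.level = 0;
        if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0) {
            perror("KVM_IRQ_LINE (de-assert)");
            return -1;
        }
        return 0;
    }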
@@ -3689,20 +3685,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
3689 gpa_t *gpa, struct x86_exception *exception, 3685 gpa_t *gpa, struct x86_exception *exception,
3690 bool write) 3686 bool write)
3691{ 3687{
3692 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
3689 | (write ? PFERR_WRITE_MASK : 0);
3693 3690
3694 if (vcpu_match_mmio_gva(vcpu, gva) && 3691 if (vcpu_match_mmio_gva(vcpu, gva)
3695 check_write_user_access(vcpu, write, access, 3692 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
3696 vcpu->arch.access)) {
3697 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 3693 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
3698 (gva & (PAGE_SIZE - 1)); 3694 (gva & (PAGE_SIZE - 1));
3699 trace_vcpu_match_mmio(gva, *gpa, write, false); 3695 trace_vcpu_match_mmio(gva, *gpa, write, false);
3700 return 1; 3696 return 1;
3701 } 3697 }
3702 3698
3703 if (write)
3704 access |= PFERR_WRITE_MASK;
3705
3706 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3699 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3707 3700
3708 if (*gpa == UNMAPPED_GVA) 3701 if (*gpa == UNMAPPED_GVA)
@@ -3790,14 +3783,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3790 return X86EMUL_CONTINUE; 3783 return X86EMUL_CONTINUE;
3791} 3784}
3792 3785
3793static struct read_write_emulator_ops read_emultor = { 3786static const struct read_write_emulator_ops read_emultor = {
3794 .read_write_prepare = read_prepare, 3787 .read_write_prepare = read_prepare,
3795 .read_write_emulate = read_emulate, 3788 .read_write_emulate = read_emulate,
3796 .read_write_mmio = vcpu_mmio_read, 3789 .read_write_mmio = vcpu_mmio_read,
3797 .read_write_exit_mmio = read_exit_mmio, 3790 .read_write_exit_mmio = read_exit_mmio,
3798}; 3791};
3799 3792
3800static struct read_write_emulator_ops write_emultor = { 3793static const struct read_write_emulator_ops write_emultor = {
3801 .read_write_emulate = write_emulate, 3794 .read_write_emulate = write_emulate,
3802 .read_write_mmio = write_mmio, 3795 .read_write_mmio = write_mmio,
3803 .read_write_exit_mmio = write_exit_mmio, 3796 .read_write_exit_mmio = write_exit_mmio,
@@ -3808,7 +3801,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3808 unsigned int bytes, 3801 unsigned int bytes,
3809 struct x86_exception *exception, 3802 struct x86_exception *exception,
3810 struct kvm_vcpu *vcpu, 3803 struct kvm_vcpu *vcpu,
3811 struct read_write_emulator_ops *ops) 3804 const struct read_write_emulator_ops *ops)
3812{ 3805{
3813 gpa_t gpa; 3806 gpa_t gpa;
3814 int handled, ret; 3807 int handled, ret;
@@ -3857,7 +3850,7 @@ mmio:
3857int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3850int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3858 void *val, unsigned int bytes, 3851 void *val, unsigned int bytes,
3859 struct x86_exception *exception, 3852 struct x86_exception *exception,
3860 struct read_write_emulator_ops *ops) 3853 const struct read_write_emulator_ops *ops)
3861{ 3854{
3862 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3855 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3863 gpa_t gpa; 3856 gpa_t gpa;
@@ -3962,10 +3955,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
3962 goto emul_write; 3955 goto emul_write;
3963 3956
3964 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3957 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3965 if (is_error_page(page)) { 3958 if (is_error_page(page))
3966 kvm_release_page_clean(page);
3967 goto emul_write; 3959 goto emul_write;
3968 }
3969 3960
3970 kaddr = kmap_atomic(page); 3961 kaddr = kmap_atomic(page);
3971 kaddr += offset_in_page(gpa); 3962 kaddr += offset_in_page(gpa);
@@ -4332,7 +4323,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4332 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 4323 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4333} 4324}
4334 4325
4335static struct x86_emulate_ops emulate_ops = { 4326static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4327{
4328 return kvm_register_read(emul_to_vcpu(ctxt), reg);
4329}
4330
4331static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4332{
4333 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4334}
4335
4336static const struct x86_emulate_ops emulate_ops = {
4337 .read_gpr = emulator_read_gpr,
4338 .write_gpr = emulator_write_gpr,
4336 .read_std = kvm_read_guest_virt_system, 4339 .read_std = kvm_read_guest_virt_system,
4337 .write_std = kvm_write_guest_virt_system, 4340 .write_std = kvm_write_guest_virt_system,
4338 .fetch = kvm_fetch_guest_virt, 4341 .fetch = kvm_fetch_guest_virt,
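With read_gpr/write_gpr in the ops table, the emulator stops touching vcpu->arch.regs directly and instead goes through kvm_register_read()/kvm_register_write(), whose cache fills registers lazily and tracks dirtiness; that is why cache_all_regs() and the memcpy()s of ctxt->regs disappear in the hunks below. A standalone model of such a lazily filled, dirty-tracked GPR cache (the layout here is illustrative, not the kernel's).

    #include <stdint.h>
    #include <stdio.h>

    #define NR_GPRS 16

    struct toy_ctx {
        uint64_t regs[NR_GPRS];
        uint32_t avail;          /* bit n set: regs[n] holds a valid copy     */
        uint32_t dirty;          /* bit n set: regs[n] must be written back   */
        uint64_t hw[NR_GPRS];    /* stand-in for the real (VMCS/VMCB) storage */
    };

    static uint64_t toy_read_gpr(struct toy_ctx *c, unsigned reg)
    {
        if (!(c->avail & (1u << reg))) {     /* first touch: pull from "hardware" */
            c->regs[reg] = c->hw[reg];
            c->avail |= 1u << reg;
        }
        return c->regs[reg];
    }

    static void toy_write_gpr(struct toy_ctx *c, unsigned reg, uint64_t val)
    {
        c->regs[reg] = val;
        c->avail |= 1u << reg;
        c->dirty |= 1u << reg;               /* written back only when needed */
    }

    static void toy_writeback(struct toy_ctx *c)
    {
        for (unsigned r = 0; r < NR_GPRS; r++)
            if (c->dirty & (1u << r))
                c->hw[r] = c->regs[r];
        c->dirty = 0;
    }

    int main(void)
    {
        struct toy_ctx c = { .hw = { [0] = 0x1234 } };

        printf("rax=%#llx\n", (unsigned long long)toy_read_gpr(&c, 0));
        toy_write_gpr(&c, 0, 0xbeef);
        toy_writeback(&c);
        printf("hw rax=%#llx\n", (unsigned long long)c.hw[0]);
        return 0;
    }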
@@ -4367,14 +4370,6 @@ static struct x86_emulate_ops emulate_ops = {
4367 .get_cpuid = emulator_get_cpuid, 4370 .get_cpuid = emulator_get_cpuid,
4368}; 4371};
4369 4372
4370static void cache_all_regs(struct kvm_vcpu *vcpu)
4371{
4372 kvm_register_read(vcpu, VCPU_REGS_RAX);
4373 kvm_register_read(vcpu, VCPU_REGS_RSP);
4374 kvm_register_read(vcpu, VCPU_REGS_RIP);
4375 vcpu->arch.regs_dirty = ~0;
4376}
4377
4378static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4373static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4379{ 4374{
4380 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4375 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@ -4401,12 +4396,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4401 kvm_queue_exception(vcpu, ctxt->exception.vector); 4396 kvm_queue_exception(vcpu, ctxt->exception.vector);
4402} 4397}
4403 4398
4404static void init_decode_cache(struct x86_emulate_ctxt *ctxt, 4399static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4405 const unsigned long *regs)
4406{ 4400{
4407 memset(&ctxt->twobyte, 0, 4401 memset(&ctxt->twobyte, 0,
4408 (void *)&ctxt->regs - (void *)&ctxt->twobyte); 4402 (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
4409 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4410 4403
4411 ctxt->fetch.start = 0; 4404 ctxt->fetch.start = 0;
4412 ctxt->fetch.end = 0; 4405 ctxt->fetch.end = 0;
@@ -4421,14 +4414,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4421 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4414 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4422 int cs_db, cs_l; 4415 int cs_db, cs_l;
4423 4416
4424 /*
4425 * TODO: fix emulate.c to use guest_read/write_register
4426 * instead of direct ->regs accesses, can save hundred cycles
4427 * on Intel for instructions that don't read/change RSP, for
4428 * for example.
4429 */
4430 cache_all_regs(vcpu);
4431
4432 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4417 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4433 4418
4434 ctxt->eflags = kvm_get_rflags(vcpu); 4419 ctxt->eflags = kvm_get_rflags(vcpu);
@@ -4440,7 +4425,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4440 X86EMUL_MODE_PROT16; 4425 X86EMUL_MODE_PROT16;
4441 ctxt->guest_mode = is_guest_mode(vcpu); 4426 ctxt->guest_mode = is_guest_mode(vcpu);
4442 4427
4443 init_decode_cache(ctxt, vcpu->arch.regs); 4428 init_decode_cache(ctxt);
4444 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4429 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4445} 4430}
4446 4431
@@ -4460,7 +4445,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4460 return EMULATE_FAIL; 4445 return EMULATE_FAIL;
4461 4446
4462 ctxt->eip = ctxt->_eip; 4447 ctxt->eip = ctxt->_eip;
4463 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4464 kvm_rip_write(vcpu, ctxt->eip); 4448 kvm_rip_write(vcpu, ctxt->eip);
4465 kvm_set_rflags(vcpu, ctxt->eflags); 4449 kvm_set_rflags(vcpu, ctxt->eflags);
4466 4450
@@ -4493,13 +4477,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4493static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4477static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4494{ 4478{
4495 gpa_t gpa; 4479 gpa_t gpa;
4480 pfn_t pfn;
4496 4481
4497 if (tdp_enabled) 4482 if (tdp_enabled)
4498 return false; 4483 return false;
4499 4484
4500 /* 4485 /*
4501 * if emulation was due to access to shadowed page table 4486 * if emulation was due to access to shadowed page table
4502 * and it failed try to unshadow page and re-entetr the 4487 * and it failed try to unshadow page and re-enter the
4503 * guest to let CPU execute the instruction. 4488 * guest to let CPU execute the instruction.
4504 */ 4489 */
4505 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 4490 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4510,8 +4495,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4510 if (gpa == UNMAPPED_GVA) 4495 if (gpa == UNMAPPED_GVA)
4511 return true; /* let cpu generate fault */ 4496 return true; /* let cpu generate fault */
4512 4497
4513 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) 4498 /*
4499 * Do not retry the unhandleable instruction if it faults on the
4500 * readonly host memory, otherwise it will goto a infinite loop:
4501 * retry instruction -> write #PF -> emulation fail -> retry
4502 * instruction -> ...
4503 */
4504 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4505 if (!is_error_pfn(pfn)) {
4506 kvm_release_pfn_clean(pfn);
4514 return true; 4507 return true;
4508 }
4515 4509
4516 return false; 4510 return false;
4517} 4511}
@@ -4560,6 +4554,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4560 return true; 4554 return true;
4561} 4555}
4562 4556
4557static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4558static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4559
4563int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4560int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4564 unsigned long cr2, 4561 unsigned long cr2,
4565 int emulation_type, 4562 int emulation_type,
@@ -4608,7 +4605,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4608 changes registers values during IO operation */ 4605 changes registers values during IO operation */
4609 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4606 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4610 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4607 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4611 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); 4608 emulator_invalidate_register_cache(ctxt);
4612 } 4609 }
4613 4610
4614restart: 4611restart:
@@ -4630,13 +4627,16 @@ restart:
4630 } else if (vcpu->arch.pio.count) { 4627 } else if (vcpu->arch.pio.count) {
4631 if (!vcpu->arch.pio.in) 4628 if (!vcpu->arch.pio.in)
4632 vcpu->arch.pio.count = 0; 4629 vcpu->arch.pio.count = 0;
4633 else 4630 else {
4634 writeback = false; 4631 writeback = false;
4632 vcpu->arch.complete_userspace_io = complete_emulated_pio;
4633 }
4635 r = EMULATE_DO_MMIO; 4634 r = EMULATE_DO_MMIO;
4636 } else if (vcpu->mmio_needed) { 4635 } else if (vcpu->mmio_needed) {
4637 if (!vcpu->mmio_is_write) 4636 if (!vcpu->mmio_is_write)
4638 writeback = false; 4637 writeback = false;
4639 r = EMULATE_DO_MMIO; 4638 r = EMULATE_DO_MMIO;
4639 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
4640 } else if (r == EMULATION_RESTART) 4640 } else if (r == EMULATION_RESTART)
4641 goto restart; 4641 goto restart;
4642 else 4642 else
@@ -4646,7 +4646,6 @@ restart:
4646 toggle_interruptibility(vcpu, ctxt->interruptibility); 4646 toggle_interruptibility(vcpu, ctxt->interruptibility);
4647 kvm_set_rflags(vcpu, ctxt->eflags); 4647 kvm_set_rflags(vcpu, ctxt->eflags);
4648 kvm_make_request(KVM_REQ_EVENT, vcpu); 4648 kvm_make_request(KVM_REQ_EVENT, vcpu);
4649 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4650 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4649 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4651 kvm_rip_write(vcpu, ctxt->eip); 4650 kvm_rip_write(vcpu, ctxt->eip);
4652 } else 4651 } else
@@ -4929,6 +4928,7 @@ int kvm_arch_init(void *opaque)
4929 if (cpu_has_xsave) 4928 if (cpu_has_xsave)
4930 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4929 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4931 4930
4931 kvm_lapic_init();
4932 return 0; 4932 return 0;
4933 4933
4934out: 4934out:
@@ -5499,6 +5499,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5499 return r; 5499 return r;
5500} 5500}
5501 5501
5502static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
5503{
5504 int r;
5505 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5506 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5507 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5508 if (r != EMULATE_DONE)
5509 return 0;
5510 return 1;
5511}
5512
5513static int complete_emulated_pio(struct kvm_vcpu *vcpu)
5514{
5515 BUG_ON(!vcpu->arch.pio.count);
5516
5517 return complete_emulated_io(vcpu);
5518}
5519
5502/* 5520/*
5503 * Implements the following, as a state machine: 5521 * Implements the following, as a state machine:
5504 * 5522 *
@@ -5515,47 +5533,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5515 * copy data 5533 * copy data
5516 * exit 5534 * exit
5517 */ 5535 */
5518static int complete_mmio(struct kvm_vcpu *vcpu) 5536static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5519{ 5537{
5520 struct kvm_run *run = vcpu->run; 5538 struct kvm_run *run = vcpu->run;
5521 struct kvm_mmio_fragment *frag; 5539 struct kvm_mmio_fragment *frag;
5522 int r;
5523 5540
5524 if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) 5541 BUG_ON(!vcpu->mmio_needed);
5525 return 1;
5526 5542
5527 if (vcpu->mmio_needed) { 5543 /* Complete previous fragment */
5528 /* Complete previous fragment */ 5544 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
5529 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; 5545 if (!vcpu->mmio_is_write)
5530 if (!vcpu->mmio_is_write) 5546 memcpy(frag->data, run->mmio.data, frag->len);
5531 memcpy(frag->data, run->mmio.data, frag->len); 5547 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5532 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 5548 vcpu->mmio_needed = 0;
5533 vcpu->mmio_needed = 0;
5534 if (vcpu->mmio_is_write)
5535 return 1;
5536 vcpu->mmio_read_completed = 1;
5537 goto done;
5538 }
5539 /* Initiate next fragment */
5540 ++frag;
5541 run->exit_reason = KVM_EXIT_MMIO;
5542 run->mmio.phys_addr = frag->gpa;
5543 if (vcpu->mmio_is_write) 5549 if (vcpu->mmio_is_write)
5544 memcpy(run->mmio.data, frag->data, frag->len); 5550 return 1;
5545 run->mmio.len = frag->len; 5551 vcpu->mmio_read_completed = 1;
5546 run->mmio.is_write = vcpu->mmio_is_write; 5552 return complete_emulated_io(vcpu);
5547 return 0; 5553 }
5548 5554 /* Initiate next fragment */
5549 } 5555 ++frag;
5550done: 5556 run->exit_reason = KVM_EXIT_MMIO;
5551 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5557 run->mmio.phys_addr = frag->gpa;
5552 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5558 if (vcpu->mmio_is_write)
5553 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5559 memcpy(run->mmio.data, frag->data, frag->len);
5554 if (r != EMULATE_DONE) 5560 run->mmio.len = frag->len;
5555 return 0; 5561 run->mmio.is_write = vcpu->mmio_is_write;
5556 return 1; 5562 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5563 return 0;
5557} 5564}
5558 5565
5566
5559int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5567int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5560{ 5568{
5561 int r; 5569 int r;
@@ -5582,9 +5590,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5582 } 5590 }
5583 } 5591 }
5584 5592
5585 r = complete_mmio(vcpu); 5593 if (unlikely(vcpu->arch.complete_userspace_io)) {
5586 if (r <= 0) 5594 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
5587 goto out; 5595 vcpu->arch.complete_userspace_io = NULL;
5596 r = cui(vcpu);
5597 if (r <= 0)
5598 goto out;
5599 } else
5600 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
5588 5601
5589 r = __vcpu_run(vcpu); 5602 r = __vcpu_run(vcpu);
5590 5603
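The dispatch above is the heart of the new completion scheme: whichever path bounced out to userspace (PIO or MMIO here) records the function that should finish the job in vcpu->arch.complete_userspace_io, and KVM_RUN consumes it exactly once before re-entering the guest, replacing the old complete_mmio() that had to guess from pio.count/mmio_needed. A compact standalone model of that handoff; all names below are made up for the sketch.

    #include <stdio.h>

    struct toy_vcpu;
    typedef int (*complete_fn)(struct toy_vcpu *);

    struct toy_vcpu {
        complete_fn complete_userspace_io; /* set when an exit needs a second half */
        int pending_data;                  /* stand-in for run->mmio / pio buffers */
    };

    static int finish_mmio_read(struct toy_vcpu *v)
    {
        printf("finishing MMIO read, data=%d\n", v->pending_data);
        return 1; /* 1: keep running the guest, <= 0: bounce back to userspace */
    }

    static int toy_vcpu_run(struct toy_vcpu *v)
    {
        if (v->complete_userspace_io) {
            complete_fn cui = v->complete_userspace_io;

            v->complete_userspace_io = NULL;   /* consumed exactly once */
            if (cui(v) <= 0)
                return 0;
        }
        printf("entering guest\n");
        return 1;
    }

    int main(void)
    {
        struct toy_vcpu v = {
            .complete_userspace_io = finish_mmio_read,
            .pending_data = 42,
        };

        toy_vcpu_run(&v);  /* runs the pending completion, then "enters" the guest */
        toy_vcpu_run(&v);  /* nothing pending this time */
        return 0;
    }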
@@ -5602,12 +5615,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5602 /* 5615 /*
5603 * We are here if userspace calls get_regs() in the middle of 5616 * We are here if userspace calls get_regs() in the middle of
5604 * instruction emulation. Registers state needs to be copied 5617 * instruction emulation. Registers state needs to be copied
5605 * back from emulation context to vcpu. Usrapace shouldn't do 5618 * back from emulation context to vcpu. Userspace shouldn't do
5606 * that usually, but some bad designed PV devices (vmware 5619 * that usually, but some bad designed PV devices (vmware
5607 * backdoor interface) need this to work 5620 * backdoor interface) need this to work
5608 */ 5621 */
5609 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 5622 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
5610 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5611 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5623 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5612 } 5624 }
5613 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5625 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5747,7 +5759,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
5747 if (ret) 5759 if (ret)
5748 return EMULATE_FAIL; 5760 return EMULATE_FAIL;
5749 5761
5750 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5751 kvm_rip_write(vcpu, ctxt->eip); 5762 kvm_rip_write(vcpu, ctxt->eip);
5752 kvm_set_rflags(vcpu, ctxt->eflags); 5763 kvm_set_rflags(vcpu, ctxt->eflags);
5753 kvm_make_request(KVM_REQ_EVENT, vcpu); 5764 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5799,7 +5810,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5799 if (mmu_reset_needed) 5810 if (mmu_reset_needed)
5800 kvm_mmu_reset_context(vcpu); 5811 kvm_mmu_reset_context(vcpu);
5801 5812
5802 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 5813 max_bits = KVM_NR_INTERRUPTS;
5803 pending_vec = find_first_bit( 5814 pending_vec = find_first_bit(
5804 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 5815 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
5805 if (pending_vec < max_bits) { 5816 if (pending_vec < max_bits) {
@@ -5859,13 +5870,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5859 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5870 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5860 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5871 for (i = 0; i < KVM_NR_DB_REGS; ++i)
5861 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5872 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
5862 vcpu->arch.switch_db_regs = 5873 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
5863 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
5864 } else { 5874 } else {
5865 for (i = 0; i < KVM_NR_DB_REGS; i++) 5875 for (i = 0; i < KVM_NR_DB_REGS; i++)
5866 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 5876 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
5867 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5868 } 5877 }
5878 kvm_update_dr7(vcpu);
5869 5879
5870 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5880 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5871 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 5881 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@ -5877,7 +5887,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5877 */ 5887 */
5878 kvm_set_rflags(vcpu, rflags); 5888 kvm_set_rflags(vcpu, rflags);
5879 5889
5880 kvm_x86_ops->set_guest_debug(vcpu, dbg); 5890 kvm_x86_ops->update_db_bp_intercept(vcpu);
5881 5891
5882 r = 0; 5892 r = 0;
5883 5893
@@ -6023,7 +6033,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6023 int r; 6033 int r;
6024 6034
6025 vcpu->arch.mtrr_state.have_fixed = 1; 6035 vcpu->arch.mtrr_state.have_fixed = 1;
6026 vcpu_load(vcpu); 6036 r = vcpu_load(vcpu);
6037 if (r)
6038 return r;
6027 r = kvm_arch_vcpu_reset(vcpu); 6039 r = kvm_arch_vcpu_reset(vcpu);
6028 if (r == 0) 6040 if (r == 0)
6029 r = kvm_mmu_setup(vcpu); 6041 r = kvm_mmu_setup(vcpu);
@@ -6034,9 +6046,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6034 6046
6035void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6047void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6036{ 6048{
6049 int r;
6037 vcpu->arch.apf.msr_val = 0; 6050 vcpu->arch.apf.msr_val = 0;
6038 6051
6039 vcpu_load(vcpu); 6052 r = vcpu_load(vcpu);
6053 BUG_ON(r);
6040 kvm_mmu_unload(vcpu); 6054 kvm_mmu_unload(vcpu);
6041 vcpu_put(vcpu); 6055 vcpu_put(vcpu);
6042 6056
@@ -6050,10 +6064,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6050 vcpu->arch.nmi_pending = 0; 6064 vcpu->arch.nmi_pending = 0;
6051 vcpu->arch.nmi_injected = false; 6065 vcpu->arch.nmi_injected = false;
6052 6066
6053 vcpu->arch.switch_db_regs = 0;
6054 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6067 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6055 vcpu->arch.dr6 = DR6_FIXED_1; 6068 vcpu->arch.dr6 = DR6_FIXED_1;
6056 vcpu->arch.dr7 = DR7_FIXED_1; 6069 vcpu->arch.dr7 = DR7_FIXED_1;
6070 kvm_update_dr7(vcpu);
6057 6071
6058 kvm_make_request(KVM_REQ_EVENT, vcpu); 6072 kvm_make_request(KVM_REQ_EVENT, vcpu);
6059 vcpu->arch.apf.msr_val = 0; 6073 vcpu->arch.apf.msr_val = 0;
@@ -6132,7 +6146,7 @@ int kvm_arch_hardware_enable(void *garbage)
6132 * as we reset last_host_tsc on all VCPUs to stop this from being 6146 * as we reset last_host_tsc on all VCPUs to stop this from being
6133 * called multiple times (one for each physical CPU bringup). 6147 * called multiple times (one for each physical CPU bringup).
6134 * 6148 *
6135 * Platforms with unnreliable TSCs don't have to deal with this, they 6149 * Platforms with unreliable TSCs don't have to deal with this, they
6136 * will be compensated by the logic in vcpu_load, which sets the TSC to 6150 * will be compensated by the logic in vcpu_load, which sets the TSC to
6137 * catchup mode. This will catchup all VCPUs to real time, but cannot 6151 * catchup mode. This will catchup all VCPUs to real time, but cannot
6138 * guarantee that they stay in perfect synchronization. 6152 * guarantee that they stay in perfect synchronization.
@@ -6185,6 +6199,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6185 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 6199 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6186} 6200}
6187 6201
6202struct static_key kvm_no_apic_vcpu __read_mostly;
6203
6188int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6204int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6189{ 6205{
6190 struct page *page; 6206 struct page *page;
@@ -6217,7 +6233,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6217 r = kvm_create_lapic(vcpu); 6233 r = kvm_create_lapic(vcpu);
6218 if (r < 0) 6234 if (r < 0)
6219 goto fail_mmu_destroy; 6235 goto fail_mmu_destroy;
6220 } 6236 } else
6237 static_key_slow_inc(&kvm_no_apic_vcpu);
6221 6238
6222 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 6239 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6223 GFP_KERNEL); 6240 GFP_KERNEL);
@@ -6257,6 +6274,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6257 kvm_mmu_destroy(vcpu); 6274 kvm_mmu_destroy(vcpu);
6258 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6275 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6259 free_page((unsigned long)vcpu->arch.pio_data); 6276 free_page((unsigned long)vcpu->arch.pio_data);
6277 if (!irqchip_in_kernel(vcpu->kvm))
6278 static_key_slow_dec(&kvm_no_apic_vcpu);
6260} 6279}
6261 6280
6262int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 6281int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6269,15 +6288,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6269 6288
6270 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6289 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6271 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6290 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6291 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
6292 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
6293 &kvm->arch.irq_sources_bitmap);
6272 6294
6273 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6295 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6296 mutex_init(&kvm->arch.apic_map_lock);
6274 6297
6275 return 0; 6298 return 0;
6276} 6299}
6277 6300
6278static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6301static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
6279{ 6302{
6280 vcpu_load(vcpu); 6303 int r;
6304 r = vcpu_load(vcpu);
6305 BUG_ON(r);
6281 kvm_mmu_unload(vcpu); 6306 kvm_mmu_unload(vcpu);
6282 vcpu_put(vcpu); 6307 vcpu_put(vcpu);
6283} 6308}
@@ -6321,6 +6346,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
6321 put_page(kvm->arch.apic_access_page); 6346 put_page(kvm->arch.apic_access_page);
6322 if (kvm->arch.ept_identity_pagetable) 6347 if (kvm->arch.ept_identity_pagetable)
6323 put_page(kvm->arch.ept_identity_pagetable); 6348 put_page(kvm->arch.ept_identity_pagetable);
6349 kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
6324} 6350}
6325 6351
6326void kvm_arch_free_memslot(struct kvm_memory_slot *free, 6352void kvm_arch_free_memslot(struct kvm_memory_slot *free,
@@ -6328,10 +6354,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6328{ 6354{
6329 int i; 6355 int i;
6330 6356
6331 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6357 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6332 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6358 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
6333 kvm_kvfree(free->arch.lpage_info[i]); 6359 kvm_kvfree(free->arch.rmap[i]);
6334 free->arch.lpage_info[i] = NULL; 6360 free->arch.rmap[i] = NULL;
6361 }
6362 if (i == 0)
6363 continue;
6364
6365 if (!dont || free->arch.lpage_info[i - 1] !=
6366 dont->arch.lpage_info[i - 1]) {
6367 kvm_kvfree(free->arch.lpage_info[i - 1]);
6368 free->arch.lpage_info[i - 1] = NULL;
6335 } 6369 }
6336 } 6370 }
6337} 6371}
@@ -6340,23 +6374,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6340{ 6374{
6341 int i; 6375 int i;
6342 6376
6343 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6377 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6344 unsigned long ugfn; 6378 unsigned long ugfn;
6345 int lpages; 6379 int lpages;
6346 int level = i + 2; 6380 int level = i + 1;
6347 6381
6348 lpages = gfn_to_index(slot->base_gfn + npages - 1, 6382 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6349 slot->base_gfn, level) + 1; 6383 slot->base_gfn, level) + 1;
6350 6384
6351 slot->arch.lpage_info[i] = 6385 slot->arch.rmap[i] =
6352 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6386 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
6353 if (!slot->arch.lpage_info[i]) 6387 if (!slot->arch.rmap[i])
6388 goto out_free;
6389 if (i == 0)
6390 continue;
6391
6392 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
6393 sizeof(*slot->arch.lpage_info[i - 1]));
6394 if (!slot->arch.lpage_info[i - 1])
6354 goto out_free; 6395 goto out_free;
6355 6396
6356 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 6397 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6357 slot->arch.lpage_info[i][0].write_count = 1; 6398 slot->arch.lpage_info[i - 1][0].write_count = 1;
6358 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 6399 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6359 slot->arch.lpage_info[i][lpages - 1].write_count = 1; 6400 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
6360 ugfn = slot->userspace_addr >> PAGE_SHIFT; 6401 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6361 /* 6402 /*
6362 * If the gfn and userspace address are not aligned wrt each 6403 * If the gfn and userspace address are not aligned wrt each
@@ -6368,16 +6409,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6368 unsigned long j; 6409 unsigned long j;
6369 6410
6370 for (j = 0; j < lpages; ++j) 6411 for (j = 0; j < lpages; ++j)
6371 slot->arch.lpage_info[i][j].write_count = 1; 6412 slot->arch.lpage_info[i - 1][j].write_count = 1;
6372 } 6413 }
6373 } 6414 }
6374 6415
6375 return 0; 6416 return 0;
6376 6417
6377out_free: 6418out_free:
6378 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6419 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6379 kvm_kvfree(slot->arch.lpage_info[i]); 6420 kvm_kvfree(slot->arch.rmap[i]);
6380 slot->arch.lpage_info[i] = NULL; 6421 slot->arch.rmap[i] = NULL;
6422 if (i == 0)
6423 continue;
6424
6425 kvm_kvfree(slot->arch.lpage_info[i - 1]);
6426 slot->arch.lpage_info[i - 1] = NULL;
6381 } 6427 }
6382 return -ENOMEM; 6428 return -ENOMEM;
6383} 6429}
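rmap now lives in the arch memslot (one array per page-size level) while lpage_info keeps only the large-page levels, hence the i/i-1 indexing above. The sizing comes from gfn_to_index(), visible in the kvm_host.h context further down; here is a quick standalone check of that arithmetic for an x86-style 4K/2M/1G slot. The 9-bit-per-level shift is an x86 assumption of this sketch, not something taken from the hunk.

    #include <stdio.h>

    typedef unsigned long long gfn_t;

    /* gfn -> index within a slot's per-level array (level 1 = 4K pages). */
    static long gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
    {
        int shift = (level - 1) * 9;    /* 512 entries collapse into one per level */

        return (long)((gfn >> shift) - (base_gfn >> shift));
    }

    int main(void)
    {
        static const char *name[] = { "4K", "2M", "1G" };
        gfn_t base = 0x100;             /* slot starts at 1 MiB */
        unsigned long npages = 0x40000; /* 1 GiB worth of 4K pages */

        for (int level = 1; level <= 3; level++) {
            long entries = gfn_to_index(base + npages - 1, base, level) + 1;
            printf("level %d (%s pages): %ld entries\n",
                   level, name[level - 1], entries);
        }
        return 0;
    }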
@@ -6396,10 +6442,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
6396 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6442 map_flags = MAP_SHARED | MAP_ANONYMOUS;
6397 6443
6398 /*To keep backward compatibility with older userspace, 6444 /*To keep backward compatibility with older userspace,
6399 *x86 needs to hanlde !user_alloc case. 6445 *x86 needs to handle !user_alloc case.
6400 */ 6446 */
6401 if (!user_alloc) { 6447 if (!user_alloc) {
6402 if (npages && !old.rmap) { 6448 if (npages && !old.npages) {
6403 unsigned long userspace_addr; 6449 unsigned long userspace_addr;
6404 6450
6405 userspace_addr = vm_mmap(NULL, 0, 6451 userspace_addr = vm_mmap(NULL, 0,
@@ -6427,7 +6473,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6427 6473
6428 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6474 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6429 6475
6430 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6476 if (!user_alloc && !old.user_alloc && old.npages && !npages) {
6431 int ret; 6477 int ret;
6432 6478
6433 ret = vm_munmap(old.userspace_addr, 6479 ret = vm_munmap(old.userspace_addr,
@@ -6446,14 +6492,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6446 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6492 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6447 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6493 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6448 spin_unlock(&kvm->mmu_lock); 6494 spin_unlock(&kvm->mmu_lock);
6495 /*
6496 * If memory slot is created, or moved, we need to clear all
6497 * mmio sptes.
6498 */
6499 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
6500 kvm_mmu_zap_all(kvm);
6501 kvm_reload_remote_mmus(kvm);
6502 }
6449} 6503}
6450 6504
6451void kvm_arch_flush_shadow(struct kvm *kvm) 6505void kvm_arch_flush_shadow_all(struct kvm *kvm)
6452{ 6506{
6453 kvm_mmu_zap_all(kvm); 6507 kvm_mmu_zap_all(kvm);
6454 kvm_reload_remote_mmus(kvm); 6508 kvm_reload_remote_mmus(kvm);
6455} 6509}
6456 6510
6511void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
6512 struct kvm_memory_slot *slot)
6513{
6514 kvm_arch_flush_shadow_all(kvm);
6515}
6516
6457int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6517int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6458{ 6518{
6459 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6519 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3d1134ddb885..2b5219c12ac8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
124 124
125extern u64 host_xcr0; 125extern u64 host_xcr0;
126 126
127extern struct static_key kvm_no_apic_vcpu;
127#endif 128#endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa7d3b3..0a6d6ba44c85 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -101,9 +101,13 @@ struct kvm_userspace_memory_region {
101 __u64 userspace_addr; /* start of the userspace allocated memory */ 101 __u64 userspace_addr; /* start of the userspace allocated memory */
102}; 102};
103 103
104/* for kvm_memory_region::flags */ 104/*
105#define KVM_MEM_LOG_DIRTY_PAGES 1UL 105 * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
106#define KVM_MEMSLOT_INVALID (1UL << 1) 106 * other bits are reserved for kvm internal use which are defined in
107 * include/linux/kvm_host.h.
108 */
109#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
110#define KVM_MEM_READONLY (1UL << 1)
107 111
108/* for KVM_IRQ_LINE */ 112/* for KVM_IRQ_LINE */
109struct kvm_irq_level { 113struct kvm_irq_level {
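Bit 1 of the memslot flags is now the userspace-visible KVM_MEM_READONLY (KVM_MEMSLOT_INVALID moves into the kernel-internal range in kvm_host.h below), letting a VMM map ROM-like regions whose guest writes exit with KVM_EXIT_MMIO instead of being committed. A minimal usage sketch, assuming an x86 build that advertises KVM_CAP_READONLY_MEM and that the fds and backing memory already exist.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* kvm_fd is the /dev/kvm fd (for the capability check), vm_fd the VM fd. */
    static int map_rom(int kvm_fd, int vm_fd, int slot, void *host_mem,
                       uint64_t gpa, uint64_t size)
    {
        struct kvm_userspace_memory_region region = {
            .slot            = slot,
            .flags           = KVM_MEM_READONLY,  /* guest writes -> KVM_EXIT_MMIO */
            .guest_phys_addr = gpa,
            .memory_size     = size,
            .userspace_addr  = (uintptr_t)host_mem,
        };

        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM) <= 0) {
            fprintf(stderr, "KVM_CAP_READONLY_MEM not supported\n");
            return -1;
        }
        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
    }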
@@ -618,6 +622,10 @@ struct kvm_ppc_smmu_info {
618#define KVM_CAP_PPC_GET_SMMU_INFO 78 622#define KVM_CAP_PPC_GET_SMMU_INFO 78
619#define KVM_CAP_S390_COW 79 623#define KVM_CAP_S390_COW 79
620#define KVM_CAP_PPC_ALLOC_HTAB 80 624#define KVM_CAP_PPC_ALLOC_HTAB 80
625#ifdef __KVM_HAVE_READONLY_MEM
626#define KVM_CAP_READONLY_MEM 81
627#endif
628#define KVM_CAP_IRQFD_RESAMPLE 82
621 629
622#ifdef KVM_CAP_IRQ_ROUTING 630#ifdef KVM_CAP_IRQ_ROUTING
623 631
@@ -683,12 +691,21 @@ struct kvm_xen_hvm_config {
683#endif 691#endif
684 692
685#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) 693#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
694/*
695 * Available with KVM_CAP_IRQFD_RESAMPLE
696 *
697 * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
698 * the irqfd to operate in resampling mode for level triggered interrupt
699 * emlation. See Documentation/virtual/kvm/api.txt.
700 */
701#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
686 702
687struct kvm_irqfd { 703struct kvm_irqfd {
688 __u32 fd; 704 __u32 fd;
689 __u32 gsi; 705 __u32 gsi;
690 __u32 flags; 706 __u32 flags;
691 __u8 pad[20]; 707 __u32 resamplefd;
708 __u8 pad[16];
692}; 709};
693 710
694struct kvm_clock_data { 711struct kvm_clock_data {
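The resample eventfd is carved out of the old padding, so struct kvm_irqfd keeps its size and existing userspace keeps working. Roughly how a VMM wires a level-triggered line through the new flag; this assumes an in-kernel irqchip and a successful KVM_CAP_IRQFD_RESAMPLE check, with error handling trimmed.

    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    struct level_irq {
        int trigger_fd;   /* write 1 here to assert the line        */
        int resample_fd;  /* readable after the guest EOIs the line */
    };

    static int setup_level_irqfd(int vm_fd, unsigned int gsi, struct level_irq *li)
    {
        struct kvm_irqfd irqfd = { 0 };

        li->trigger_fd  = eventfd(0, 0);
        li->resample_fd = eventfd(0, 0);

        irqfd.fd         = li->trigger_fd;
        irqfd.gsi        = gsi;
        irqfd.flags      = KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = li->resample_fd;

        return ioctl(vm_fd, KVM_IRQFD, &irqfd);
    }

After this, writing to trigger_fd asserts the GSI; when the guest acknowledges the interrupt, KVM de-asserts it and signals resample_fd so the VMM can re-check its device model (see the eventfd.c hunks below).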
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8a59e0abe5fa..93bfc9f9815c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rcupdate.h> 22#include <linux/rcupdate.h>
23#include <linux/ratelimit.h> 23#include <linux/ratelimit.h>
24#include <linux/err.h>
24#include <asm/signal.h> 25#include <asm/signal.h>
25 26
26#include <linux/kvm.h> 27#include <linux/kvm.h>
@@ -35,6 +36,13 @@
35#endif 36#endif
36 37
37/* 38/*
39 * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
40 * in kvm, other bits are visible for userspace which are defined in
41 * include/linux/kvm_h.
42 */
43#define KVM_MEMSLOT_INVALID (1UL << 16)
44
45/*
38 * If we support unaligned MMIO, at most one fragment will be split into two: 46 * If we support unaligned MMIO, at most one fragment will be split into two:
39 */ 47 */
40#ifdef KVM_UNALIGNED_MMIO 48#ifdef KVM_UNALIGNED_MMIO
@@ -49,6 +57,47 @@
49 (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) 57 (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
50 58
51/* 59/*
60 * For the normal pfn, the highest 12 bits should be zero,
61 * so we can mask these bits to indicate the error.
62 */
63#define KVM_PFN_ERR_MASK (0xfffULL << 52)
64
65#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
66#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
67#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2)
68#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)
69
70static inline bool is_error_pfn(pfn_t pfn)
71{
72 return !!(pfn & KVM_PFN_ERR_MASK);
73}
74
75static inline bool is_noslot_pfn(pfn_t pfn)
76{
77 return pfn == KVM_PFN_ERR_BAD;
78}
79
80static inline bool is_invalid_pfn(pfn_t pfn)
81{
82 return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
83}
84
85#define KVM_HVA_ERR_BAD (PAGE_OFFSET)
86#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)
87
88static inline bool kvm_is_error_hva(unsigned long addr)
89{
90 return addr >= PAGE_OFFSET;
91}
92
93#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT))
94
95static inline bool is_error_page(struct page *page)
96{
97 return IS_ERR(page);
98}
99
100/*
52 * vcpu->requests bit members 101 * vcpu->requests bit members
53 */ 102 */
54#define KVM_REQ_TLB_FLUSH 0 103#define KVM_REQ_TLB_FLUSH 0
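The error encoding above lives entirely in the top twelve bits of the pfn, so callers keep passing plain pfn_t values around and only need the three predicates. The same arithmetic as a standalone snippet; pfn_t is typedef'd locally here just so it compiles on its own.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pfn_t;

    #define KVM_PFN_ERR_MASK     (0xfffULL << 52)
    #define KVM_PFN_ERR_FAULT    (KVM_PFN_ERR_MASK)
    #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
    #define KVM_PFN_ERR_BAD      (KVM_PFN_ERR_MASK + 2)
    #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)

    static bool is_error_pfn(pfn_t pfn)   { return pfn & KVM_PFN_ERR_MASK; }
    static bool is_noslot_pfn(pfn_t pfn)  { return pfn == KVM_PFN_ERR_BAD; }
    static bool is_invalid_pfn(pfn_t pfn) { return !is_noslot_pfn(pfn) && is_error_pfn(pfn); }

    int main(void)
    {
        pfn_t normal = 0x12345;  /* any pfn with the top 12 bits clear */

        printf("normal:   error=%d\n", is_error_pfn(normal));
        printf("no slot:  error=%d invalid=%d\n",
               is_error_pfn(KVM_PFN_ERR_BAD), is_invalid_pfn(KVM_PFN_ERR_BAD));
        printf("hwpoison: error=%d invalid=%d\n",
               is_error_pfn(KVM_PFN_ERR_HWPOISON), is_invalid_pfn(KVM_PFN_ERR_HWPOISON));
        return 0;
    }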
@@ -70,7 +119,8 @@
70#define KVM_REQ_PMU 16 119#define KVM_REQ_PMU 16
71#define KVM_REQ_PMI 17 120#define KVM_REQ_PMI 17
72 121
73#define KVM_USERSPACE_IRQ_SOURCE_ID 0 122#define KVM_USERSPACE_IRQ_SOURCE_ID 0
123#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
74 124
75struct kvm; 125struct kvm;
76struct kvm_vcpu; 126struct kvm_vcpu;
@@ -183,6 +233,18 @@ struct kvm_vcpu {
183 } async_pf; 233 } async_pf;
184#endif 234#endif
185 235
236#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
237 /*
238 * Cpu relax intercept or pause loop exit optimization
239 * in_spin_loop: set when a vcpu does a pause loop exit
240 * or cpu relax intercepted.
241 * dy_eligible: indicates whether vcpu is eligible for directed yield.
242 */
243 struct {
244 bool in_spin_loop;
245 bool dy_eligible;
246 } spin_loop;
247#endif
186 struct kvm_vcpu_arch arch; 248 struct kvm_vcpu_arch arch;
187}; 249};
188 250
@@ -201,7 +263,6 @@ struct kvm_memory_slot {
201 gfn_t base_gfn; 263 gfn_t base_gfn;
202 unsigned long npages; 264 unsigned long npages;
203 unsigned long flags; 265 unsigned long flags;
204 unsigned long *rmap;
205 unsigned long *dirty_bitmap; 266 unsigned long *dirty_bitmap;
206 struct kvm_arch_memory_slot arch; 267 struct kvm_arch_memory_slot arch;
207 unsigned long userspace_addr; 268 unsigned long userspace_addr;
@@ -283,6 +344,8 @@ struct kvm {
283 struct { 344 struct {
284 spinlock_t lock; 345 spinlock_t lock;
285 struct list_head items; 346 struct list_head items;
347 struct list_head resampler_list;
348 struct mutex resampler_lock;
286 } irqfds; 349 } irqfds;
287 struct list_head ioeventfds; 350 struct list_head ioeventfds;
288#endif 351#endif
@@ -348,7 +411,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
348int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); 411int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
349void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); 412void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
350 413
351void vcpu_load(struct kvm_vcpu *vcpu); 414int __must_check vcpu_load(struct kvm_vcpu *vcpu);
352void vcpu_put(struct kvm_vcpu *vcpu); 415void vcpu_put(struct kvm_vcpu *vcpu);
353 416
354int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 417int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
@@ -378,23 +441,6 @@ id_to_memslot(struct kvm_memslots *slots, int id)
378 return slot; 441 return slot;
379} 442}
380 443
381#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
382#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
383static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
384
385extern struct page *bad_page;
386extern struct page *fault_page;
387
388extern pfn_t bad_pfn;
389extern pfn_t fault_pfn;
390
391int is_error_page(struct page *page);
392int is_error_pfn(pfn_t pfn);
393int is_hwpoison_pfn(pfn_t pfn);
394int is_fault_pfn(pfn_t pfn);
395int is_noslot_pfn(pfn_t pfn);
396int is_invalid_pfn(pfn_t pfn);
397int kvm_is_error_hva(unsigned long addr);
398int kvm_set_memory_region(struct kvm *kvm, 444int kvm_set_memory_region(struct kvm *kvm,
399 struct kvm_userspace_memory_region *mem, 445 struct kvm_userspace_memory_region *mem,
400 int user_alloc); 446 int user_alloc);
@@ -415,28 +461,33 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
415 int user_alloc); 461 int user_alloc);
416bool kvm_largepages_enabled(void); 462bool kvm_largepages_enabled(void);
417void kvm_disable_largepages(void); 463void kvm_disable_largepages(void);
418void kvm_arch_flush_shadow(struct kvm *kvm); 464/* flush all memory translations */
465void kvm_arch_flush_shadow_all(struct kvm *kvm);
466/* flush memory translations pointing to 'slot' */
467void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
468 struct kvm_memory_slot *slot);
419 469
420int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 470int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
421 int nr_pages); 471 int nr_pages);
422 472
423struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 473struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
424unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); 474unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
475unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
425void kvm_release_page_clean(struct page *page); 476void kvm_release_page_clean(struct page *page);
426void kvm_release_page_dirty(struct page *page); 477void kvm_release_page_dirty(struct page *page);
427void kvm_set_page_dirty(struct page *page); 478void kvm_set_page_dirty(struct page *page);
428void kvm_set_page_accessed(struct page *page); 479void kvm_set_page_accessed(struct page *page);
429 480
430pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
431pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 481pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
432pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 482pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
433 bool write_fault, bool *writable); 483 bool write_fault, bool *writable);
434pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 484pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
435pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 485pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
436 bool *writable); 486 bool *writable);
437pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 487pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
438 struct kvm_memory_slot *slot, gfn_t gfn); 488pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
439void kvm_release_pfn_dirty(pfn_t); 489
490void kvm_release_pfn_dirty(pfn_t pfn);
440void kvm_release_pfn_clean(pfn_t pfn); 491void kvm_release_pfn_clean(pfn_t pfn);
441void kvm_set_pfn_dirty(pfn_t pfn); 492void kvm_set_pfn_dirty(pfn_t pfn);
442void kvm_set_pfn_accessed(pfn_t pfn); 493void kvm_set_pfn_accessed(pfn_t pfn);
@@ -494,6 +545,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
494 struct 545 struct
495 kvm_userspace_memory_region *mem, 546 kvm_userspace_memory_region *mem,
496 int user_alloc); 547 int user_alloc);
548int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
497long kvm_arch_vm_ioctl(struct file *filp, 549long kvm_arch_vm_ioctl(struct file *filp,
498 unsigned int ioctl, unsigned long arg); 550 unsigned int ioctl, unsigned long arg);
499 551
@@ -573,7 +625,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
573int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 625int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
574void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 626void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
575 627
576int kvm_is_mmio_pfn(pfn_t pfn); 628bool kvm_is_mmio_pfn(pfn_t pfn);
577 629
578struct kvm_irq_ack_notifier { 630struct kvm_irq_ack_notifier {
579 struct hlist_node link; 631 struct hlist_node link;
@@ -728,6 +780,12 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
728 return search_memslots(slots, gfn); 780 return search_memslots(slots, gfn);
729} 781}
730 782
783static inline unsigned long
784__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
785{
786 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
787}
788
731static inline int memslot_id(struct kvm *kvm, gfn_t gfn) 789static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
732{ 790{
733 return gfn_to_memslot(kvm, gfn)->id; 791 return gfn_to_memslot(kvm, gfn)->id;
@@ -740,10 +798,12 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
740 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 798 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
741} 799}
742 800
743static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 801static inline gfn_t
744 gfn_t gfn) 802hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
745{ 803{
746 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; 804 gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
805
806 return slot->base_gfn + gfn_offset;
747} 807}
748 808
749static inline gpa_t gfn_to_gpa(gfn_t gfn) 809static inline gpa_t gfn_to_gpa(gfn_t gfn)
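__gfn_to_hva_memslot() (added earlier in this file) and the new hva_to_gfn_memslot() are inverses over a slot's range, which is what the callers doing hva-based lookups rely on. A one-line sanity check of that arithmetic, with PAGE_SHIFT hard-coded to the usual 4K for this sketch.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    struct toy_slot {
        unsigned long long base_gfn;
        unsigned long userspace_addr;
    };

    static unsigned long gfn_to_hva(struct toy_slot *s, unsigned long long gfn)
    {
        return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
    }

    static unsigned long long hva_to_gfn(struct toy_slot *s, unsigned long hva)
    {
        return s->base_gfn + ((hva - s->userspace_addr) >> PAGE_SHIFT);
    }

    int main(void)
    {
        struct toy_slot s = { .base_gfn = 0x100, .userspace_addr = 0x7f0000000000UL };
        unsigned long long gfn = 0x123;

        assert(hva_to_gfn(&s, gfn_to_hva(&s, gfn)) == gfn);
        printf("gfn %#llx <-> hva %#lx\n", gfn, gfn_to_hva(&s, gfn));
        return 0;
    }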
@@ -899,5 +959,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
899 } 959 }
900} 960}
901 961
962#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
963
964static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
965{
966 vcpu->spin_loop.in_spin_loop = val;
967}
968static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
969{
970 vcpu->spin_loop.dy_eligible = val;
971}
972
973#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
974
975static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
976{
977}
978
979static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
980{
981}
982
983static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
984{
985 return true;
986}
987
988#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
902#endif 989#endif
903 990
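The spin_loop flags added above feed the pause-loop-exit heuristics; presumably the kvm_main.c side of this patch (further down) marks a vcpu in_spin_loop while it hunts for a yield target and uses dy_eligible to avoid repeatedly yielding to another spinner. A toy version of such a filter, to show the intent only; this is not the kernel's exact policy.

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_vcpu {
        bool in_spin_loop;  /* this vcpu is itself doing a PLE-style spin           */
        bool dy_eligible;   /* it may still be picked as a yield target this round  */
    };

    /*
     * Skip yield candidates that are spinning themselves, unless they have been
     * marked eligible; flip the mark so eligibility alternates between rounds.
     */
    static bool eligible_for_directed_yield(struct toy_vcpu *v)
    {
        bool eligible = !v->in_spin_loop || v->dy_eligible;

        if (v->in_spin_loop)
            v->dy_eligible = !v->dy_eligible;
        return eligible;
    }

    int main(void)
    {
        struct toy_vcpu spinner = { .in_spin_loop = true };

        for (int i = 0; i < 3; i++)
            printf("attempt %d: eligible=%d\n", i,
                   eligible_for_directed_yield(&spinner));
        return 0;
    }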
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5ec..60f48fa0fd0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
118 key->timeout = rl; 118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120} 120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
121 122
122static int addr_conflict(struct jump_entry *entry, void *start, void *end) 123static int addr_conflict(struct jump_entry *entry, void *start, void *end)
123{ 124{
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a9139..d01b24b72c61 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
21 21
22config HAVE_KVM_MSI 22config HAVE_KVM_MSI
23 bool 23 bool
24
25config HAVE_KVM_CPU_RELAX_INTERCEPT
26 bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee1..ea475cd03511 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
111 list_entry(vcpu->async_pf.done.next, 111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link); 112 typeof(*work), link);
113 list_del(&work->link); 113 list_del(&work->link);
114 if (work->page) 114 if (!is_error_page(work->page))
115 put_page(work->page); 115 kvm_release_page_clean(work->page);
116 kmem_cache_free(async_pf_cache, work); 116 kmem_cache_free(async_pf_cache, work);
117 } 117 }
118 spin_unlock(&vcpu->async_pf.lock); 118 spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
138 138
139 list_del(&work->queue); 139 list_del(&work->queue);
140 vcpu->async_pf.queued--; 140 vcpu->async_pf.queued--;
141 if (work->page) 141 if (!is_error_page(work->page))
142 put_page(work->page); 142 kvm_release_page_clean(work->page);
143 kmem_cache_free(async_pf_cache, work); 143 kmem_cache_free(async_pf_cache, work);
144 } 144 }
145} 145}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
203 if (!work) 203 if (!work)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 work->page = bad_page; 206 work->page = KVM_ERR_PTR_BAD_PAGE;
207 get_page(bad_page);
208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */ 207 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
209 208
210 spin_lock(&vcpu->async_pf.lock); 209 spin_lock(&vcpu->async_pf.lock);
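work->page now carries the ERR_PTR-style sentinel KVM_ERR_PTR_BAD_PAGE (defined in the kvm_host.h hunk above) instead of a refcounted global bad_page, so the cleanup paths simply test is_error_page() and skip the release. A tiny userspace illustration of why an ERR_PTR-encoded sentinel needs no get/put pair; the helpers below mimic the kernel's, they are not the kernel's.

    #include <stdio.h>

    #define MAX_ERRNO 4095
    #define TOY_ENOENT 2

    static void *toy_err_ptr(long error)
    {
        return (void *)error;
    }

    static int toy_is_err(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
        /* models KVM_ERR_PTR_BAD_PAGE = ERR_PTR(-ENOENT) */
        void *page = toy_err_ptr(-TOY_ENOENT);

        if (toy_is_err(page))
            printf("error page: no refcount was taken, nothing to release\n");
        return 0;
    }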
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 67a35e90384c..9718e98d6d2a 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,6 +43,31 @@
43 * -------------------------------------------------------------------- 43 * --------------------------------------------------------------------
44 */ 44 */
45 45
46/*
47 * Resampling irqfds are a special variety of irqfds used to emulate
48 * level triggered interrupts. The interrupt is asserted on eventfd
49 * trigger. On acknowledgement through the irq ack notifier, the
50 * interrupt is de-asserted and userspace is notified through the
51 * resamplefd. All resamplers on the same gsi are de-asserted
52 * together, so we don't need to track the state of each individual
53 * user. We can also therefore share the same irq source ID.
54 */
55struct _irqfd_resampler {
56 struct kvm *kvm;
57 /*
58 * List of resampling struct _irqfd objects sharing this gsi.
59 * RCU list modified under kvm->irqfds.resampler_lock
60 */
61 struct list_head list;
62 struct kvm_irq_ack_notifier notifier;
63 /*
64 * Entry in list of kvm->irqfd.resampler_list. Use for sharing
65 * resamplers among irqfds on the same gsi.
66 * Accessed and modified under kvm->irqfds.resampler_lock
67 */
68 struct list_head link;
69};
70
46struct _irqfd { 71struct _irqfd {
47 /* Used for MSI fast-path */ 72 /* Used for MSI fast-path */
48 struct kvm *kvm; 73 struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
52 /* Used for level IRQ fast-path */ 77 /* Used for level IRQ fast-path */
53 int gsi; 78 int gsi;
54 struct work_struct inject; 79 struct work_struct inject;
80 /* The resampler used by this irqfd (resampler-only) */
81 struct _irqfd_resampler *resampler;
82 /* Eventfd notified on resample (resampler-only) */
83 struct eventfd_ctx *resamplefd;
84 /* Entry in list of irqfds for a resampler (resampler-only) */
85 struct list_head resampler_link;
55 /* Used for setup/shutdown */ 86 /* Used for setup/shutdown */
56 struct eventfd_ctx *eventfd; 87 struct eventfd_ctx *eventfd;
57 struct list_head list; 88 struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
67 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 98 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
68 struct kvm *kvm = irqfd->kvm; 99 struct kvm *kvm = irqfd->kvm;
69 100
70 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 101 if (!irqfd->resampler) {
71 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); 102 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
103 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
104 } else
105 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
106 irqfd->gsi, 1);
107}
108
109/*
110 * Since resampler irqfds share an IRQ source ID, we de-assert once
111 * then notify all of the resampler irqfds using this GSI. We can't
112 * do multiple de-asserts or we risk racing with incoming re-asserts.
113 */
114static void
115irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
116{
117 struct _irqfd_resampler *resampler;
118 struct _irqfd *irqfd;
119
120 resampler = container_of(kian, struct _irqfd_resampler, notifier);
121
122 kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
123 resampler->notifier.gsi, 0);
124
125 rcu_read_lock();
126
127 list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
128 eventfd_signal(irqfd->resamplefd, 1);
129
130 rcu_read_unlock();
131}
132
133static void
134irqfd_resampler_shutdown(struct _irqfd *irqfd)
135{
136 struct _irqfd_resampler *resampler = irqfd->resampler;
137 struct kvm *kvm = resampler->kvm;
138
139 mutex_lock(&kvm->irqfds.resampler_lock);
140
141 list_del_rcu(&irqfd->resampler_link);
142 synchronize_rcu();
143
144 if (list_empty(&resampler->list)) {
145 list_del(&resampler->link);
146 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
147 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
148 resampler->notifier.gsi, 0);
149 kfree(resampler);
150 }
151
152 mutex_unlock(&kvm->irqfds.resampler_lock);
72} 153}
73 154
74/* 155/*
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work)
92 */ 173 */
93 flush_work(&irqfd->inject); 174 flush_work(&irqfd->inject);
94 175
176 if (irqfd->resampler) {
177 irqfd_resampler_shutdown(irqfd);
178 eventfd_ctx_put(irqfd->resamplefd);
179 }
180
95 /* 181 /*
96 * It is now safe to release the object's resources 182 * It is now safe to release the object's resources
97 */ 183 */
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
203 struct kvm_irq_routing_table *irq_rt; 289 struct kvm_irq_routing_table *irq_rt;
204 struct _irqfd *irqfd, *tmp; 290 struct _irqfd *irqfd, *tmp;
205 struct file *file = NULL; 291 struct file *file = NULL;
206 struct eventfd_ctx *eventfd = NULL; 292 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
207 int ret; 293 int ret;
208 unsigned int events; 294 unsigned int events;
209 295
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
231 317
232 irqfd->eventfd = eventfd; 318 irqfd->eventfd = eventfd;
233 319
320 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
321 struct _irqfd_resampler *resampler;
322
323 resamplefd = eventfd_ctx_fdget(args->resamplefd);
324 if (IS_ERR(resamplefd)) {
325 ret = PTR_ERR(resamplefd);
326 goto fail;
327 }
328
329 irqfd->resamplefd = resamplefd;
330 INIT_LIST_HEAD(&irqfd->resampler_link);
331
332 mutex_lock(&kvm->irqfds.resampler_lock);
333
334 list_for_each_entry(resampler,
335 &kvm->irqfds.resampler_list, list) {
336 if (resampler->notifier.gsi == irqfd->gsi) {
337 irqfd->resampler = resampler;
338 break;
339 }
340 }
341
342 if (!irqfd->resampler) {
343 resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
344 if (!resampler) {
345 ret = -ENOMEM;
346 mutex_unlock(&kvm->irqfds.resampler_lock);
347 goto fail;
348 }
349
350 resampler->kvm = kvm;
351 INIT_LIST_HEAD(&resampler->list);
352 resampler->notifier.gsi = irqfd->gsi;
353 resampler->notifier.irq_acked = irqfd_resampler_ack;
354 INIT_LIST_HEAD(&resampler->link);
355
356 list_add(&resampler->link, &kvm->irqfds.resampler_list);
357 kvm_register_irq_ack_notifier(kvm,
358 &resampler->notifier);
359 irqfd->resampler = resampler;
360 }
361
362 list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
363 synchronize_rcu();
364
365 mutex_unlock(&kvm->irqfds.resampler_lock);
366 }
367
234 /* 368 /*
235 * Install our own custom wake-up handling so we are notified via 369 * Install our own custom wake-up handling so we are notified via
236 * a callback whenever someone signals the underlying eventfd 370 * a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
276 return 0; 410 return 0;
277 411
278fail: 412fail:
413 if (irqfd->resampler)
414 irqfd_resampler_shutdown(irqfd);
415
416 if (resamplefd && !IS_ERR(resamplefd))
417 eventfd_ctx_put(resamplefd);
418
279 if (eventfd && !IS_ERR(eventfd)) 419 if (eventfd && !IS_ERR(eventfd))
280 eventfd_ctx_put(eventfd); 420 eventfd_ctx_put(eventfd);
281 421
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
291{ 431{
292 spin_lock_init(&kvm->irqfds.lock); 432 spin_lock_init(&kvm->irqfds.lock);
293 INIT_LIST_HEAD(&kvm->irqfds.items); 433 INIT_LIST_HEAD(&kvm->irqfds.items);
434 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
435 mutex_init(&kvm->irqfds.resampler_lock);
294 INIT_LIST_HEAD(&kvm->ioeventfds); 436 INIT_LIST_HEAD(&kvm->ioeventfds);
295} 437}
296 438
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
340int 482int
341kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 483kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
342{ 484{
343 if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN) 485 if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
344 return -EINVAL; 486 return -EINVAL;
345 487
346 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) 488 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
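For reference, the resampling path above is driven from userspace through the existing KVM_IRQFD ioctl. Below is a minimal sketch, assuming the resamplefd member added to struct kvm_irqfd elsewhere in this series and two eventfds already created by the caller; the helper name and error handling are illustrative only.

/*
 * Sketch: wire a level-triggered interrupt on 'gsi' as a resampling
 * irqfd.  Signalling 'irq_efd' asserts the line; KVM signals
 * 'resample_efd' on guest EOI so the caller can re-check the device
 * and re-assert if the line is still high.
 */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int assign_resample_irqfd(int vmfd, int irq_efd,
				 int resample_efd, unsigned int gsi)
{
	struct kvm_irqfd irqfd;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = irq_efd;
	irqfd.gsi = gsi;
	irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
	irqfd.resamplefd = resample_efd;

	return ioctl(vmfd, KVM_IRQFD, &irqfd);
}

Deassignment still goes through KVM_IRQFD_FLAG_DEASSIGN with the same eventfd/gsi pair, which is why kvm_irqfd() above only has to whitelist the new flag.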
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ef61d529a6c4..cfb7e4d52dc2 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
197 u32 old_irr; 197 u32 old_irr;
198 u32 mask = 1 << irq; 198 u32 mask = 1 << irq;
199 union kvm_ioapic_redirect_entry entry; 199 union kvm_ioapic_redirect_entry entry;
200 int ret = 1; 200 int ret, irq_level;
201
202 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
201 203
202 spin_lock(&ioapic->lock); 204 spin_lock(&ioapic->lock);
203 old_irr = ioapic->irr; 205 old_irr = ioapic->irr;
204 if (irq >= 0 && irq < IOAPIC_NUM_PINS) { 206 irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
205 int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], 207 irq_source_id, level);
206 irq_source_id, level); 208 entry = ioapic->redirtbl[irq];
207 entry = ioapic->redirtbl[irq]; 209 irq_level ^= entry.fields.polarity;
208 irq_level ^= entry.fields.polarity; 210 if (!irq_level) {
209 if (!irq_level) 211 ioapic->irr &= ~mask;
210 ioapic->irr &= ~mask; 212 ret = 1;
211 else { 213 } else {
212 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); 214 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
213 ioapic->irr |= mask; 215 ioapic->irr |= mask;
214 if ((edge && old_irr != ioapic->irr) || 216 if ((edge && old_irr != ioapic->irr) ||
215 (!edge && !entry.fields.remote_irr)) 217 (!edge && !entry.fields.remote_irr))
216 ret = ioapic_service(ioapic, irq); 218 ret = ioapic_service(ioapic, irq);
217 else 219 else
218 ret = 0; /* report coalesced interrupt */ 220 ret = 0; /* report coalesced interrupt */
219 }
220 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
221 } 221 }
222 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
222 spin_unlock(&ioapic->lock); 223 spin_unlock(&ioapic->lock);
223 224
224 return ret; 225 return ret;
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf0..037cb6730e68 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
42static void kvm_iommu_put_pages(struct kvm *kvm, 42static void kvm_iommu_put_pages(struct kvm *kvm,
43 gfn_t base_gfn, unsigned long npages); 43 gfn_t base_gfn, unsigned long npages);
44 44
45static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, 45static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
46 gfn_t gfn, unsigned long size) 46 unsigned long size)
47{ 47{
48 gfn_t end_gfn; 48 gfn_t end_gfn;
49 pfn_t pfn; 49 pfn_t pfn;
50 50
51 pfn = gfn_to_pfn_memslot(kvm, slot, gfn); 51 pfn = gfn_to_pfn_memslot(slot, gfn);
52 end_gfn = gfn + (size >> PAGE_SHIFT); 52 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1; 53 gfn += 1;
54 54
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
56 return pfn; 56 return pfn;
57 57
58 while (gfn < end_gfn) 58 while (gfn < end_gfn)
59 gfn_to_pfn_memslot(kvm, slot, gfn++); 59 gfn_to_pfn_memslot(slot, gfn++);
60 60
61 return pfn; 61 return pfn;
62} 62}
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
105 * Pin all pages we are about to map in memory. This is 105 * Pin all pages we are about to map in memory. This is
106 * important because we unmap and unpin in 4kb steps later. 106 * important because we unmap and unpin in 4kb steps later.
107 */ 107 */
108 pfn = kvm_pin_pages(kvm, slot, gfn, page_size); 108 pfn = kvm_pin_pages(slot, gfn, page_size);
109 if (is_error_pfn(pfn)) { 109 if (is_error_pfn(pfn)) {
110 gfn += 1; 110 gfn += 1;
111 continue; 111 continue;
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
300 300
301 /* Get physical address */ 301 /* Get physical address */
302 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); 302 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
303
304 if (!phys) {
305 gfn++;
306 continue;
307 }
308
303 pfn = phys >> PAGE_SHIFT; 309 pfn = phys >> PAGE_SHIFT;
304 310
305 /* Unmap address from IO address space */ 311 /* Unmap address from IO address space */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 83402d74a767..2eb58af7ee99 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
68 struct kvm_vcpu *vcpu, *lowest = NULL; 68 struct kvm_vcpu *vcpu, *lowest = NULL;
69 69
70 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 70 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
71 kvm_is_dm_lowest_prio(irq)) 71 kvm_is_dm_lowest_prio(irq)) {
72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
73 irq->delivery_mode = APIC_DM_FIXED;
74 }
75
76 if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
77 return r;
73 78
74 kvm_for_each_vcpu(i, vcpu, kvm) { 79 kvm_for_each_vcpu(i, vcpu, kvm) {
75 if (!kvm_apic_present(vcpu)) 80 if (!kvm_apic_present(vcpu))
@@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
223 } 228 }
224 229
225 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 230 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
231#ifdef CONFIG_X86
232 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
233#endif
226 set_bit(irq_source_id, bitmap); 234 set_bit(irq_source_id, bitmap);
227unlock: 235unlock:
228 mutex_unlock(&kvm->irq_lock); 236 mutex_unlock(&kvm->irq_lock);
@@ -233,6 +241,9 @@ unlock:
233void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) 241void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
234{ 242{
235 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 243 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
244#ifdef CONFIG_X86
245 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
246#endif
236 247
237 mutex_lock(&kvm->irq_lock); 248 mutex_lock(&kvm->irq_lock);
238 if (irq_source_id < 0 || 249 if (irq_source_id < 0 ||
@@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
321 switch (ue->u.irqchip.irqchip) { 332 switch (ue->u.irqchip.irqchip) {
322 case KVM_IRQCHIP_PIC_MASTER: 333 case KVM_IRQCHIP_PIC_MASTER:
323 e->set = kvm_set_pic_irq; 334 e->set = kvm_set_pic_irq;
324 max_pin = 16; 335 max_pin = PIC_NUM_PINS;
325 break; 336 break;
326 case KVM_IRQCHIP_PIC_SLAVE: 337 case KVM_IRQCHIP_PIC_SLAVE:
327 e->set = kvm_set_pic_irq; 338 e->set = kvm_set_pic_irq;
328 max_pin = 16; 339 max_pin = PIC_NUM_PINS;
329 delta = 8; 340 delta = 8;
330 break; 341 break;
331 case KVM_IRQCHIP_IOAPIC: 342 case KVM_IRQCHIP_IOAPIC:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d617f69131d7..c353b4599cec 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
100 100
101static bool largepages_enabled = true; 101static bool largepages_enabled = true;
102 102
103static struct page *hwpoison_page; 103bool kvm_is_mmio_pfn(pfn_t pfn)
104static pfn_t hwpoison_pfn;
105
106struct page *fault_page;
107pfn_t fault_pfn;
108
109inline int kvm_is_mmio_pfn(pfn_t pfn)
110{ 104{
111 if (pfn_valid(pfn)) { 105 if (pfn_valid(pfn)) {
112 int reserved; 106 int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
137/* 131/*
138 * Switches to specified vcpu, until a matching vcpu_put() 132 * Switches to specified vcpu, until a matching vcpu_put()
139 */ 133 */
140void vcpu_load(struct kvm_vcpu *vcpu) 134int vcpu_load(struct kvm_vcpu *vcpu)
141{ 135{
142 int cpu; 136 int cpu;
143 137
144 mutex_lock(&vcpu->mutex); 138 if (mutex_lock_killable(&vcpu->mutex))
139 return -EINTR;
145 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
146 /* The thread running this VCPU changed. */ 141 /* The thread running this VCPU changed. */
147 struct pid *oldpid = vcpu->pid; 142 struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
154 preempt_notifier_register(&vcpu->preempt_notifier); 149 preempt_notifier_register(&vcpu->preempt_notifier);
155 kvm_arch_vcpu_load(vcpu, cpu); 150 kvm_arch_vcpu_load(vcpu, cpu);
156 put_cpu(); 151 put_cpu();
152 return 0;
157} 153}
158 154
159void vcpu_put(struct kvm_vcpu *vcpu) 155void vcpu_put(struct kvm_vcpu *vcpu)
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
236 } 232 }
237 vcpu->run = page_address(page); 233 vcpu->run = page_address(page);
238 234
235 kvm_vcpu_set_in_spin_loop(vcpu, false);
236 kvm_vcpu_set_dy_eligible(vcpu, false);
237
239 r = kvm_arch_vcpu_init(vcpu); 238 r = kvm_arch_vcpu_init(vcpu);
240 if (r < 0) 239 if (r < 0)
241 goto fail_free_run; 240 goto fail_free_run;
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
332 * count is also read inside the mmu_lock critical section. 331 * count is also read inside the mmu_lock critical section.
333 */ 332 */
334 kvm->mmu_notifier_count++; 333 kvm->mmu_notifier_count++;
335 for (; start < end; start += PAGE_SIZE) 334 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
336 need_tlb_flush |= kvm_unmap_hva(kvm, start);
337 need_tlb_flush |= kvm->tlbs_dirty; 335 need_tlb_flush |= kvm->tlbs_dirty;
338 /* we've to flush the tlb before the pages can be freed */ 336 /* we've to flush the tlb before the pages can be freed */
339 if (need_tlb_flush) 337 if (need_tlb_flush)
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
412 int idx; 410 int idx;
413 411
414 idx = srcu_read_lock(&kvm->srcu); 412 idx = srcu_read_lock(&kvm->srcu);
415 kvm_arch_flush_shadow(kvm); 413 kvm_arch_flush_shadow_all(kvm);
416 srcu_read_unlock(&kvm->srcu, idx); 414 srcu_read_unlock(&kvm->srcu, idx);
417} 415}
418 416
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
551static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 549static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
552 struct kvm_memory_slot *dont) 550 struct kvm_memory_slot *dont)
553{ 551{
554 if (!dont || free->rmap != dont->rmap)
555 vfree(free->rmap);
556
557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 552 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
558 kvm_destroy_dirty_bitmap(free); 553 kvm_destroy_dirty_bitmap(free);
559 554
560 kvm_arch_free_memslot(free, dont); 555 kvm_arch_free_memslot(free, dont);
561 556
562 free->npages = 0; 557 free->npages = 0;
563 free->rmap = NULL;
564} 558}
565 559
566void kvm_free_physmem(struct kvm *kvm) 560void kvm_free_physmem(struct kvm *kvm)
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
590#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 584#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
591 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 585 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
592#else 586#else
593 kvm_arch_flush_shadow(kvm); 587 kvm_arch_flush_shadow_all(kvm);
594#endif 588#endif
595 kvm_arch_destroy_vm(kvm); 589 kvm_arch_destroy_vm(kvm);
596 kvm_free_physmem(kvm); 590 kvm_free_physmem(kvm);
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
686 slots->generation++; 680 slots->generation++;
687} 681}
688 682
683static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
684{
685 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
686
687#ifdef KVM_CAP_READONLY_MEM
688 valid_flags |= KVM_MEM_READONLY;
689#endif
690
691 if (mem->flags & ~valid_flags)
692 return -EINVAL;
693
694 return 0;
695}
696
689/* 697/*
690 * Allocate some memory and give it an address in the guest physical address 698 * Allocate some memory and give it an address in the guest physical address
691 * space. 699 * space.
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
706 struct kvm_memory_slot old, new; 714 struct kvm_memory_slot old, new;
707 struct kvm_memslots *slots, *old_memslots; 715 struct kvm_memslots *slots, *old_memslots;
708 716
717 r = check_memory_region_flags(mem);
718 if (r)
719 goto out;
720
709 r = -EINVAL; 721 r = -EINVAL;
710 /* General sanity checks */ 722 /* General sanity checks */
711 if (mem->memory_size & (PAGE_SIZE - 1)) 723 if (mem->memory_size & (PAGE_SIZE - 1))
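check_memory_region_flags() is the gate for the new KVM_MEM_READONLY bit. A rough userspace sketch of registering a read-only slot (e.g. backing a ROM) follows, assuming 'rom' is a page-aligned host buffer and the slot/size values are caller-chosen; on x86, guest writes to such a slot are expected to exit to userspace rather than modify the backing memory.

/*
 * Sketch: register a read-only memslot via KVM_SET_USER_MEMORY_REGION.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int map_rom_readonly(int vmfd, void *rom, __u64 gpa, __u64 size,
			    __u32 slot)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = KVM_MEM_READONLY,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = (__u64)(unsigned long)rom,
	};

	return ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
}

On kernels without KVM_CAP_READONLY_MEM the flag is rejected here with -EINVAL, so userspace should probe the capability (KVM_CHECK_EXTENSION) first.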
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
769 if (npages && !old.npages) { 781 if (npages && !old.npages) {
770 new.user_alloc = user_alloc; 782 new.user_alloc = user_alloc;
771 new.userspace_addr = mem->userspace_addr; 783 new.userspace_addr = mem->userspace_addr;
772#ifndef CONFIG_S390 784
773 new.rmap = vzalloc(npages * sizeof(*new.rmap));
774 if (!new.rmap)
775 goto out_free;
776#endif /* not defined CONFIG_S390 */
777 if (kvm_arch_create_memslot(&new, npages)) 785 if (kvm_arch_create_memslot(&new, npages))
778 goto out_free; 786 goto out_free;
779 } 787 }
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
785 /* destroy any largepage mappings for dirty tracking */ 793 /* destroy any largepage mappings for dirty tracking */
786 } 794 }
787 795
788 if (!npages) { 796 if (!npages || base_gfn != old.base_gfn) {
789 struct kvm_memory_slot *slot; 797 struct kvm_memory_slot *slot;
790 798
791 r = -ENOMEM; 799 r = -ENOMEM;
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
801 old_memslots = kvm->memslots; 809 old_memslots = kvm->memslots;
802 rcu_assign_pointer(kvm->memslots, slots); 810 rcu_assign_pointer(kvm->memslots, slots);
803 synchronize_srcu_expedited(&kvm->srcu); 811 synchronize_srcu_expedited(&kvm->srcu);
804 /* From this point no new shadow pages pointing to a deleted 812 /* From this point no new shadow pages pointing to a deleted,
805 * memslot will be created. 813 * or moved, memslot will be created.
806 * 814 *
807 * validation of sp->gfn happens in: 815 * validation of sp->gfn happens in:
808 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 816 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
809 * - kvm_is_visible_gfn (mmu_check_roots) 817 * - kvm_is_visible_gfn (mmu_check_roots)
810 */ 818 */
811 kvm_arch_flush_shadow(kvm); 819 kvm_arch_flush_shadow_memslot(kvm, slot);
812 kfree(old_memslots); 820 kfree(old_memslots);
813 } 821 }
814 822
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
832 840
833 /* actual memory is freed via old in kvm_free_physmem_slot below */ 841 /* actual memory is freed via old in kvm_free_physmem_slot below */
834 if (!npages) { 842 if (!npages) {
835 new.rmap = NULL;
836 new.dirty_bitmap = NULL; 843 new.dirty_bitmap = NULL;
837 memset(&new.arch, 0, sizeof(new.arch)); 844 memset(&new.arch, 0, sizeof(new.arch));
838 } 845 }
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
844 851
845 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 852 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
846 853
847 /*
848 * If the new memory slot is created, we need to clear all
849 * mmio sptes.
850 */
851 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
852 kvm_arch_flush_shadow(kvm);
853
854 kvm_free_physmem_slot(&old, &new); 854 kvm_free_physmem_slot(&old, &new);
855 kfree(old_memslots); 855 kfree(old_memslots);
856 856
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void)
932} 932}
933EXPORT_SYMBOL_GPL(kvm_disable_largepages); 933EXPORT_SYMBOL_GPL(kvm_disable_largepages);
934 934
935int is_error_page(struct page *page)
936{
937 return page == bad_page || page == hwpoison_page || page == fault_page;
938}
939EXPORT_SYMBOL_GPL(is_error_page);
940
941int is_error_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
944}
945EXPORT_SYMBOL_GPL(is_error_pfn);
946
947int is_hwpoison_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn;
950}
951EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
952
953int is_fault_pfn(pfn_t pfn)
954{
955 return pfn == fault_pfn;
956}
957EXPORT_SYMBOL_GPL(is_fault_pfn);
958
959int is_noslot_pfn(pfn_t pfn)
960{
961 return pfn == bad_pfn;
962}
963EXPORT_SYMBOL_GPL(is_noslot_pfn);
964
965int is_invalid_pfn(pfn_t pfn)
966{
967 return pfn == hwpoison_pfn || pfn == fault_pfn;
968}
969EXPORT_SYMBOL_GPL(is_invalid_pfn);
970
971static inline unsigned long bad_hva(void)
972{
973 return PAGE_OFFSET;
974}
975
976int kvm_is_error_hva(unsigned long addr)
977{
978 return addr == bad_hva();
979}
980EXPORT_SYMBOL_GPL(kvm_is_error_hva);
981
982struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 935struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
983{ 936{
984 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 937 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +974,62 @@ out:
1021 return size; 974 return size;
1022} 975}
1023 976
1024static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 977static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1025 gfn_t *nr_pages) 978{
979 return slot->flags & KVM_MEM_READONLY;
980}
981
982static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
983 gfn_t *nr_pages, bool write)
1026{ 984{
1027 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 985 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1028 return bad_hva(); 986 return KVM_HVA_ERR_BAD;
987
988 if (memslot_is_readonly(slot) && write)
989 return KVM_HVA_ERR_RO_BAD;
1029 990
1030 if (nr_pages) 991 if (nr_pages)
1031 *nr_pages = slot->npages - (gfn - slot->base_gfn); 992 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1032 993
1033 return gfn_to_hva_memslot(slot, gfn); 994 return __gfn_to_hva_memslot(slot, gfn);
1034} 995}
1035 996
997static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
998 gfn_t *nr_pages)
999{
1000 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1001}
1002
1003unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1004 gfn_t gfn)
1005{
1006 return gfn_to_hva_many(slot, gfn, NULL);
1007}
1008EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1009
1036unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1010unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1037{ 1011{
1038 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1012 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1039} 1013}
1040EXPORT_SYMBOL_GPL(gfn_to_hva); 1014EXPORT_SYMBOL_GPL(gfn_to_hva);
1041 1015
1042static pfn_t get_fault_pfn(void) 1016/*
1017 * The hva returned by this function is only allowed to be read.
1018 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1019 */
1020static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1021{
1022 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
1023}
1024
1025static int kvm_read_hva(void *data, void __user *hva, int len)
1043{ 1026{
1044 get_page(fault_page); 1027 return __copy_from_user(data, hva, len);
1045 return fault_pfn; 1028}
1029
1030static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1031{
1032 return __copy_from_user_inatomic(data, hva, len);
1046} 1033}
1047 1034
1048int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1035int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1065 return rc == -EHWPOISON; 1052 return rc == -EHWPOISON;
1066} 1053}
1067 1054
1068static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, 1055/*
1069 bool *async, bool write_fault, bool *writable) 1056 * The atomic path to get the writable pfn which will be stored in @pfn,
1057 * true indicates success, otherwise false is returned.
1058 */
1059static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1060 bool write_fault, bool *writable, pfn_t *pfn)
1070{ 1061{
1071 struct page *page[1]; 1062 struct page *page[1];
1072 int npages = 0; 1063 int npages;
1073 pfn_t pfn;
1074 1064
1075 /* we can do it either atomically or asynchronously, not both */ 1065 if (!(async || atomic))
1076 BUG_ON(atomic && async); 1066 return false;
1077 1067
1078 BUG_ON(!write_fault && !writable); 1068 /*
1069 * Fast pin a writable pfn only if it is a write fault request
1070 * or the caller allows to map a writable pfn for a read fault
1071 * request.
1072 */
1073 if (!(write_fault || writable))
1074 return false;
1079 1075
1080 if (writable) 1076 npages = __get_user_pages_fast(addr, 1, 1, page);
1081 *writable = true; 1077 if (npages == 1) {
1078 *pfn = page_to_pfn(page[0]);
1082 1079
1083 if (atomic || async) 1080 if (writable)
1084 npages = __get_user_pages_fast(addr, 1, 1, page); 1081 *writable = true;
1082 return true;
1083 }
1085 1084
1086 if (unlikely(npages != 1) && !atomic) { 1085 return false;
1087 might_sleep(); 1086}
1088 1087
1089 if (writable) 1088/*
1090 *writable = write_fault; 1089 * The slow path to get the pfn of the specified host virtual address,
1090 * 1 indicates success, -errno is returned if error is detected.
1091 */
1092static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1093 bool *writable, pfn_t *pfn)
1094{
1095 struct page *page[1];
1096 int npages = 0;
1091 1097
1092 if (async) { 1098 might_sleep();
1093 down_read(&current->mm->mmap_sem); 1099
1094 npages = get_user_page_nowait(current, current->mm, 1100 if (writable)
1095 addr, write_fault, page); 1101 *writable = write_fault;
1096 up_read(&current->mm->mmap_sem); 1102
1097 } else 1103 if (async) {
1098 npages = get_user_pages_fast(addr, 1, write_fault, 1104 down_read(&current->mm->mmap_sem);
1099 page); 1105 npages = get_user_page_nowait(current, current->mm,
1100 1106 addr, write_fault, page);
1101 /* map read fault as writable if possible */ 1107 up_read(&current->mm->mmap_sem);
1102 if (unlikely(!write_fault) && npages == 1) { 1108 } else
1103 struct page *wpage[1]; 1109 npages = get_user_pages_fast(addr, 1, write_fault,
1104 1110 page);
1105 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1111 if (npages != 1)
1106 if (npages == 1) { 1112 return npages;
1107 *writable = true; 1113
1108 put_page(page[0]); 1114 /* map read fault as writable if possible */
1109 page[0] = wpage[0]; 1115 if (unlikely(!write_fault) && writable) {
1110 } 1116 struct page *wpage[1];
1111 npages = 1; 1117
1118 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1119 if (npages == 1) {
1120 *writable = true;
1121 put_page(page[0]);
1122 page[0] = wpage[0];
1112 } 1123 }
1124
1125 npages = 1;
1113 } 1126 }
1127 *pfn = page_to_pfn(page[0]);
1128 return npages;
1129}
1114 1130
1115 if (unlikely(npages != 1)) { 1131static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1116 struct vm_area_struct *vma; 1132{
1133 if (unlikely(!(vma->vm_flags & VM_READ)))
1134 return false;
1117 1135
1118 if (atomic) 1136 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1119 return get_fault_pfn(); 1137 return false;
1120 1138
1121 down_read(&current->mm->mmap_sem); 1139 return true;
1122 if (npages == -EHWPOISON || 1140}
1123 (!async && check_user_page_hwpoison(addr))) {
1124 up_read(&current->mm->mmap_sem);
1125 get_page(hwpoison_page);
1126 return page_to_pfn(hwpoison_page);
1127 }
1128 1141
1129 vma = find_vma_intersection(current->mm, addr, addr+1); 1142/*
1130 1143 * Pin guest page in memory and return its pfn.
1131 if (vma == NULL) 1144 * @addr: host virtual address which maps memory to the guest
1132 pfn = get_fault_pfn(); 1145 * @atomic: whether this function can sleep
1133 else if ((vma->vm_flags & VM_PFNMAP)) { 1146 * @async: whether this function need to wait IO complete if the
1134 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1147 * host page is not in the memory
1135 vma->vm_pgoff; 1148 * @write_fault: whether we should get a writable host page
1136 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1149 * @writable: whether it allows to map a writable host page for !@write_fault
1137 } else { 1150 *
1138 if (async && (vma->vm_flags & VM_WRITE)) 1151 * The function will map a writable host page for these two cases:
1139 *async = true; 1152 * 1): @write_fault = true
1140 pfn = get_fault_pfn(); 1153 * 2): @write_fault = false && @writable, @writable will tell the caller
1141 } 1154 * whether the mapping is writable.
1142 up_read(&current->mm->mmap_sem); 1155 */
1143 } else 1156static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1144 pfn = page_to_pfn(page[0]); 1157 bool write_fault, bool *writable)
1158{
1159 struct vm_area_struct *vma;
1160 pfn_t pfn = 0;
1161 int npages;
1162
1163 /* we can do it either atomically or asynchronously, not both */
1164 BUG_ON(atomic && async);
1145 1165
1166 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1167 return pfn;
1168
1169 if (atomic)
1170 return KVM_PFN_ERR_FAULT;
1171
1172 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1173 if (npages == 1)
1174 return pfn;
1175
1176 down_read(&current->mm->mmap_sem);
1177 if (npages == -EHWPOISON ||
1178 (!async && check_user_page_hwpoison(addr))) {
1179 pfn = KVM_PFN_ERR_HWPOISON;
1180 goto exit;
1181 }
1182
1183 vma = find_vma_intersection(current->mm, addr, addr + 1);
1184
1185 if (vma == NULL)
1186 pfn = KVM_PFN_ERR_FAULT;
1187 else if ((vma->vm_flags & VM_PFNMAP)) {
1188 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1189 vma->vm_pgoff;
1190 BUG_ON(!kvm_is_mmio_pfn(pfn));
1191 } else {
1192 if (async && vma_is_valid(vma, write_fault))
1193 *async = true;
1194 pfn = KVM_PFN_ERR_FAULT;
1195 }
1196exit:
1197 up_read(&current->mm->mmap_sem);
1146 return pfn; 1198 return pfn;
1147} 1199}
1148 1200
1149pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1201static pfn_t
1202__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1203 bool *async, bool write_fault, bool *writable)
1150{ 1204{
1151 return hva_to_pfn(kvm, addr, true, NULL, true, NULL); 1205 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1206
1207 if (addr == KVM_HVA_ERR_RO_BAD)
1208 return KVM_PFN_ERR_RO_FAULT;
1209
1210 if (kvm_is_error_hva(addr))
1211 return KVM_PFN_ERR_BAD;
1212
1213 /* Do not map writable pfn in the readonly memslot. */
1214 if (writable && memslot_is_readonly(slot)) {
1215 *writable = false;
1216 writable = NULL;
1217 }
1218
1219 return hva_to_pfn(addr, atomic, async, write_fault,
1220 writable);
1152} 1221}
1153EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1154 1222
1155static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1223static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1156 bool write_fault, bool *writable) 1224 bool write_fault, bool *writable)
1157{ 1225{
1158 unsigned long addr; 1226 struct kvm_memory_slot *slot;
1159 1227
1160 if (async) 1228 if (async)
1161 *async = false; 1229 *async = false;
1162 1230
1163 addr = gfn_to_hva(kvm, gfn); 1231 slot = gfn_to_memslot(kvm, gfn);
1164 if (kvm_is_error_hva(addr)) {
1165 get_page(bad_page);
1166 return page_to_pfn(bad_page);
1167 }
1168 1232
1169 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); 1233 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1234 writable);
1170} 1235}
1171 1236
1172pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1237pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1195} 1260}
1196EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1261EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1197 1262
1198pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1263pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1199 struct kvm_memory_slot *slot, gfn_t gfn) 1264{
1265 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1266}
1267
1268pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1200{ 1269{
1201 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1270 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1202 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1203} 1271}
1272EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1204 1273
1205int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1274int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1206 int nr_pages) 1275 int nr_pages)
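As a reading aid for the write_fault/writable contract documented on hva_to_pfn() above, a hypothetical in-kernel caller resolving a gfn for a read access might look like the fragment below; this is a sketch only, the function name is invented and error handling is abbreviated.

/*
 * Sketch: case (2) of the hva_to_pfn() comment -- write_fault == false
 * with a non-NULL 'writable', so the page is opportunistically mapped
 * writable when the slot and the host mapping allow it.
 */
#include <linux/kvm_host.h>

static int example_map_gfn_for_read(struct kvm *kvm, gfn_t gfn)
{
	bool writable;
	pfn_t pfn;

	pfn = gfn_to_pfn_prot(kvm, gfn, false, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	/*
	 * 'writable' is forced to false for a KVM_MEM_READONLY slot by
	 * __gfn_to_pfn_memslot() above, or left false when the host page
	 * could only be pinned read-only.
	 */
	kvm_release_pfn_clean(pfn);
	return writable ? 1 : 0;
}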
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1219} 1288}
1220EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1289EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1221 1290
1291static struct page *kvm_pfn_to_page(pfn_t pfn)
1292{
1293 if (is_error_pfn(pfn))
1294 return KVM_ERR_PTR_BAD_PAGE;
1295
1296 if (kvm_is_mmio_pfn(pfn)) {
1297 WARN_ON(1);
1298 return KVM_ERR_PTR_BAD_PAGE;
1299 }
1300
1301 return pfn_to_page(pfn);
1302}
1303
1222struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1304struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1223{ 1305{
1224 pfn_t pfn; 1306 pfn_t pfn;
1225 1307
1226 pfn = gfn_to_pfn(kvm, gfn); 1308 pfn = gfn_to_pfn(kvm, gfn);
1227 if (!kvm_is_mmio_pfn(pfn))
1228 return pfn_to_page(pfn);
1229
1230 WARN_ON(kvm_is_mmio_pfn(pfn));
1231 1309
1232 get_page(bad_page); 1310 return kvm_pfn_to_page(pfn);
1233 return bad_page;
1234} 1311}
1235 1312
1236EXPORT_SYMBOL_GPL(gfn_to_page); 1313EXPORT_SYMBOL_GPL(gfn_to_page);
1237 1314
1238void kvm_release_page_clean(struct page *page) 1315void kvm_release_page_clean(struct page *page)
1239{ 1316{
1317 WARN_ON(is_error_page(page));
1318
1240 kvm_release_pfn_clean(page_to_pfn(page)); 1319 kvm_release_pfn_clean(page_to_pfn(page));
1241} 1320}
1242EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1321EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1243 1322
1244void kvm_release_pfn_clean(pfn_t pfn) 1323void kvm_release_pfn_clean(pfn_t pfn)
1245{ 1324{
1325 WARN_ON(is_error_pfn(pfn));
1326
1246 if (!kvm_is_mmio_pfn(pfn)) 1327 if (!kvm_is_mmio_pfn(pfn))
1247 put_page(pfn_to_page(pfn)); 1328 put_page(pfn_to_page(pfn));
1248} 1329}
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1250 1331
1251void kvm_release_page_dirty(struct page *page) 1332void kvm_release_page_dirty(struct page *page)
1252{ 1333{
1334 WARN_ON(is_error_page(page));
1335
1253 kvm_release_pfn_dirty(page_to_pfn(page)); 1336 kvm_release_pfn_dirty(page_to_pfn(page));
1254} 1337}
1255EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1338EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
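With bad_page gone, gfn_to_page() now hands back KVM_ERR_PTR_BAD_PAGE on failure and the release helpers WARN on error pages, so filtering errors is the caller's job. A hypothetical caller pattern (sketch only, name invented):

/*
 * Sketch: check for the error page before using or releasing it.
 */
#include <linux/kvm_host.h>

static int example_touch_guest_page(struct kvm *kvm, gfn_t gfn)
{
	struct page *page;

	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page))
		return -EFAULT;	/* never release an error page */

	/* ... use page_address(page) or kmap() here ... */

	kvm_release_page_clean(page);
	return 0;
}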
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1305 int r; 1388 int r;
1306 unsigned long addr; 1389 unsigned long addr;
1307 1390
1308 addr = gfn_to_hva(kvm, gfn); 1391 addr = gfn_to_hva_read(kvm, gfn);
1309 if (kvm_is_error_hva(addr)) 1392 if (kvm_is_error_hva(addr))
1310 return -EFAULT; 1393 return -EFAULT;
1311 r = __copy_from_user(data, (void __user *)addr + offset, len); 1394 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1312 if (r) 1395 if (r)
1313 return -EFAULT; 1396 return -EFAULT;
1314 return 0; 1397 return 0;
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1343 gfn_t gfn = gpa >> PAGE_SHIFT; 1426 gfn_t gfn = gpa >> PAGE_SHIFT;
1344 int offset = offset_in_page(gpa); 1427 int offset = offset_in_page(gpa);
1345 1428
1346 addr = gfn_to_hva(kvm, gfn); 1429 addr = gfn_to_hva_read(kvm, gfn);
1347 if (kvm_is_error_hva(addr)) 1430 if (kvm_is_error_hva(addr))
1348 return -EFAULT; 1431 return -EFAULT;
1349 pagefault_disable(); 1432 pagefault_disable();
1350 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1433 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1351 pagefault_enable(); 1434 pagefault_enable();
1352 if (r) 1435 if (r)
1353 return -EFAULT; 1436 return -EFAULT;
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1580} 1663}
1581EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1664EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1582 1665
1666#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1667/*
1668 * Helper that checks whether a VCPU is eligible for directed yield.
 1669 * The most eligible candidate to yield to is decided by the following heuristics:
1670 *
 1671 * (a) VCPU which has not recently done a pl-exit or had cpu relax intercepted
 1672 * (a preempted lock holder), indicated by @in_spin_loop.
 1673 * Set at the beginning and cleared at the end of the interception/PLE handler.
1674 *
 1675 * (b) VCPU which has done a pl-exit/cpu-relax intercept but did not get a
 1676 * chance last time (it has most likely become eligible now since we probably
 1677 * yielded to the lock holder in the last iteration. This is done by toggling
 1678 * @dy_eligible each time a VCPU is checked for eligibility.)
1679 *
 1680 * Yielding to a recently pl-exited/cpu-relax-intercepted VCPU before yielding
 1681 * to a preempted lock holder could result in wrong VCPU selection and CPU
 1682 * burning. Giving priority to a potential lock holder increases lock
 1683 * progress.
 1684 *
 1685 * Since the algorithm is based on heuristics, accessing another VCPU's data
 1686 * without locking does no harm. It may result in trying to yield to the same
 1687 * VCPU, failing, and continuing with the next VCPU, and so on.
1688 */
1689bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1690{
1691 bool eligible;
1692
1693 eligible = !vcpu->spin_loop.in_spin_loop ||
1694 (vcpu->spin_loop.in_spin_loop &&
1695 vcpu->spin_loop.dy_eligible);
1696
1697 if (vcpu->spin_loop.in_spin_loop)
1698 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1699
1700 return eligible;
1701}
1702#endif
1583void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1703void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1584{ 1704{
1585 struct kvm *kvm = me->kvm; 1705 struct kvm *kvm = me->kvm;
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1589 int pass; 1709 int pass;
1590 int i; 1710 int i;
1591 1711
1712 kvm_vcpu_set_in_spin_loop(me, true);
1592 /* 1713 /*
1593 * We boost the priority of a VCPU that is runnable but not 1714 * We boost the priority of a VCPU that is runnable but not
1594 * currently running, because it got preempted by something 1715 * currently running, because it got preempted by something
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1607 continue; 1728 continue;
1608 if (waitqueue_active(&vcpu->wq)) 1729 if (waitqueue_active(&vcpu->wq))
1609 continue; 1730 continue;
1731 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1732 continue;
1610 if (kvm_vcpu_yield_to(vcpu)) { 1733 if (kvm_vcpu_yield_to(vcpu)) {
1611 kvm->last_boosted_vcpu = i; 1734 kvm->last_boosted_vcpu = i;
1612 yielded = 1; 1735 yielded = 1;
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1614 } 1737 }
1615 } 1738 }
1616 } 1739 }
1740 kvm_vcpu_set_in_spin_loop(me, false);
1741
1742 /* Ensure vcpu is not eligible during next spinloop */
1743 kvm_vcpu_set_dy_eligible(me, false);
1617} 1744}
1618EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1745EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1619 1746
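Taken on its own, the eligibility rule amounts to this: a vCPU that is not itself PLE-exiting is always a candidate, while a spinning vCPU is offered a directed yield only every other check. The toy user-space model below (not kernel code; all names are invented) just makes the toggling visible.

/* Toy model of the directed-yield eligibility heuristic. */
#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu {
	bool in_spin_loop;	/* currently in the PLE/cpu-relax handler */
	bool dy_eligible;	/* flipped on every eligibility check */
};

static bool toy_eligible(struct toy_vcpu *v)
{
	bool eligible = !v->in_spin_loop ||
			(v->in_spin_loop && v->dy_eligible);

	if (v->in_spin_loop)
		v->dy_eligible = !v->dy_eligible;

	return eligible;
}

int main(void)
{
	struct toy_vcpu spinner = { .in_spin_loop = true, .dy_eligible = false };
	int i;

	/* Prints: skipped, eligible, skipped, eligible. */
	for (i = 0; i < 4; i++)
		printf("check %d: %s\n", i,
		       toy_eligible(&spinner) ? "eligible" : "skipped");
	return 0;
}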
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
1766#endif 1893#endif
1767 1894
1768 1895
1769 vcpu_load(vcpu); 1896 r = vcpu_load(vcpu);
1897 if (r)
1898 return r;
1770 switch (ioctl) { 1899 switch (ioctl) {
1771 case KVM_RUN: 1900 case KVM_RUN:
1772 r = -EINVAL; 1901 r = -EINVAL;
@@ -2094,6 +2223,29 @@ static long kvm_vm_ioctl(struct file *filp,
2094 break; 2223 break;
2095 } 2224 }
2096#endif 2225#endif
2226#ifdef __KVM_HAVE_IRQ_LINE
2227 case KVM_IRQ_LINE_STATUS:
2228 case KVM_IRQ_LINE: {
2229 struct kvm_irq_level irq_event;
2230
2231 r = -EFAULT;
2232 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2233 goto out;
2234
2235 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2236 if (r)
2237 goto out;
2238
2239 r = -EFAULT;
2240 if (ioctl == KVM_IRQ_LINE_STATUS) {
2241 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2242 goto out;
2243 }
2244
2245 r = 0;
2246 break;
2247 }
2248#endif
2097 default: 2249 default:
2098 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2250 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2099 if (r == -ENOTTY) 2251 if (r == -ENOTTY)
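With KVM_IRQ_LINE/KVM_IRQ_LINE_STATUS now handled in the generic ioctl path, any architecture defining __KVM_HAVE_IRQ_LINE gets them for free. A rough userspace sketch, assuming the usual kvm_set_irq() return convention (0 meaning the assertion was coalesced with one already pending):

/*
 * Sketch: pulse a GSI and report whether the assertion was coalesced.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int pulse_gsi(int vmfd, unsigned int gsi, int *coalesced)
{
	struct kvm_irq_level irq = { .irq = gsi, .level = 1 };
	int r;

	r = ioctl(vmfd, KVM_IRQ_LINE_STATUS, &irq);
	if (r < 0)
		return r;

	/* On return the union member holds the delivery status. */
	*coalesced = (irq.status == 0);

	irq.irq = gsi;		/* the union was overwritten by 'status' */
	irq.level = 0;
	return ioctl(vmfd, KVM_IRQ_LINE, &irq);
}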
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = {
2698 .resume = kvm_resume, 2850 .resume = kvm_resume,
2699}; 2851};
2700 2852
2701struct page *bad_page;
2702pfn_t bad_pfn;
2703
2704static inline 2853static inline
2705struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2854struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2706{ 2855{
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2732 if (r) 2881 if (r)
2733 goto out_fail; 2882 goto out_fail;
2734 2883
2735 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2736
2737 if (bad_page == NULL) {
2738 r = -ENOMEM;
2739 goto out;
2740 }
2741
2742 bad_pfn = page_to_pfn(bad_page);
2743
2744 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2745
2746 if (hwpoison_page == NULL) {
2747 r = -ENOMEM;
2748 goto out_free_0;
2749 }
2750
2751 hwpoison_pfn = page_to_pfn(hwpoison_page);
2752
2753 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2754
2755 if (fault_page == NULL) {
2756 r = -ENOMEM;
2757 goto out_free_0;
2758 }
2759
2760 fault_pfn = page_to_pfn(fault_page);
2761
2762 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2884 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2763 r = -ENOMEM; 2885 r = -ENOMEM;
2764 goto out_free_0; 2886 goto out_free_0;
@@ -2833,12 +2955,6 @@ out_free_1:
2833out_free_0a: 2955out_free_0a:
2834 free_cpumask_var(cpus_hardware_enabled); 2956 free_cpumask_var(cpus_hardware_enabled);
2835out_free_0: 2957out_free_0:
2836 if (fault_page)
2837 __free_page(fault_page);
2838 if (hwpoison_page)
2839 __free_page(hwpoison_page);
2840 __free_page(bad_page);
2841out:
2842 kvm_arch_exit(); 2958 kvm_arch_exit();
2843out_fail: 2959out_fail:
2844 return r; 2960 return r;
@@ -2858,8 +2974,5 @@ void kvm_exit(void)
2858 kvm_arch_hardware_unsetup(); 2974 kvm_arch_hardware_unsetup();
2859 kvm_arch_exit(); 2975 kvm_arch_exit();
2860 free_cpumask_var(cpus_hardware_enabled); 2976 free_cpumask_var(cpus_hardware_enabled);
2861 __free_page(fault_page);
2862 __free_page(hwpoison_page);
2863 __free_page(bad_page);
2864} 2977}
2865EXPORT_SYMBOL_GPL(kvm_exit); 2978EXPORT_SYMBOL_GPL(kvm_exit);