author    Linus Torvalds <torvalds@linux-foundation.org>   2012-10-04 12:30:33 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>   2012-10-04 12:30:33 -0400
commit    ecefbd94b834fa32559d854646d777c56749ef1c (patch)
tree      ca8958900ad9e208a8e5fb7704f1b66dc76131b4
parent    ce57e981f2b996aaca2031003b3f866368307766 (diff)
parent    3d11df7abbff013b811d5615320580cd5d9d7d31 (diff)
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
"Highlights of the changes for this release include support for vfio
level triggered interrupts, improved big real mode support on older
Intels, a streamlined guest page table walker, guest APIC speedups,
PIO optimizations, better overcommit handling, and read-only memory."
* tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
KVM: s390: Fix vcpu_load handling in interrupt code
KVM: x86: Fix guest debug across vcpu INIT reset
KVM: Add resampling irqfds for level triggered interrupts
KVM: optimize apic interrupt delivery
KVM: MMU: Eliminate pointless temporary 'ac'
KVM: MMU: Avoid access/dirty update loop if all is well
KVM: MMU: Eliminate eperm temporary
KVM: MMU: Optimize is_last_gpte()
KVM: MMU: Simplify walk_addr_generic() loop
KVM: MMU: Optimize pte permission checks
KVM: MMU: Update accessed and dirty bits after guest pagetable walk
KVM: MMU: Move gpte_access() out of paging_tmpl.h
KVM: MMU: Optimize gpte_access() slightly
KVM: MMU: Push clean gpte write protection out of gpte_access()
KVM: clarify kvmclock documentation
KVM: make processes waiting on vcpu mutex killable
KVM: SVM: Make use of asm.h
KVM: VMX: Make use of asm.h
KVM: VMX: Make lto-friendly
KVM: x86: lapic: Clean up find_highest_vector() and count_vectors()
...
Conflicts:
arch/s390/include/asm/processor.h
arch/x86/kvm/i8259.c
62 files changed, 3006 insertions, 1466 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bf33aaa4c59f..f6ec3a92e621 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt | |||
@@ -857,7 +857,8 @@ struct kvm_userspace_memory_region { | |||
857 | }; | 857 | }; |
858 | 858 | ||
859 | /* for kvm_memory_region::flags */ | 859 | /* for kvm_memory_region::flags */ |
860 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | 860 | #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) |
861 | #define KVM_MEM_READONLY (1UL << 1) | ||
861 | 862 | ||
862 | This ioctl allows the user to create or modify a guest physical memory | 863 | This ioctl allows the user to create or modify a guest physical memory |
863 | slot. When changing an existing slot, it may be moved in the guest | 864 | slot. When changing an existing slot, it may be moved in the guest |
@@ -873,14 +874,17 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr | |||
873 | be identical. This allows large pages in the guest to be backed by large | 874 | be identical. This allows large pages in the guest to be backed by large |
874 | pages in the host. | 875 | pages in the host. |
875 | 876 | ||
876 | The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which | 877 | The flags field supports two flags, KVM_MEM_LOG_DIRTY_PAGES, which instructs |
877 | instructs kvm to keep track of writes to memory within the slot. See | 878 | kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG |
878 | the KVM_GET_DIRTY_LOG ioctl. | 879 | ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the |
880 | KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only | ||
881 | allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO | ||
882 | exits. | ||
879 | 883 | ||
880 | When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory | 884 | When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of |
881 | region are automatically reflected into the guest. For example, an mmap() | 885 | the memory region are automatically reflected into the guest. For example, an |
882 | that affects the region will be made visible immediately. Another example | 886 | mmap() that affects the region will be made visible immediately. Another |
883 | is madvise(MADV_DROP). | 887 | example is madvise(MADV_DROP). |
884 | 888 | ||
885 | It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. | 889 | It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. |
886 | The KVM_SET_MEMORY_REGION does not allow fine grained control over memory | 890 | The KVM_SET_MEMORY_REGION does not allow fine grained control over memory |
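For illustration, a minimal userspace sketch of registering a read-only slot as described in the hunk above. vm_fd, the slot number and the backing buffer are placeholders, and KVM_MEM_READONLY is assumed to be present in the installed headers (KVM_CAP_READONLY_MEM advertises support at runtime):

    /* Sketch only: register a guest memory slot as read-only. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_readonly_slot(int vm_fd, void *backing, __u64 gpa, __u64 size)
    {
            struct kvm_userspace_memory_region region;

            memset(&region, 0, sizeof(region));
            region.slot            = 1;                /* hypothetical slot number */
            region.flags           = KVM_MEM_READONLY; /* guest writes exit as KVM_EXIT_MMIO */
            region.guest_phys_addr = gpa;
            region.memory_size     = size;
            region.userspace_addr  = (__u64)(unsigned long)backing;

            return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
    }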
@@ -1946,6 +1950,19 @@ the guest using the specified gsi pin. The irqfd is removed using | |||
1946 | the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd | 1950 | the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd |
1947 | and kvm_irqfd.gsi. | 1951 | and kvm_irqfd.gsi. |
1948 | 1952 | ||
1953 | With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify | ||
1954 | mechanism allowing emulation of level-triggered, irqfd-based | ||
1955 | interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an | ||
1956 | additional eventfd in the kvm_irqfd.resamplefd field. When operating | ||
1957 | in resample mode, posting of an interrupt through kvm_irq.fd asserts | ||
1958 | the specified gsi in the irqchip. When the irqchip is resampled, such | ||
1959 | as from an EOI, the gsi is de-asserted and the user is notified via | ||
1960 | kvm_irqfd.resamplefd. It is the user's responsibility to re-queue | ||
1961 | the interrupt if the device making use of it still requires service. | ||
1962 | Note that closing the resamplefd is not sufficient to disable the | ||
1963 | irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment | ||
1964 | and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. | ||
1965 | |||
1949 | 4.76 KVM_PPC_ALLOCATE_HTAB | 1966 | 4.76 KVM_PPC_ALLOCATE_HTAB |
1950 | 1967 | ||
1951 | Capability: KVM_CAP_PPC_ALLOC_HTAB | 1968 | Capability: KVM_CAP_PPC_ALLOC_HTAB |
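For illustration, a sketch of assigning a resampling irqfd as described in the KVM_IRQFD hunk above; vm_fd, the two eventfds and the GSI number are placeholders:

    /* Sketch only: level-triggered interrupt via a resampling irqfd. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int assign_resample_irqfd(int vm_fd, int irq_fd, int resample_fd, __u32 gsi)
    {
            struct kvm_irqfd irqfd;

            memset(&irqfd, 0, sizeof(irqfd));
            irqfd.fd         = irq_fd;                  /* signalled to assert the gsi */
            irqfd.gsi        = gsi;
            irqfd.flags      = KVM_IRQFD_FLAG_RESAMPLE; /* needed on assignment only */
            irqfd.resamplefd = resample_fd;             /* KVM signals this on de-assert (EOI) */

            return ioctl(vm_fd, KVM_IRQFD, &irqfd);
    }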
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt new file mode 100644 index 000000000000..ea113b5d87a4 --- /dev/null +++ b/Documentation/virtual/kvm/hypercalls.txt | |||
@@ -0,0 +1,66 @@ | |||
1 | Linux KVM Hypercall: | ||
2 | =================== | ||
3 | X86: | ||
4 | KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall | ||
5 | instruction. The hypervisor can replace it with instructions that are | ||
6 | guaranteed to be supported. | ||
7 | |||
8 | Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. | ||
9 | The hypercall number should be placed in rax and the return value will be | ||
10 | placed in rax. No other registers will be clobbered unless explicitly stated | ||
11 | by the particular hypercall. | ||
12 | |||
13 | S390: | ||
14 | R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall | ||
15 | number. The return value is written to R2. | ||
16 | |||
17 | S390 uses the diagnose instruction (0x500) as its hypercall, with the hypercall | ||
18 | number in R1. | ||
19 | |||
20 | PowerPC: | ||
21 | It uses R3-R10 for the arguments and R11 for the hypercall number. R4-R11 are used as output registers. | ||
22 | Return value is placed in R3. | ||
23 | |||
24 | KVM hypercalls use a 4-byte opcode that is patched in via the 'hypercall-instructions' | ||
25 | property inside the device tree's /hypervisor node. | ||
26 | For more information refer to Documentation/virtual/kvm/ppc-pv.txt | ||
27 | |||
28 | KVM Hypercalls Documentation | ||
29 | =========================== | ||
30 | The template for each hypercall is: | ||
31 | 1. Hypercall name. | ||
32 | 2. Architecture(s) | ||
33 | 3. Status (deprecated, obsolete, active) | ||
34 | 4. Purpose | ||
35 | |||
36 | 1. KVM_HC_VAPIC_POLL_IRQ | ||
37 | ------------------------ | ||
38 | Architecture: x86 | ||
39 | Status: active | ||
40 | Purpose: Trigger guest exit so that the host can check for pending | ||
41 | interrupts on reentry. | ||
42 | |||
43 | 2. KVM_HC_MMU_OP | ||
44 | ------------------------ | ||
45 | Architecture: x86 | ||
46 | Status: deprecated. | ||
47 | Purpose: Support MMU operations such as writing to PTE, | ||
48 | flushing TLB, release PT. | ||
49 | |||
50 | 3. KVM_HC_FEATURES | ||
51 | ------------------------ | ||
52 | Architecture: PPC | ||
53 | Status: active | ||
54 | Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid is | ||
55 | used to enumerate which hypercalls are available. On PPC, either a device-tree | ||
56 | based lookup (which is also what ePAPR dictates) or a KVM-specific enumeration | ||
57 | mechanism (which is this hypercall) can be used. | ||
58 | |||
59 | 4. KVM_HC_PPC_MAP_MAGIC_PAGE | ||
60 | ------------------------ | ||
61 | Architecture: PPC | ||
62 | Status: active | ||
63 | Purpose: To enable communication between the hypervisor and guest there is a | ||
64 | shared page that contains parts of supervisor visible register state. | ||
65 | The guest can map this shared page to access its supervisor registers through | ||
66 | memory using this hypercall. | ||
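As a rough guest-side illustration of the x86 calling convention described at the top of this file (hypercall number in rax, up to four arguments in rbx/rcx/rdx/rsi, return value in rax), assuming a 64-bit guest where vmcall is the appropriate instruction; the helper name is made up, and the kernel's own helpers live in arch/x86/include/asm/kvm_para.h:

    static inline long kvm_hypercall4_sketch(unsigned int nr, unsigned long p1,
                                             unsigned long p2, unsigned long p3,
                                             unsigned long p4)
    {
            long ret;

            /* rax = hypercall number and return value; rbx/rcx/rdx/rsi = arguments */
            asm volatile("vmcall"
                         : "=a" (ret)
                         : "a" (nr), "b" (p1), "c" (p2), "d" (p3), "S" (p4)
                         : "memory");
            return ret;
    }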
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 730471048583..6d470ae7b073 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt | |||
@@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 | |||
34 | time information and check that they are both equal and even. | 34 | time information and check that they are both equal and even. |
35 | An odd version indicates an in-progress update. | 35 | An odd version indicates an in-progress update. |
36 | 36 | ||
37 | sec: number of seconds for wallclock. | 37 | sec: number of seconds for wallclock at time of boot. |
38 | 38 | ||
39 | nsec: number of nanoseconds for wallclock. | 39 | nsec: number of nanoseconds for wallclock at time of boot. |
40 | |||
41 | In order to get the current wallclock time, the system_time from | ||
42 | MSR_KVM_SYSTEM_TIME_NEW needs to be added. | ||
40 | 43 | ||
41 | Note that although MSRs are per-CPU entities, the effect of this | 44 | Note that although MSRs are per-CPU entities, the effect of this |
42 | particular MSR is global. | 45 | particular MSR is global. |
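A small guest-side sketch of the addition described above (parameter names are illustrative, not the exact pvclock structure fields):

    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* current wall time = wallclock at boot + nanoseconds since boot (system_time) */
    static uint64_t current_wallclock_ns(uint32_t wc_sec, uint32_t wc_nsec,
                                         uint64_t system_time_ns)
    {
            return (uint64_t)wc_sec * NSEC_PER_SEC + wc_nsec + system_time_ns;
    }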
@@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 | |||
82 | time at the time this structure was last updated. Unit is | 85 | time at the time this structure was last updated. Unit is |
83 | nanoseconds. | 86 | nanoseconds. |
84 | 87 | ||
85 | tsc_to_system_mul: a function of the tsc frequency. One has | 88 | tsc_to_system_mul: multiplier to be used when converting |
86 | to multiply any tsc-related quantity by this value to get | 89 | tsc-related quantity to nanoseconds |
87 | a value in nanoseconds, besides dividing by 2^tsc_shift | ||
88 | 90 | ||
89 | tsc_shift: cycle to nanosecond divider, as a power of two, to | 91 | tsc_shift: shift to be used when converting tsc-related |
90 | allow for shift rights. One has to shift right any tsc-related | 92 | quantity to nanoseconds. This shift will ensure that |
91 | quantity by this value to get a value in nanoseconds, besides | 93 | multiplication with tsc_to_system_mul does not overflow. |
92 | multiplying by tsc_to_system_mul. | 94 | A positive value denotes a left shift, a negative value |
95 | a right shift. | ||
93 | 96 | ||
94 | With this information, guests can derive per-CPU time by | 97 | The conversion from tsc to nanoseconds involves an additional |
95 | doing: | 98 | right shift by 32 bits. With this information, guests can |
99 | derive per-CPU time by doing: | ||
96 | 100 | ||
97 | time = (current_tsc - tsc_timestamp) | 101 | time = (current_tsc - tsc_timestamp) |
98 | time = (time * tsc_to_system_mul) >> tsc_shift | 102 | if (tsc_shift >= 0) |
103 | time <<= tsc_shift; | ||
104 | else | ||
105 | time >>= -tsc_shift; | ||
106 | time = (time * tsc_to_system_mul) >> 32 | ||
99 | time = time + system_time | 107 | time = time + system_time |
100 | 108 | ||
101 | flags: bits in this field indicate extended capabilities | 109 | flags: bits in this field indicate extended capabilities |
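The scaling steps above, written out as a guest-side C sketch (assuming a compiler with 128-bit integer support; real guest implementations use their own fixed-point helpers, so the names here are illustrative):

    #include <stdint.h>

    static uint64_t pvclock_tsc_to_ns_sketch(uint64_t current_tsc, uint64_t tsc_timestamp,
                                             int8_t tsc_shift, uint32_t tsc_to_system_mul,
                                             uint64_t system_time)
    {
            uint64_t delta = current_tsc - tsc_timestamp;

            if (tsc_shift >= 0)
                    delta <<= tsc_shift;
            else
                    delta >>= -tsc_shift;

            /* 64x32 multiply, keeping bits 32..95: the ">> 32" step above */
            delta = (uint64_t)(((__uint128_t)delta * tsc_to_system_mul) >> 32);

            return delta + system_time;
    }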
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 4911cf95c67e..4cd076febb02 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt | |||
@@ -174,3 +174,25 @@ following: | |||
174 | That way we can inject an arbitrary amount of code as replacement for a single | 174 | That way we can inject an arbitrary amount of code as replacement for a single |
175 | instruction. This allows us to check for pending interrupts when setting EE=1 | 175 | instruction. This allows us to check for pending interrupts when setting EE=1 |
176 | for example. | 176 | for example. |
177 | |||
178 | Hypercall ABIs in KVM on PowerPC | ||
179 | ================================= | ||
180 | 1) KVM hypercalls (ePAPR) | ||
181 | |||
182 | These are ePAPR-compliant hypercall implementations (mentioned above). Even | ||
183 | generic hypercalls are implemented here, like the ePAPR idle hcall. These are | ||
184 | available on all targets. | ||
185 | |||
186 | 2) PAPR hypercalls | ||
187 | |||
188 | PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). | ||
189 | These are the same hypercalls that pHyp, the POWER hypervisor, implements. Some of | ||
190 | them are handled in the kernel, some are handled in user space. This is only | ||
191 | available on book3s_64. | ||
192 | |||
193 | 3) OSI hypercalls | ||
194 | |||
195 | Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long | ||
196 | before KVM). This is supported to maintain compatibility. All these hypercalls get | ||
197 | forwarded to user space. This is only useful on book3s_32, but can be used with | ||
198 | book3s_64 as well. | ||
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bd77cb507c1c..8b3a9c0e771d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c | |||
@@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
924 | return 0; | 924 | return 0; |
925 | } | 925 | } |
926 | 926 | ||
927 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) | ||
928 | { | ||
929 | if (!irqchip_in_kernel(kvm)) | ||
930 | return -ENXIO; | ||
931 | |||
932 | irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | ||
933 | irq_event->irq, irq_event->level); | ||
934 | return 0; | ||
935 | } | ||
936 | |||
927 | long kvm_arch_vm_ioctl(struct file *filp, | 937 | long kvm_arch_vm_ioctl(struct file *filp, |
928 | unsigned int ioctl, unsigned long arg) | 938 | unsigned int ioctl, unsigned long arg) |
929 | { | 939 | { |
@@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
963 | goto out; | 973 | goto out; |
964 | } | 974 | } |
965 | break; | 975 | break; |
966 | case KVM_IRQ_LINE_STATUS: | ||
967 | case KVM_IRQ_LINE: { | ||
968 | struct kvm_irq_level irq_event; | ||
969 | |||
970 | r = -EFAULT; | ||
971 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
972 | goto out; | ||
973 | r = -ENXIO; | ||
974 | if (irqchip_in_kernel(kvm)) { | ||
975 | __s32 status; | ||
976 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | ||
977 | irq_event.irq, irq_event.level); | ||
978 | if (ioctl == KVM_IRQ_LINE_STATUS) { | ||
979 | r = -EFAULT; | ||
980 | irq_event.status = status; | ||
981 | if (copy_to_user(argp, &irq_event, | ||
982 | sizeof irq_event)) | ||
983 | goto out; | ||
984 | } | ||
985 | r = 0; | ||
986 | } | ||
987 | break; | ||
988 | } | ||
989 | case KVM_GET_IRQCHIP: { | 976 | case KVM_GET_IRQCHIP: { |
990 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 977 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
991 | struct kvm_irqchip chip; | 978 | struct kvm_irqchip chip; |
@@ -1626,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
1626 | return; | 1613 | return; |
1627 | } | 1614 | } |
1628 | 1615 | ||
1629 | void kvm_arch_flush_shadow(struct kvm *kvm) | 1616 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
1630 | { | 1617 | { |
1631 | kvm_flush_remote_tlbs(kvm); | 1618 | kvm_flush_remote_tlbs(kvm); |
1632 | } | 1619 | } |
1633 | 1620 | ||
1621 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | ||
1622 | struct kvm_memory_slot *slot) | ||
1623 | { | ||
1624 | kvm_arch_flush_shadow_all(); | ||
1625 | } | ||
1626 | |||
1634 | long kvm_arch_dev_ioctl(struct file *filp, | 1627 | long kvm_arch_dev_ioctl(struct file *filp, |
1635 | unsigned int ioctl, unsigned long arg) | 1628 | unsigned int ioctl, unsigned long arg) |
1636 | { | 1629 | { |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a8bf5c673a3c..28e8f5e5c63e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
@@ -53,6 +53,8 @@ | |||
53 | 53 | ||
54 | struct kvm; | 54 | struct kvm; |
55 | extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 55 | extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
56 | extern int kvm_unmap_hva_range(struct kvm *kvm, | ||
57 | unsigned long start, unsigned long end); | ||
56 | extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 58 | extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
57 | extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 59 | extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
58 | extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 60 | extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
@@ -220,6 +222,7 @@ struct revmap_entry { | |||
220 | #define KVMPPC_GOT_PAGE 0x80 | 222 | #define KVMPPC_GOT_PAGE 0x80 |
221 | 223 | ||
222 | struct kvm_arch_memory_slot { | 224 | struct kvm_arch_memory_slot { |
225 | unsigned long *rmap; | ||
223 | }; | 226 | }; |
224 | 227 | ||
225 | struct kvm_arch { | 228 | struct kvm_arch { |
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 33aa715dab28..5dd3ab469976 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c | |||
@@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, | |||
319 | if (is_error_page(new_page)) { | 319 | if (is_error_page(new_page)) { |
320 | printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", | 320 | printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", |
321 | (unsigned long long)gfn); | 321 | (unsigned long long)gfn); |
322 | kvm_release_page_clean(new_page); | ||
323 | return; | 322 | return; |
324 | } | 323 | } |
325 | hpaddr = page_to_phys(new_page); | 324 | hpaddr = page_to_phys(new_page); |
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d03eb6f7b058..d95d11322a15 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c | |||
@@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
705 | goto out_unlock; | 705 | goto out_unlock; |
706 | hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; | 706 | hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; |
707 | 707 | ||
708 | rmap = &memslot->rmap[gfn - memslot->base_gfn]; | 708 | rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; |
709 | lock_rmap(rmap); | 709 | lock_rmap(rmap); |
710 | 710 | ||
711 | /* Check if we might have been invalidated; let the guest retry if so */ | 711 | /* Check if we might have been invalidated; let the guest retry if so */ |
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
756 | goto out_put; | 756 | goto out_put; |
757 | } | 757 | } |
758 | 758 | ||
759 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | 759 | static int kvm_handle_hva_range(struct kvm *kvm, |
760 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, | 760 | unsigned long start, |
761 | unsigned long gfn)) | 761 | unsigned long end, |
762 | int (*handler)(struct kvm *kvm, | ||
763 | unsigned long *rmapp, | ||
764 | unsigned long gfn)) | ||
762 | { | 765 | { |
763 | int ret; | 766 | int ret; |
764 | int retval = 0; | 767 | int retval = 0; |
@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
767 | 770 | ||
768 | slots = kvm_memslots(kvm); | 771 | slots = kvm_memslots(kvm); |
769 | kvm_for_each_memslot(memslot, slots) { | 772 | kvm_for_each_memslot(memslot, slots) { |
770 | unsigned long start = memslot->userspace_addr; | 773 | unsigned long hva_start, hva_end; |
771 | unsigned long end; | 774 | gfn_t gfn, gfn_end; |
772 | 775 | ||
773 | end = start + (memslot->npages << PAGE_SHIFT); | 776 | hva_start = max(start, memslot->userspace_addr); |
774 | if (hva >= start && hva < end) { | 777 | hva_end = min(end, memslot->userspace_addr + |
775 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 778 | (memslot->npages << PAGE_SHIFT)); |
779 | if (hva_start >= hva_end) | ||
780 | continue; | ||
781 | /* | ||
782 | * {gfn(page) | page intersects with [hva_start, hva_end)} = | ||
783 | * {gfn, gfn+1, ..., gfn_end-1}. | ||
784 | */ | ||
785 | gfn = hva_to_gfn_memslot(hva_start, memslot); | ||
786 | gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); | ||
787 | |||
788 | for (; gfn < gfn_end; ++gfn) { | ||
789 | gfn_t gfn_offset = gfn - memslot->base_gfn; | ||
776 | 790 | ||
777 | ret = handler(kvm, &memslot->rmap[gfn_offset], | 791 | ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); |
778 | memslot->base_gfn + gfn_offset); | ||
779 | retval |= ret; | 792 | retval |= ret; |
780 | } | 793 | } |
781 | } | 794 | } |
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
783 | return retval; | 796 | return retval; |
784 | } | 797 | } |
785 | 798 | ||
799 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | ||
800 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, | ||
801 | unsigned long gfn)) | ||
802 | { | ||
803 | return kvm_handle_hva_range(kvm, hva, hva + 1, handler); | ||
804 | } | ||
805 | |||
786 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | 806 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, |
787 | unsigned long gfn) | 807 | unsigned long gfn) |
788 | { | 808 | { |
@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) | |||
850 | return 0; | 870 | return 0; |
851 | } | 871 | } |
852 | 872 | ||
873 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) | ||
874 | { | ||
875 | if (kvm->arch.using_mmu_notifiers) | ||
876 | kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); | ||
877 | return 0; | ||
878 | } | ||
879 | |||
853 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | 880 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, |
854 | unsigned long gfn) | 881 | unsigned long gfn) |
855 | { | 882 | { |
@@ -1009,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) | |||
1009 | unsigned long *rmapp, *map; | 1036 | unsigned long *rmapp, *map; |
1010 | 1037 | ||
1011 | preempt_disable(); | 1038 | preempt_disable(); |
1012 | rmapp = memslot->rmap; | 1039 | rmapp = memslot->arch.rmap; |
1013 | map = memslot->dirty_bitmap; | 1040 | map = memslot->dirty_bitmap; |
1014 | for (i = 0; i < memslot->npages; ++i) { | 1041 | for (i = 0; i < memslot->npages; ++i) { |
1015 | if (kvm_test_clear_dirty(kvm, rmapp)) | 1042 | if (kvm_test_clear_dirty(kvm, rmapp)) |
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5c70d19494f9..fb0e821622d4 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c | |||
@@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, | |||
84 | if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) | 84 | if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) |
85 | return; | 85 | return; |
86 | 86 | ||
87 | rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); | 87 | rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); |
88 | lock_rmap(rmap); | 88 | lock_rmap(rmap); |
89 | 89 | ||
90 | head = *rmap & KVMPPC_RMAP_INDEX; | 90 | head = *rmap & KVMPPC_RMAP_INDEX; |
@@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, | |||
180 | if (!slot_is_aligned(memslot, psize)) | 180 | if (!slot_is_aligned(memslot, psize)) |
181 | return H_PARAMETER; | 181 | return H_PARAMETER; |
182 | slot_fn = gfn - memslot->base_gfn; | 182 | slot_fn = gfn - memslot->base_gfn; |
183 | rmap = &memslot->rmap[slot_fn]; | 183 | rmap = &memslot->arch.rmap[slot_fn]; |
184 | 184 | ||
185 | if (!kvm->arch.using_mmu_notifiers) { | 185 | if (!kvm->arch.using_mmu_notifiers) { |
186 | physp = kvm->arch.slot_phys[memslot->id]; | 186 | physp = kvm->arch.slot_phys[memslot->id]; |
@@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, | |||
197 | pa &= PAGE_MASK; | 197 | pa &= PAGE_MASK; |
198 | } else { | 198 | } else { |
199 | /* Translate to host virtual address */ | 199 | /* Translate to host virtual address */ |
200 | hva = gfn_to_hva_memslot(memslot, gfn); | 200 | hva = __gfn_to_hva_memslot(memslot, gfn); |
201 | 201 | ||
202 | /* Look up the Linux PTE for the backing page */ | 202 | /* Look up the Linux PTE for the backing page */ |
203 | pte_size = psize; | 203 | pte_size = psize; |
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a1baec340f7e..05c28f59f77f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c | |||
@@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) | |||
242 | int i; | 242 | int i; |
243 | 243 | ||
244 | hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); | 244 | hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); |
245 | if (is_error_page(hpage)) { | 245 | if (is_error_page(hpage)) |
246 | kvm_release_page_clean(hpage); | ||
247 | return; | 246 | return; |
248 | } | ||
249 | 247 | ||
250 | hpage_offset = pte->raddr & ~PAGE_MASK; | 248 | hpage_offset = pte->raddr & ~PAGE_MASK; |
251 | hpage_offset &= ~0xFFFULL; | 249 | hpage_offset &= ~0xFFFULL; |
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index a2b66717813d..ff38b664195d 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c | |||
@@ -520,11 +520,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
520 | 520 | ||
521 | if (likely(!pfnmap)) { | 521 | if (likely(!pfnmap)) { |
522 | unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); | 522 | unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); |
523 | pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); | 523 | pfn = gfn_to_pfn_memslot(slot, gfn); |
524 | if (is_error_pfn(pfn)) { | 524 | if (is_error_pfn(pfn)) { |
525 | printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", | 525 | printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", |
526 | (long)gfn); | 526 | (long)gfn); |
527 | kvm_release_pfn_clean(pfn); | ||
528 | return; | 527 | return; |
529 | } | 528 | } |
530 | 529 | ||
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 87f4dc886076..4d213b8b0fb5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
@@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
302 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | 302 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, |
303 | struct kvm_memory_slot *dont) | 303 | struct kvm_memory_slot *dont) |
304 | { | 304 | { |
305 | if (!dont || free->arch.rmap != dont->arch.rmap) { | ||
306 | vfree(free->arch.rmap); | ||
307 | free->arch.rmap = NULL; | ||
308 | } | ||
305 | } | 309 | } |
306 | 310 | ||
307 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | 311 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) |
308 | { | 312 | { |
313 | slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); | ||
314 | if (!slot->arch.rmap) | ||
315 | return -ENOMEM; | ||
316 | |||
309 | return 0; | 317 | return 0; |
310 | } | 318 | } |
311 | 319 | ||
@@ -326,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
326 | kvmppc_core_commit_memory_region(kvm, mem); | 334 | kvmppc_core_commit_memory_region(kvm, mem); |
327 | } | 335 | } |
328 | 336 | ||
337 | void kvm_arch_flush_shadow_all(struct kvm *kvm) | ||
338 | { | ||
339 | } | ||
329 | 340 | ||
330 | void kvm_arch_flush_shadow(struct kvm *kvm) | 341 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, |
342 | struct kvm_memory_slot *slot) | ||
331 | { | 343 | { |
332 | } | 344 | } |
333 | 345 | ||
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index f3e0aabfc6bc..56831dfa9198 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h | |||
@@ -159,6 +159,7 @@ extern unsigned long thread_saved_pc(struct task_struct *t); | |||
159 | 159 | ||
160 | extern void show_code(struct pt_regs *regs); | 160 | extern void show_code(struct pt_regs *regs); |
161 | extern void print_fn_code(unsigned char *code, unsigned long len); | 161 | extern void print_fn_code(unsigned char *code, unsigned long len); |
162 | extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]); | ||
162 | 163 | ||
163 | unsigned long get_wchan(struct task_struct *p); | 164 | unsigned long get_wchan(struct task_struct *p); |
164 | #define task_pt_regs(tsk) ((struct pt_regs *) \ | 165 | #define task_pt_regs(tsk) ((struct pt_regs *) \ |
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index cc84a24c023f..f00286bd2ef9 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c | |||
@@ -1501,6 +1501,33 @@ static struct insn *find_insn(unsigned char *code) | |||
1501 | return NULL; | 1501 | return NULL; |
1502 | } | 1502 | } |
1503 | 1503 | ||
1504 | /** | ||
1505 | * insn_to_mnemonic - decode an s390 instruction | ||
1506 | * @instruction: instruction to decode | ||
1507 | * @buf: buffer to fill with mnemonic | ||
1508 | * | ||
1509 | * Decode the instruction at @instruction and store the corresponding | ||
1510 | * mnemonic into @buf. | ||
1511 | * @buf is left unchanged if the instruction could not be decoded. | ||
1512 | * Returns: | ||
1513 | * %0 on success, %-ENOENT if the instruction was not found. | ||
1514 | */ | ||
1515 | int insn_to_mnemonic(unsigned char *instruction, char buf[8]) | ||
1516 | { | ||
1517 | struct insn *insn; | ||
1518 | |||
1519 | insn = find_insn(instruction); | ||
1520 | if (!insn) | ||
1521 | return -ENOENT; | ||
1522 | if (insn->name[0] == '\0') | ||
1523 | snprintf(buf, sizeof(buf), "%s", | ||
1524 | long_insn_name[(int) insn->name[1]]); | ||
1525 | else | ||
1526 | snprintf(buf, sizeof(buf), "%.5s", insn->name); | ||
1527 | return 0; | ||
1528 | } | ||
1529 | EXPORT_SYMBOL_GPL(insn_to_mnemonic); | ||
1530 | |||
1504 | static int print_insn(char *buffer, unsigned char *code, unsigned long addr) | 1531 | static int print_insn(char *buffer, unsigned char *code, unsigned long addr) |
1505 | { | 1532 | { |
1506 | struct insn *insn; | 1533 | struct insn *insn; |
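A hedged example of how a caller might use the newly exported insn_to_mnemonic() (kernel context; the caller and the instruction pointer are made up for illustration):

    /* Sketch only: report the mnemonic of an intercepted instruction. */
    static void report_insn(unsigned char *ip)
    {
            char mnemonic[8];

            if (insn_to_mnemonic(ip, mnemonic) == 0)
                    pr_debug("intercepted instruction: %s\n", mnemonic);
            else
                    pr_debug("unknown instruction at %p\n", ip);
    }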
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 9b04a32e5695..b58dd869cb32 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig | |||
@@ -21,6 +21,7 @@ config KVM | |||
21 | depends on HAVE_KVM && EXPERIMENTAL | 21 | depends on HAVE_KVM && EXPERIMENTAL |
22 | select PREEMPT_NOTIFIERS | 22 | select PREEMPT_NOTIFIERS |
23 | select ANON_INODES | 23 | select ANON_INODES |
24 | select HAVE_KVM_CPU_RELAX_INTERCEPT | ||
24 | ---help--- | 25 | ---help--- |
25 | Support hosting paravirtualized guest machines using the SIE | 26 | Support hosting paravirtualized guest machines using the SIE |
26 | virtualization capability on the mainframe. This should work | 27 | virtualization capability on the mainframe. This should work |
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index c88bb7793390..a390687feb13 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/kvm.h> | 14 | #include <linux/kvm.h> |
15 | #include <linux/kvm_host.h> | 15 | #include <linux/kvm_host.h> |
16 | #include "kvm-s390.h" | 16 | #include "kvm-s390.h" |
17 | #include "trace.h" | ||
18 | #include "trace-s390.h" | ||
17 | 19 | ||
18 | static int diag_release_pages(struct kvm_vcpu *vcpu) | 20 | static int diag_release_pages(struct kvm_vcpu *vcpu) |
19 | { | 21 | { |
@@ -98,6 +100,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) | |||
98 | vcpu->run->exit_reason = KVM_EXIT_S390_RESET; | 100 | vcpu->run->exit_reason = KVM_EXIT_S390_RESET; |
99 | VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx", | 101 | VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx", |
100 | vcpu->run->s390_reset_flags); | 102 | vcpu->run->s390_reset_flags); |
103 | trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags); | ||
101 | return -EREMOTE; | 104 | return -EREMOTE; |
102 | } | 105 | } |
103 | 106 | ||
@@ -105,6 +108,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) | |||
105 | { | 108 | { |
106 | int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; | 109 | int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; |
107 | 110 | ||
111 | trace_kvm_s390_handle_diag(vcpu, code); | ||
108 | switch (code) { | 112 | switch (code) { |
109 | case 0x10: | 113 | case 0x10: |
110 | return diag_release_pages(vcpu); | 114 | return diag_release_pages(vcpu); |
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index adae539f12e2..22798ec33fd1 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c | |||
@@ -19,6 +19,8 @@ | |||
19 | 19 | ||
20 | #include "kvm-s390.h" | 20 | #include "kvm-s390.h" |
21 | #include "gaccess.h" | 21 | #include "gaccess.h" |
22 | #include "trace.h" | ||
23 | #include "trace-s390.h" | ||
22 | 24 | ||
23 | static int handle_lctlg(struct kvm_vcpu *vcpu) | 25 | static int handle_lctlg(struct kvm_vcpu *vcpu) |
24 | { | 26 | { |
@@ -45,6 +47,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) | |||
45 | 47 | ||
46 | VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, | 48 | VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, |
47 | disp2); | 49 | disp2); |
50 | trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); | ||
48 | 51 | ||
49 | do { | 52 | do { |
50 | rc = get_guest_u64(vcpu, useraddr, | 53 | rc = get_guest_u64(vcpu, useraddr, |
@@ -82,6 +85,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) | |||
82 | 85 | ||
83 | VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, | 86 | VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, |
84 | disp2); | 87 | disp2); |
88 | trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); | ||
85 | 89 | ||
86 | reg = reg1; | 90 | reg = reg1; |
87 | do { | 91 | do { |
@@ -135,6 +139,8 @@ static int handle_stop(struct kvm_vcpu *vcpu) | |||
135 | vcpu->stat.exit_stop_request++; | 139 | vcpu->stat.exit_stop_request++; |
136 | spin_lock_bh(&vcpu->arch.local_int.lock); | 140 | spin_lock_bh(&vcpu->arch.local_int.lock); |
137 | 141 | ||
142 | trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); | ||
143 | |||
138 | if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { | 144 | if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { |
139 | vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; | 145 | vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; |
140 | rc = SIE_INTERCEPT_RERUNVCPU; | 146 | rc = SIE_INTERCEPT_RERUNVCPU; |
@@ -171,6 +177,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) | |||
171 | int rc; | 177 | int rc; |
172 | 178 | ||
173 | vcpu->stat.exit_validity++; | 179 | vcpu->stat.exit_validity++; |
180 | trace_kvm_s390_intercept_validity(vcpu, viwhy); | ||
174 | if (viwhy == 0x37) { | 181 | if (viwhy == 0x37) { |
175 | vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, | 182 | vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, |
176 | vcpu->arch.gmap); | 183 | vcpu->arch.gmap); |
@@ -213,6 +220,9 @@ static int handle_instruction(struct kvm_vcpu *vcpu) | |||
213 | intercept_handler_t handler; | 220 | intercept_handler_t handler; |
214 | 221 | ||
215 | vcpu->stat.exit_instruction++; | 222 | vcpu->stat.exit_instruction++; |
223 | trace_kvm_s390_intercept_instruction(vcpu, | ||
224 | vcpu->arch.sie_block->ipa, | ||
225 | vcpu->arch.sie_block->ipb); | ||
216 | handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8]; | 226 | handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8]; |
217 | if (handler) | 227 | if (handler) |
218 | return handler(vcpu); | 228 | return handler(vcpu); |
@@ -222,6 +232,7 @@ static int handle_instruction(struct kvm_vcpu *vcpu) | |||
222 | static int handle_prog(struct kvm_vcpu *vcpu) | 232 | static int handle_prog(struct kvm_vcpu *vcpu) |
223 | { | 233 | { |
224 | vcpu->stat.exit_program_interruption++; | 234 | vcpu->stat.exit_program_interruption++; |
235 | trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc); | ||
225 | return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc); | 236 | return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc); |
226 | } | 237 | } |
227 | 238 | ||
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index b7bc1aac8ed2..ff1e2f8ef94a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
20 | #include "kvm-s390.h" | 20 | #include "kvm-s390.h" |
21 | #include "gaccess.h" | 21 | #include "gaccess.h" |
22 | #include "trace-s390.h" | ||
22 | 23 | ||
23 | static int psw_extint_disabled(struct kvm_vcpu *vcpu) | 24 | static int psw_extint_disabled(struct kvm_vcpu *vcpu) |
24 | { | 25 | { |
@@ -130,6 +131,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
130 | case KVM_S390_INT_EMERGENCY: | 131 | case KVM_S390_INT_EMERGENCY: |
131 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); | 132 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); |
132 | vcpu->stat.deliver_emergency_signal++; | 133 | vcpu->stat.deliver_emergency_signal++; |
134 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
135 | inti->emerg.code, 0); | ||
133 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); | 136 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); |
134 | if (rc == -EFAULT) | 137 | if (rc == -EFAULT) |
135 | exception = 1; | 138 | exception = 1; |
@@ -152,6 +155,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
152 | case KVM_S390_INT_EXTERNAL_CALL: | 155 | case KVM_S390_INT_EXTERNAL_CALL: |
153 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); | 156 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); |
154 | vcpu->stat.deliver_external_call++; | 157 | vcpu->stat.deliver_external_call++; |
158 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
159 | inti->extcall.code, 0); | ||
155 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); | 160 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); |
156 | if (rc == -EFAULT) | 161 | if (rc == -EFAULT) |
157 | exception = 1; | 162 | exception = 1; |
@@ -175,6 +180,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
175 | VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", | 180 | VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", |
176 | inti->ext.ext_params); | 181 | inti->ext.ext_params); |
177 | vcpu->stat.deliver_service_signal++; | 182 | vcpu->stat.deliver_service_signal++; |
183 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
184 | inti->ext.ext_params, 0); | ||
178 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); | 185 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); |
179 | if (rc == -EFAULT) | 186 | if (rc == -EFAULT) |
180 | exception = 1; | 187 | exception = 1; |
@@ -198,6 +205,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
198 | VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", | 205 | VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", |
199 | inti->ext.ext_params, inti->ext.ext_params2); | 206 | inti->ext.ext_params, inti->ext.ext_params2); |
200 | vcpu->stat.deliver_virtio_interrupt++; | 207 | vcpu->stat.deliver_virtio_interrupt++; |
208 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
209 | inti->ext.ext_params, | ||
210 | inti->ext.ext_params2); | ||
201 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); | 211 | rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); |
202 | if (rc == -EFAULT) | 212 | if (rc == -EFAULT) |
203 | exception = 1; | 213 | exception = 1; |
@@ -229,6 +239,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
229 | case KVM_S390_SIGP_STOP: | 239 | case KVM_S390_SIGP_STOP: |
230 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); | 240 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); |
231 | vcpu->stat.deliver_stop_signal++; | 241 | vcpu->stat.deliver_stop_signal++; |
242 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
243 | 0, 0); | ||
232 | __set_intercept_indicator(vcpu, inti); | 244 | __set_intercept_indicator(vcpu, inti); |
233 | break; | 245 | break; |
234 | 246 | ||
@@ -236,12 +248,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
236 | VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", | 248 | VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", |
237 | inti->prefix.address); | 249 | inti->prefix.address); |
238 | vcpu->stat.deliver_prefix_signal++; | 250 | vcpu->stat.deliver_prefix_signal++; |
251 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
252 | inti->prefix.address, 0); | ||
239 | kvm_s390_set_prefix(vcpu, inti->prefix.address); | 253 | kvm_s390_set_prefix(vcpu, inti->prefix.address); |
240 | break; | 254 | break; |
241 | 255 | ||
242 | case KVM_S390_RESTART: | 256 | case KVM_S390_RESTART: |
243 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); | 257 | VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); |
244 | vcpu->stat.deliver_restart_signal++; | 258 | vcpu->stat.deliver_restart_signal++; |
259 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
260 | 0, 0); | ||
245 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, | 261 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, |
246 | restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); | 262 | restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); |
247 | if (rc == -EFAULT) | 263 | if (rc == -EFAULT) |
@@ -259,6 +275,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, | |||
259 | inti->pgm.code, | 275 | inti->pgm.code, |
260 | table[vcpu->arch.sie_block->ipa >> 14]); | 276 | table[vcpu->arch.sie_block->ipa >> 14]); |
261 | vcpu->stat.deliver_program_int++; | 277 | vcpu->stat.deliver_program_int++; |
278 | trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, | ||
279 | inti->pgm.code, 0); | ||
262 | rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); | 280 | rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); |
263 | if (rc == -EFAULT) | 281 | if (rc == -EFAULT) |
264 | exception = 1; | 282 | exception = 1; |
@@ -405,9 +423,7 @@ no_timer: | |||
405 | set_current_state(TASK_INTERRUPTIBLE); | 423 | set_current_state(TASK_INTERRUPTIBLE); |
406 | spin_unlock_bh(&vcpu->arch.local_int.lock); | 424 | spin_unlock_bh(&vcpu->arch.local_int.lock); |
407 | spin_unlock(&vcpu->arch.local_int.float_int->lock); | 425 | spin_unlock(&vcpu->arch.local_int.float_int->lock); |
408 | vcpu_put(vcpu); | ||
409 | schedule(); | 426 | schedule(); |
410 | vcpu_load(vcpu); | ||
411 | spin_lock(&vcpu->arch.local_int.float_int->lock); | 427 | spin_lock(&vcpu->arch.local_int.float_int->lock); |
412 | spin_lock_bh(&vcpu->arch.local_int.lock); | 428 | spin_lock_bh(&vcpu->arch.local_int.lock); |
413 | } | 429 | } |
@@ -515,6 +531,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) | |||
515 | inti->pgm.code = code; | 531 | inti->pgm.code = code; |
516 | 532 | ||
517 | VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); | 533 | VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); |
534 | trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1); | ||
518 | spin_lock_bh(&li->lock); | 535 | spin_lock_bh(&li->lock); |
519 | list_add(&inti->list, &li->list); | 536 | list_add(&inti->list, &li->list); |
520 | atomic_set(&li->active, 1); | 537 | atomic_set(&li->active, 1); |
@@ -556,6 +573,8 @@ int kvm_s390_inject_vm(struct kvm *kvm, | |||
556 | kfree(inti); | 573 | kfree(inti); |
557 | return -EINVAL; | 574 | return -EINVAL; |
558 | } | 575 | } |
576 | trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, | ||
577 | 2); | ||
559 | 578 | ||
560 | mutex_lock(&kvm->lock); | 579 | mutex_lock(&kvm->lock); |
561 | fi = &kvm->arch.float_int; | 580 | fi = &kvm->arch.float_int; |
@@ -621,6 +640,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, | |||
621 | kfree(inti); | 640 | kfree(inti); |
622 | return -EINVAL; | 641 | return -EINVAL; |
623 | } | 642 | } |
643 | trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm, | ||
644 | s390int->parm64, 2); | ||
624 | 645 | ||
625 | mutex_lock(&vcpu->kvm->lock); | 646 | mutex_lock(&vcpu->kvm->lock); |
626 | li = &vcpu->arch.local_int; | 647 | li = &vcpu->arch.local_int; |
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d470ccbfabae..ecced9d18986 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
@@ -32,6 +32,10 @@ | |||
32 | #include "kvm-s390.h" | 32 | #include "kvm-s390.h" |
33 | #include "gaccess.h" | 33 | #include "gaccess.h" |
34 | 34 | ||
35 | #define CREATE_TRACE_POINTS | ||
36 | #include "trace.h" | ||
37 | #include "trace-s390.h" | ||
38 | |||
35 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 39 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
36 | 40 | ||
37 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 41 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
@@ -242,6 +246,7 @@ out_err: | |||
242 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 246 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
243 | { | 247 | { |
244 | VCPU_EVENT(vcpu, 3, "%s", "free cpu"); | 248 | VCPU_EVENT(vcpu, 3, "%s", "free cpu"); |
249 | trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); | ||
245 | if (!kvm_is_ucontrol(vcpu->kvm)) { | 250 | if (!kvm_is_ucontrol(vcpu->kvm)) { |
246 | clear_bit(63 - vcpu->vcpu_id, | 251 | clear_bit(63 - vcpu->vcpu_id, |
247 | (unsigned long *) &vcpu->kvm->arch.sca->mcn); | 252 | (unsigned long *) &vcpu->kvm->arch.sca->mcn); |
@@ -417,6 +422,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | |||
417 | goto out_free_sie_block; | 422 | goto out_free_sie_block; |
418 | VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, | 423 | VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, |
419 | vcpu->arch.sie_block); | 424 | vcpu->arch.sie_block); |
425 | trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); | ||
420 | 426 | ||
421 | return vcpu; | 427 | return vcpu; |
422 | out_free_sie_block: | 428 | out_free_sie_block: |
@@ -607,18 +613,22 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
607 | local_irq_enable(); | 613 | local_irq_enable(); |
608 | VCPU_EVENT(vcpu, 6, "entering sie flags %x", | 614 | VCPU_EVENT(vcpu, 6, "entering sie flags %x", |
609 | atomic_read(&vcpu->arch.sie_block->cpuflags)); | 615 | atomic_read(&vcpu->arch.sie_block->cpuflags)); |
616 | trace_kvm_s390_sie_enter(vcpu, | ||
617 | atomic_read(&vcpu->arch.sie_block->cpuflags)); | ||
610 | rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); | 618 | rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); |
611 | if (rc) { | 619 | if (rc) { |
612 | if (kvm_is_ucontrol(vcpu->kvm)) { | 620 | if (kvm_is_ucontrol(vcpu->kvm)) { |
613 | rc = SIE_INTERCEPT_UCONTROL; | 621 | rc = SIE_INTERCEPT_UCONTROL; |
614 | } else { | 622 | } else { |
615 | VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); | 623 | VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); |
624 | trace_kvm_s390_sie_fault(vcpu); | ||
616 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); | 625 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); |
617 | rc = 0; | 626 | rc = 0; |
618 | } | 627 | } |
619 | } | 628 | } |
620 | VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", | 629 | VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", |
621 | vcpu->arch.sie_block->icptcode); | 630 | vcpu->arch.sie_block->icptcode); |
631 | trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); | ||
622 | local_irq_disable(); | 632 | local_irq_disable(); |
623 | kvm_guest_exit(); | 633 | kvm_guest_exit(); |
624 | local_irq_enable(); | 634 | local_irq_enable(); |
@@ -959,7 +969,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
959 | return; | 969 | return; |
960 | } | 970 | } |
961 | 971 | ||
962 | void kvm_arch_flush_shadow(struct kvm *kvm) | 972 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
973 | { | ||
974 | } | ||
975 | |||
976 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | ||
977 | struct kvm_memory_slot *slot) | ||
963 | { | 978 | { |
964 | } | 979 | } |
965 | 980 | ||
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 310be61bead7..d768906f15c8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <asm/sysinfo.h> | 20 | #include <asm/sysinfo.h> |
21 | #include "gaccess.h" | 21 | #include "gaccess.h" |
22 | #include "kvm-s390.h" | 22 | #include "kvm-s390.h" |
23 | #include "trace.h" | ||
23 | 24 | ||
24 | static int handle_set_prefix(struct kvm_vcpu *vcpu) | 25 | static int handle_set_prefix(struct kvm_vcpu *vcpu) |
25 | { | 26 | { |
@@ -59,6 +60,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) | |||
59 | kvm_s390_set_prefix(vcpu, address); | 60 | kvm_s390_set_prefix(vcpu, address); |
60 | 61 | ||
61 | VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); | 62 | VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); |
63 | trace_kvm_s390_handle_prefix(vcpu, 1, address); | ||
62 | out: | 64 | out: |
63 | return 0; | 65 | return 0; |
64 | } | 66 | } |
@@ -91,6 +93,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) | |||
91 | } | 93 | } |
92 | 94 | ||
93 | VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); | 95 | VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); |
96 | trace_kvm_s390_handle_prefix(vcpu, 0, address); | ||
94 | out: | 97 | out: |
95 | return 0; | 98 | return 0; |
96 | } | 99 | } |
@@ -119,6 +122,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) | |||
119 | } | 122 | } |
120 | 123 | ||
121 | VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); | 124 | VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); |
125 | trace_kvm_s390_handle_stap(vcpu, useraddr); | ||
122 | out: | 126 | out: |
123 | return 0; | 127 | return 0; |
124 | } | 128 | } |
@@ -164,9 +168,11 @@ static int handle_stfl(struct kvm_vcpu *vcpu) | |||
164 | &facility_list, sizeof(facility_list)); | 168 | &facility_list, sizeof(facility_list)); |
165 | if (rc == -EFAULT) | 169 | if (rc == -EFAULT) |
166 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); | 170 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); |
167 | else | 171 | else { |
168 | VCPU_EVENT(vcpu, 5, "store facility list value %x", | 172 | VCPU_EVENT(vcpu, 5, "store facility list value %x", |
169 | facility_list); | 173 | facility_list); |
174 | trace_kvm_s390_handle_stfl(vcpu, facility_list); | ||
175 | } | ||
170 | return 0; | 176 | return 0; |
171 | } | 177 | } |
172 | 178 | ||
@@ -278,6 +284,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) | |||
278 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); | 284 | kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); |
279 | goto out_mem; | 285 | goto out_mem; |
280 | } | 286 | } |
287 | trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); | ||
281 | free_page(mem); | 288 | free_page(mem); |
282 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); | 289 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); |
283 | vcpu->run->s.regs.gprs[0] = 0; | 290 | vcpu->run->s.regs.gprs[0] = 0; |
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 56f80e1f98f7..566ddf6e8dfb 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/sigp.h> | 18 | #include <asm/sigp.h> |
19 | #include "gaccess.h" | 19 | #include "gaccess.h" |
20 | #include "kvm-s390.h" | 20 | #include "kvm-s390.h" |
21 | #include "trace.h" | ||
21 | 22 | ||
22 | static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, | 23 | static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, |
23 | u64 *reg) | 24 | u64 *reg) |
@@ -344,6 +345,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) | |||
344 | else | 345 | else |
345 | parameter = vcpu->run->s.regs.gprs[r1 + 1]; | 346 | parameter = vcpu->run->s.regs.gprs[r1 + 1]; |
346 | 347 | ||
348 | trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter); | ||
347 | switch (order_code) { | 349 | switch (order_code) { |
348 | case SIGP_SENSE: | 350 | case SIGP_SENSE: |
349 | vcpu->stat.instruction_sigp_sense++; | 351 | vcpu->stat.instruction_sigp_sense++; |
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h new file mode 100644 index 000000000000..90fdf85b5ff7 --- /dev/null +++ b/arch/s390/kvm/trace-s390.h | |||
@@ -0,0 +1,210 @@ | |||
1 | #if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ) | ||
2 | #define _TRACE_KVMS390_H | ||
3 | |||
4 | #include <linux/tracepoint.h> | ||
5 | |||
6 | #undef TRACE_SYSTEM | ||
7 | #define TRACE_SYSTEM kvm-s390 | ||
8 | #define TRACE_INCLUDE_PATH . | ||
9 | #undef TRACE_INCLUDE_FILE | ||
10 | #define TRACE_INCLUDE_FILE trace-s390 | ||
11 | |||
12 | /* | ||
13 | * Trace point for the creation of the kvm instance. | ||
14 | */ | ||
15 | TRACE_EVENT(kvm_s390_create_vm, | ||
16 | TP_PROTO(unsigned long type), | ||
17 | TP_ARGS(type), | ||
18 | |||
19 | TP_STRUCT__entry( | ||
20 | __field(unsigned long, type) | ||
21 | ), | ||
22 | |||
23 | TP_fast_assign( | ||
24 | __entry->type = type; | ||
25 | ), | ||
26 | |||
27 | TP_printk("create vm%s", | ||
28 | __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "") | ||
29 | ); | ||
30 | |||
31 | /* | ||
32 | * Trace points for creation and destruction of vpcus. | ||
33 | */ | ||
34 | TRACE_EVENT(kvm_s390_create_vcpu, | ||
35 | TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu, | ||
36 | struct kvm_s390_sie_block *sie_block), | ||
37 | TP_ARGS(id, vcpu, sie_block), | ||
38 | |||
39 | TP_STRUCT__entry( | ||
40 | __field(unsigned int, id) | ||
41 | __field(struct kvm_vcpu *, vcpu) | ||
42 | __field(struct kvm_s390_sie_block *, sie_block) | ||
43 | ), | ||
44 | |||
45 | TP_fast_assign( | ||
46 | __entry->id = id; | ||
47 | __entry->vcpu = vcpu; | ||
48 | __entry->sie_block = sie_block; | ||
49 | ), | ||
50 | |||
51 | TP_printk("create cpu %d at %p, sie block at %p", __entry->id, | ||
52 | __entry->vcpu, __entry->sie_block) | ||
53 | ); | ||
54 | |||
55 | TRACE_EVENT(kvm_s390_destroy_vcpu, | ||
56 | TP_PROTO(unsigned int id), | ||
57 | TP_ARGS(id), | ||
58 | |||
59 | TP_STRUCT__entry( | ||
60 | __field(unsigned int, id) | ||
61 | ), | ||
62 | |||
63 | TP_fast_assign( | ||
64 | __entry->id = id; | ||
65 | ), | ||
66 | |||
67 | TP_printk("destroy cpu %d", __entry->id) | ||
68 | ); | ||
69 | |||
70 | /* | ||
71 | * Trace points for injection of interrupts, either per machine or | ||
72 | * per vcpu. | ||
73 | */ | ||
74 | |||
75 | #define kvm_s390_int_type \ | ||
76 | {KVM_S390_SIGP_STOP, "sigp stop"}, \ | ||
77 | {KVM_S390_PROGRAM_INT, "program interrupt"}, \ | ||
78 | {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ | ||
79 | {KVM_S390_RESTART, "sigp restart"}, \ | ||
80 | {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ | ||
81 | {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ | ||
82 | {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ | ||
83 | {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} | ||
84 | |||
85 | TRACE_EVENT(kvm_s390_inject_vm, | ||
86 | TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), | ||
87 | TP_ARGS(type, parm, parm64, who), | ||
88 | |||
89 | TP_STRUCT__entry( | ||
90 | __field(__u32, inttype) | ||
91 | __field(__u32, parm) | ||
92 | __field(__u64, parm64) | ||
93 | __field(int, who) | ||
94 | ), | ||
95 | |||
96 | TP_fast_assign( | ||
97 | __entry->inttype = type & 0x00000000ffffffff; | ||
98 | __entry->parm = parm; | ||
99 | __entry->parm64 = parm64; | ||
100 | __entry->who = who; | ||
101 | ), | ||
102 | |||
103 | TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", | ||
104 | (__entry->who == 1) ? " (from kernel)" : | ||
105 | (__entry->who == 2) ? " (from user)" : "", | ||
106 | __entry->inttype, | ||
107 | __print_symbolic(__entry->inttype, kvm_s390_int_type), | ||
108 | __entry->parm, __entry->parm64) | ||
109 | ); | ||
110 | |||
111 | TRACE_EVENT(kvm_s390_inject_vcpu, | ||
112 | TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \ | ||
113 | int who), | ||
114 | TP_ARGS(id, type, parm, parm64, who), | ||
115 | |||
116 | TP_STRUCT__entry( | ||
117 | __field(int, id) | ||
118 | __field(__u32, inttype) | ||
119 | __field(__u32, parm) | ||
120 | __field(__u64, parm64) | ||
121 | __field(int, who) | ||
122 | ), | ||
123 | |||
124 | TP_fast_assign( | ||
125 | __entry->id = id; | ||
126 | __entry->inttype = type & 0x00000000ffffffff; | ||
127 | __entry->parm = parm; | ||
128 | __entry->parm64 = parm64; | ||
129 | __entry->who = who; | ||
130 | ), | ||
131 | |||
132 | TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx", | ||
133 | (__entry->who == 1) ? " (from kernel)" : | ||
134 | (__entry->who == 2) ? " (from user)" : "", | ||
135 | __entry->id, __entry->inttype, | ||
136 | __print_symbolic(__entry->inttype, kvm_s390_int_type), | ||
137 | __entry->parm, __entry->parm64) | ||
138 | ); | ||
139 | |||
140 | /* | ||
141 | * Trace point for the actual delivery of interrupts. | ||
142 | */ | ||
143 | TRACE_EVENT(kvm_s390_deliver_interrupt, | ||
144 | TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1), | ||
145 | TP_ARGS(id, type, data0, data1), | ||
146 | |||
147 | TP_STRUCT__entry( | ||
148 | __field(int, id) | ||
149 | __field(__u32, inttype) | ||
150 | __field(__u32, data0) | ||
151 | __field(__u64, data1) | ||
152 | ), | ||
153 | |||
154 | TP_fast_assign( | ||
155 | __entry->id = id; | ||
156 | __entry->inttype = type & 0x00000000ffffffff; | ||
157 | __entry->data0 = data0; | ||
158 | __entry->data1 = data1; | ||
159 | ), | ||
160 | |||
161 | TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ | ||
162 | "data:%08x %016llx", | ||
163 | __entry->id, __entry->inttype, | ||
164 | __print_symbolic(__entry->inttype, kvm_s390_int_type), | ||
165 | __entry->data0, __entry->data1) | ||
166 | ); | ||
167 | |||
168 | /* | ||
169 | * Trace point for resets that may be requested from userspace. | ||
170 | */ | ||
171 | TRACE_EVENT(kvm_s390_request_resets, | ||
172 | TP_PROTO(__u64 resets), | ||
173 | TP_ARGS(resets), | ||
174 | |||
175 | TP_STRUCT__entry( | ||
176 | __field(__u64, resets) | ||
177 | ), | ||
178 | |||
179 | TP_fast_assign( | ||
180 | __entry->resets = resets; | ||
181 | ), | ||
182 | |||
183 | TP_printk("requesting userspace resets %llx", | ||
184 | __entry->resets) | ||
185 | ); | ||
186 | |||
187 | /* | ||
188 | * Trace point for a vcpu's stop requests. | ||
189 | */ | ||
190 | TRACE_EVENT(kvm_s390_stop_request, | ||
191 | TP_PROTO(unsigned int action_bits), | ||
192 | TP_ARGS(action_bits), | ||
193 | |||
194 | TP_STRUCT__entry( | ||
195 | __field(unsigned int, action_bits) | ||
196 | ), | ||
197 | |||
198 | TP_fast_assign( | ||
199 | __entry->action_bits = action_bits; | ||
200 | ), | ||
201 | |||
202 | TP_printk("stop request, action_bits = %08x", | ||
203 | __entry->action_bits) | ||
204 | ); | ||
205 | |||
206 | |||
207 | #endif /* _TRACE_KVMS390_H */ | ||
208 | |||
209 | /* This part must be outside protection */ | ||
210 | #include <trace/define_trace.h> | ||
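Each TRACE_EVENT() above expands into a trace_<name>() function that the s390 KVM code can call at the matching point. The wrapper below is only an illustrative sketch of such a call site; the trace_kvm_s390_create_vm()/trace_kvm_s390_create_vcpu() names are generated by the header, everything else is assumed for the example.

/* Illustrative call site for the tracepoints defined above; only the
 * trace_* names are real (generated by TRACE_EVENT), the wrapper and
 * its arguments' origin are a sketch. */
#include "trace-s390.h"

static void sketch_trace_vm_and_vcpu(struct kvm *kvm, unsigned long type,
				     struct kvm_vcpu *vcpu)
{
	trace_kvm_s390_create_vm(type);
	trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu,
				   vcpu->arch.sie_block);
}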
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h new file mode 100644 index 000000000000..2b29e62351d3 --- /dev/null +++ b/arch/s390/kvm/trace.h | |||
@@ -0,0 +1,341 @@ | |||
1 | #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) | ||
2 | #define _TRACE_KVM_H | ||
3 | |||
4 | #include <linux/tracepoint.h> | ||
5 | #include <asm/sigp.h> | ||
6 | #include <asm/debug.h> | ||
7 | |||
8 | #undef TRACE_SYSTEM | ||
9 | #define TRACE_SYSTEM kvm | ||
10 | #define TRACE_INCLUDE_PATH . | ||
11 | #undef TRACE_INCLUDE_FILE | ||
12 | #define TRACE_INCLUDE_FILE trace | ||
13 | |||
14 | /* | ||
15 | * Helpers for vcpu-specific tracepoints containing the same information | ||
16 | * as s390dbf VCPU_EVENTs. | ||
17 | */ | ||
18 | #define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu | ||
19 | #define VCPU_ARGS_COMMON vcpu | ||
20 | #define VCPU_FIELD_COMMON __field(int, id) \ | ||
21 | __field(unsigned long, pswmask) \ | ||
22 | __field(unsigned long, pswaddr) | ||
23 | #define VCPU_ASSIGN_COMMON do { \ | ||
24 | __entry->id = vcpu->vcpu_id; \ | ||
25 | __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \ | ||
26 | __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \ | ||
27 | } while (0); | ||
28 | #define VCPU_TP_PRINTK(p_str, p_args...) \ | ||
29 | TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ | ||
30 | __entry->pswmask, __entry->pswaddr, p_args) | ||
31 | |||
32 | /* | ||
33 | * Tracepoints for SIE entry and exit. | ||
34 | */ | ||
35 | TRACE_EVENT(kvm_s390_sie_enter, | ||
36 | TP_PROTO(VCPU_PROTO_COMMON, int cpuflags), | ||
37 | TP_ARGS(VCPU_ARGS_COMMON, cpuflags), | ||
38 | |||
39 | TP_STRUCT__entry( | ||
40 | VCPU_FIELD_COMMON | ||
41 | __field(int, cpuflags) | ||
42 | ), | ||
43 | |||
44 | TP_fast_assign( | ||
45 | VCPU_ASSIGN_COMMON | ||
46 | __entry->cpuflags = cpuflags; | ||
47 | ), | ||
48 | |||
49 | VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags) | ||
50 | ); | ||
51 | |||
52 | TRACE_EVENT(kvm_s390_sie_fault, | ||
53 | TP_PROTO(VCPU_PROTO_COMMON), | ||
54 | TP_ARGS(VCPU_ARGS_COMMON), | ||
55 | |||
56 | TP_STRUCT__entry( | ||
57 | VCPU_FIELD_COMMON | ||
58 | ), | ||
59 | |||
60 | TP_fast_assign( | ||
61 | VCPU_ASSIGN_COMMON | ||
62 | ), | ||
63 | |||
64 | VCPU_TP_PRINTK("%s", "fault in sie instruction") | ||
65 | ); | ||
66 | |||
67 | #define sie_intercept_code \ | ||
68 | {0x04, "Instruction"}, \ | ||
69 | {0x08, "Program interruption"}, \ | ||
70 | {0x0C, "Instruction and program interruption"}, \ | ||
71 | {0x10, "External request"}, \ | ||
72 | {0x14, "External interruption"}, \ | ||
73 | {0x18, "I/O request"}, \ | ||
74 | {0x1C, "Wait state"}, \ | ||
75 | {0x20, "Validity"}, \ | ||
76 | {0x28, "Stop request"} | ||
77 | |||
78 | TRACE_EVENT(kvm_s390_sie_exit, | ||
79 | TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode), | ||
80 | TP_ARGS(VCPU_ARGS_COMMON, icptcode), | ||
81 | |||
82 | TP_STRUCT__entry( | ||
83 | VCPU_FIELD_COMMON | ||
84 | __field(u8, icptcode) | ||
85 | ), | ||
86 | |||
87 | TP_fast_assign( | ||
88 | VCPU_ASSIGN_COMMON | ||
89 | __entry->icptcode = icptcode; | ||
90 | ), | ||
91 | |||
92 | VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode, | ||
93 | __print_symbolic(__entry->icptcode, | ||
94 | sie_intercept_code)) | ||
95 | ); | ||
96 | |||
97 | /* | ||
98 | * Trace point for intercepted instructions. | ||
99 | */ | ||
100 | TRACE_EVENT(kvm_s390_intercept_instruction, | ||
101 | TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), | ||
102 | TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), | ||
103 | |||
104 | TP_STRUCT__entry( | ||
105 | VCPU_FIELD_COMMON | ||
106 | __field(__u64, instruction) | ||
107 | __field(char, insn[8]) | ||
108 | ), | ||
109 | |||
110 | TP_fast_assign( | ||
111 | VCPU_ASSIGN_COMMON | ||
112 | __entry->instruction = ((__u64)ipa << 48) | | ||
113 | ((__u64)ipb << 16); | ||
114 | ), | ||
115 | |||
116 | VCPU_TP_PRINTK("intercepted instruction %016llx (%s)", | ||
117 | __entry->instruction, | ||
118 | insn_to_mnemonic((unsigned char *) | ||
119 | &__entry->instruction, | ||
120 | __entry->insn) ? | ||
121 | "unknown" : __entry->insn) | ||
122 | ); | ||
123 | |||
124 | /* | ||
125 | * Trace point for intercepted program interruptions. | ||
126 | */ | ||
127 | TRACE_EVENT(kvm_s390_intercept_prog, | ||
128 | TP_PROTO(VCPU_PROTO_COMMON, __u16 code), | ||
129 | TP_ARGS(VCPU_ARGS_COMMON, code), | ||
130 | |||
131 | TP_STRUCT__entry( | ||
132 | VCPU_FIELD_COMMON | ||
133 | __field(__u16, code) | ||
134 | ), | ||
135 | |||
136 | TP_fast_assign( | ||
137 | VCPU_ASSIGN_COMMON | ||
138 | __entry->code = code; | ||
139 | ), | ||
140 | |||
141 | VCPU_TP_PRINTK("intercepted program interruption %04x", | ||
142 | __entry->code) | ||
143 | ); | ||
144 | |||
145 | /* | ||
146 | * Trace point for validity intercepts. | ||
147 | */ | ||
148 | TRACE_EVENT(kvm_s390_intercept_validity, | ||
149 | TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy), | ||
150 | TP_ARGS(VCPU_ARGS_COMMON, viwhy), | ||
151 | |||
152 | TP_STRUCT__entry( | ||
153 | VCPU_FIELD_COMMON | ||
154 | __field(__u16, viwhy) | ||
155 | ), | ||
156 | |||
157 | TP_fast_assign( | ||
158 | VCPU_ASSIGN_COMMON | ||
159 | __entry->viwhy = viwhy; | ||
160 | ), | ||
161 | |||
162 | VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy) | ||
163 | ); | ||
164 | |||
165 | /* | ||
166 | * Trace points for instructions that are of special interest. | ||
167 | */ | ||
168 | |||
169 | #define sigp_order_codes \ | ||
170 | {SIGP_SENSE, "sense"}, \ | ||
171 | {SIGP_EXTERNAL_CALL, "external call"}, \ | ||
172 | {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \ | ||
173 | {SIGP_STOP, "stop"}, \ | ||
174 | {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \ | ||
175 | {SIGP_SET_ARCHITECTURE, "set architecture"}, \ | ||
176 | {SIGP_SET_PREFIX, "set prefix"}, \ | ||
177 | {SIGP_SENSE_RUNNING, "sense running"}, \ | ||
178 | {SIGP_RESTART, "restart"} | ||
179 | |||
180 | TRACE_EVENT(kvm_s390_handle_sigp, | ||
181 | TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \ | ||
182 | __u32 parameter), | ||
183 | TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter), | ||
184 | |||
185 | TP_STRUCT__entry( | ||
186 | VCPU_FIELD_COMMON | ||
187 | __field(__u8, order_code) | ||
188 | __field(__u16, cpu_addr) | ||
189 | __field(__u32, parameter) | ||
190 | ), | ||
191 | |||
192 | TP_fast_assign( | ||
193 | VCPU_ASSIGN_COMMON | ||
194 | __entry->order_code = order_code; | ||
195 | __entry->cpu_addr = cpu_addr; | ||
196 | __entry->parameter = parameter; | ||
197 | ), | ||
198 | |||
199 | VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \ | ||
200 | "parameter %08x", __entry->order_code, | ||
201 | __print_symbolic(__entry->order_code, | ||
202 | sigp_order_codes), | ||
203 | __entry->cpu_addr, __entry->parameter) | ||
204 | ); | ||
205 | |||
206 | #define diagnose_codes \ | ||
207 | {0x10, "release pages"}, \ | ||
208 | {0x44, "time slice end"}, \ | ||
209 | {0x308, "ipl functions"}, \ | ||
210 | {0x500, "kvm hypercall"}, \ | ||
211 | {0x501, "kvm breakpoint"} | ||
212 | |||
213 | TRACE_EVENT(kvm_s390_handle_diag, | ||
214 | TP_PROTO(VCPU_PROTO_COMMON, __u16 code), | ||
215 | TP_ARGS(VCPU_ARGS_COMMON, code), | ||
216 | |||
217 | TP_STRUCT__entry( | ||
218 | VCPU_FIELD_COMMON | ||
219 | __field(__u16, code) | ||
220 | ), | ||
221 | |||
222 | TP_fast_assign( | ||
223 | VCPU_ASSIGN_COMMON | ||
224 | __entry->code = code; | ||
225 | ), | ||
226 | |||
227 | VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code, | ||
228 | __print_symbolic(__entry->code, diagnose_codes)) | ||
229 | ); | ||
230 | |||
231 | TRACE_EVENT(kvm_s390_handle_lctl, | ||
232 | TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr), | ||
233 | TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr), | ||
234 | |||
235 | TP_STRUCT__entry( | ||
236 | VCPU_FIELD_COMMON | ||
237 | __field(int, g) | ||
238 | __field(int, reg1) | ||
239 | __field(int, reg3) | ||
240 | __field(u64, addr) | ||
241 | ), | ||
242 | |||
243 | TP_fast_assign( | ||
244 | VCPU_ASSIGN_COMMON | ||
245 | __entry->g = g; | ||
246 | __entry->reg1 = reg1; | ||
247 | __entry->reg3 = reg3; | ||
248 | __entry->addr = addr; | ||
249 | ), | ||
250 | |||
251 | VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx", | ||
252 | __entry->g ? "lctlg" : "lctl", | ||
253 | __entry->reg1, __entry->reg3, __entry->addr) | ||
254 | ); | ||
255 | |||
256 | TRACE_EVENT(kvm_s390_handle_prefix, | ||
257 | TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address), | ||
258 | TP_ARGS(VCPU_ARGS_COMMON, set, address), | ||
259 | |||
260 | TP_STRUCT__entry( | ||
261 | VCPU_FIELD_COMMON | ||
262 | __field(int, set) | ||
263 | __field(u32, address) | ||
264 | ), | ||
265 | |||
266 | TP_fast_assign( | ||
267 | VCPU_ASSIGN_COMMON | ||
268 | __entry->set = set; | ||
269 | __entry->address = address; | ||
270 | ), | ||
271 | |||
272 | VCPU_TP_PRINTK("%s prefix to %08x", | ||
273 | __entry->set ? "setting" : "storing", | ||
274 | __entry->address) | ||
275 | ); | ||
276 | |||
277 | TRACE_EVENT(kvm_s390_handle_stap, | ||
278 | TP_PROTO(VCPU_PROTO_COMMON, u64 address), | ||
279 | TP_ARGS(VCPU_ARGS_COMMON, address), | ||
280 | |||
281 | TP_STRUCT__entry( | ||
282 | VCPU_FIELD_COMMON | ||
283 | __field(u64, address) | ||
284 | ), | ||
285 | |||
286 | TP_fast_assign( | ||
287 | VCPU_ASSIGN_COMMON | ||
288 | __entry->address = address; | ||
289 | ), | ||
290 | |||
291 | VCPU_TP_PRINTK("storing cpu address to %016llx", | ||
292 | __entry->address) | ||
293 | ); | ||
294 | |||
295 | TRACE_EVENT(kvm_s390_handle_stfl, | ||
296 | TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list), | ||
297 | TP_ARGS(VCPU_ARGS_COMMON, facility_list), | ||
298 | |||
299 | TP_STRUCT__entry( | ||
300 | VCPU_FIELD_COMMON | ||
301 | __field(unsigned int, facility_list) | ||
302 | ), | ||
303 | |||
304 | TP_fast_assign( | ||
305 | VCPU_ASSIGN_COMMON | ||
306 | __entry->facility_list = facility_list; | ||
307 | ), | ||
308 | |||
309 | VCPU_TP_PRINTK("store facility list value %08x", | ||
310 | __entry->facility_list) | ||
311 | ); | ||
312 | |||
313 | TRACE_EVENT(kvm_s390_handle_stsi, | ||
314 | TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr), | ||
315 | TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr), | ||
316 | |||
317 | TP_STRUCT__entry( | ||
318 | VCPU_FIELD_COMMON | ||
319 | __field(int, fc) | ||
320 | __field(int, sel1) | ||
321 | __field(int, sel2) | ||
322 | __field(u64, addr) | ||
323 | ), | ||
324 | |||
325 | TP_fast_assign( | ||
326 | VCPU_ASSIGN_COMMON | ||
327 | __entry->fc = fc; | ||
328 | __entry->sel1 = sel1; | ||
329 | __entry->sel2 = sel2; | ||
330 | __entry->addr = addr; | ||
331 | ), | ||
332 | |||
333 | VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx", | ||
334 | __entry->fc, __entry->sel1, __entry->sel2, | ||
335 | __entry->addr) | ||
336 | ); | ||
337 | |||
338 | #endif /* _TRACE_KVM_H */ | ||
339 | |||
340 | /* This part must be outside protection */ | ||
341 | #include <trace/define_trace.h> | ||
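With TRACE_INCLUDE_PATH set to ".", these two headers are instantiated exactly once by the compilation unit that defines CREATE_TRACE_POINTS before including them (in this series that is expected to be the main s390 KVM file); every other file includes them only for the trace_*() declarations. A minimal sketch of that pattern:

/* Sketch: one .c file emits the tracepoint bodies; TRACE_INCLUDE_PATH "."
 * additionally requires the header directory on the include search path
 * in the kbuild file (e.g. via ccflags-y, exact flag is an assumption). */
#define CREATE_TRACE_POINTS
#include "trace.h"
#include "trace-s390.h"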
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7f9a395c5254..b72777ff32a9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -586,23 +586,18 @@ config PARAVIRT_TIME_ACCOUNTING | |||
586 | 586 | ||
587 | source "arch/x86/xen/Kconfig" | 587 | source "arch/x86/xen/Kconfig" |
588 | 588 | ||
589 | config KVM_CLOCK | ||
590 | bool "KVM paravirtualized clock" | ||
591 | select PARAVIRT | ||
592 | select PARAVIRT_CLOCK | ||
593 | ---help--- | ||
594 | Turning on this option will allow you to run a paravirtualized clock | ||
595 | when running over the KVM hypervisor. Instead of relying on a PIT | ||
596 | (or probably other) emulation by the underlying device model, the host | ||
597 | provides the guest with timing infrastructure such as time of day, and | ||
598 | system time | ||
599 | |||
600 | config KVM_GUEST | 589 | config KVM_GUEST |
601 | bool "KVM Guest support" | 590 | bool "KVM Guest support (including kvmclock)" |
591 | select PARAVIRT | ||
602 | select PARAVIRT | 592 | select PARAVIRT |
593 | select PARAVIRT_CLOCK | ||
594 | default y if PARAVIRT_GUEST | ||
603 | ---help--- | 595 | ---help--- |
604 | This option enables various optimizations for running under the KVM | 596 | This option enables various optimizations for running under the KVM |
605 | hypervisor. | 597 | hypervisor. It includes a paravirtualized clock, so that instead |
598 | of relying on a PIT (or probably other) emulation by the | ||
599 | underlying device model, the host provides the guest with | ||
600 | timing infrastructure such as time of day, and system time. | ||
606 | 601 | ||
607 | source "arch/x86/lguest/Kconfig" | 602 | source "arch/x86/lguest/Kconfig" |
608 | 603 | ||
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 41e08cb6a092..a65ec29e6ffb 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
@@ -41,6 +41,7 @@ | |||
41 | #define __KVM_HAVE_DEBUGREGS | 41 | #define __KVM_HAVE_DEBUGREGS |
42 | #define __KVM_HAVE_XSAVE | 42 | #define __KVM_HAVE_XSAVE |
43 | #define __KVM_HAVE_XCRS | 43 | #define __KVM_HAVE_XCRS |
44 | #define __KVM_HAVE_READONLY_MEM | ||
44 | 45 | ||
45 | /* Architectural interrupt line count. */ | 46 | /* Architectural interrupt line count. */ |
46 | #define KVM_NR_INTERRUPTS 256 | 47 | #define KVM_NR_INTERRUPTS 256 |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c764f43b71c5..15f960c06ff7 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -86,6 +86,19 @@ struct x86_instruction_info { | |||
86 | 86 | ||
87 | struct x86_emulate_ops { | 87 | struct x86_emulate_ops { |
88 | /* | 88 | /* |
89 | * read_gpr: read a general purpose register (rax - r15) | ||
90 | * | ||
91 | * @reg: gpr number. | ||
92 | */ | ||
93 | ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); | ||
94 | /* | ||
95 | * write_gpr: write a general purpose register (rax - r15) | ||
96 | * | ||
97 | * @reg: gpr number. | ||
98 | * @val: value to write. | ||
99 | */ | ||
100 | void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val); | ||
101 | /* | ||
89 | * read_std: Read bytes of standard (non-emulated/special) memory. | 102 | * read_std: Read bytes of standard (non-emulated/special) memory. |
90 | * Used for descriptor reading. | 103 | * Used for descriptor reading. |
91 | * @addr: [IN ] Linear address from which to read. | 104 | * @addr: [IN ] Linear address from which to read. |
@@ -200,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; | |||
200 | 213 | ||
201 | /* Type, address-of, and value of an instruction's operand. */ | 214 | /* Type, address-of, and value of an instruction's operand. */ |
202 | struct operand { | 215 | struct operand { |
203 | enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; | 216 | enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; |
204 | unsigned int bytes; | 217 | unsigned int bytes; |
218 | unsigned int count; | ||
205 | union { | 219 | union { |
206 | unsigned long orig_val; | 220 | unsigned long orig_val; |
207 | u64 orig_val64; | 221 | u64 orig_val64; |
@@ -221,6 +235,7 @@ struct operand { | |||
221 | char valptr[sizeof(unsigned long) + 2]; | 235 | char valptr[sizeof(unsigned long) + 2]; |
222 | sse128_t vec_val; | 236 | sse128_t vec_val; |
223 | u64 mm_val; | 237 | u64 mm_val; |
238 | void *data; | ||
224 | }; | 239 | }; |
225 | }; | 240 | }; |
226 | 241 | ||
@@ -236,14 +251,23 @@ struct read_cache { | |||
236 | unsigned long end; | 251 | unsigned long end; |
237 | }; | 252 | }; |
238 | 253 | ||
254 | /* Execution mode, passed to the emulator. */ | ||
255 | enum x86emul_mode { | ||
256 | X86EMUL_MODE_REAL, /* Real mode. */ | ||
257 | X86EMUL_MODE_VM86, /* Virtual 8086 mode. */ | ||
258 | X86EMUL_MODE_PROT16, /* 16-bit protected mode. */ | ||
259 | X86EMUL_MODE_PROT32, /* 32-bit protected mode. */ | ||
260 | X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ | ||
261 | }; | ||
262 | |||
239 | struct x86_emulate_ctxt { | 263 | struct x86_emulate_ctxt { |
240 | struct x86_emulate_ops *ops; | 264 | const struct x86_emulate_ops *ops; |
241 | 265 | ||
242 | /* Register state before/after emulation. */ | 266 | /* Register state before/after emulation. */ |
243 | unsigned long eflags; | 267 | unsigned long eflags; |
244 | unsigned long eip; /* eip before instruction emulation */ | 268 | unsigned long eip; /* eip before instruction emulation */ |
245 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | 269 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ |
246 | int mode; | 270 | enum x86emul_mode mode; |
247 | 271 | ||
248 | /* interruptibility state, as a result of execution of STI or MOV SS */ | 272 | /* interruptibility state, as a result of execution of STI or MOV SS */ |
249 | int interruptibility; | 273 | int interruptibility; |
@@ -281,8 +305,10 @@ struct x86_emulate_ctxt { | |||
281 | bool rip_relative; | 305 | bool rip_relative; |
282 | unsigned long _eip; | 306 | unsigned long _eip; |
283 | struct operand memop; | 307 | struct operand memop; |
308 | u32 regs_valid; /* bitmap of registers in _regs[] that can be read | ||
309 | u32 regs_dirty; /* bitmap of registers in _regs[] that have been written | ||
284 | /* Fields above regs are cleared together. */ | 310 | /* Fields above regs are cleared together. */ |
285 | unsigned long regs[NR_VCPU_REGS]; | 311 | unsigned long _regs[NR_VCPU_REGS]; |
286 | struct operand *memopp; | 312 | struct operand *memopp; |
287 | struct fetch_cache fetch; | 313 | struct fetch_cache fetch; |
288 | struct read_cache io_read; | 314 | struct read_cache io_read; |
@@ -293,17 +319,6 @@ struct x86_emulate_ctxt { | |||
293 | #define REPE_PREFIX 0xf3 | 319 | #define REPE_PREFIX 0xf3 |
294 | #define REPNE_PREFIX 0xf2 | 320 | #define REPNE_PREFIX 0xf2 |
295 | 321 | ||
296 | /* Execution mode, passed to the emulator. */ | ||
297 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | ||
298 | #define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ | ||
299 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | ||
300 | #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ | ||
301 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ | ||
302 | |||
303 | /* any protected mode */ | ||
304 | #define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ | ||
305 | X86EMUL_MODE_PROT64) | ||
306 | |||
307 | /* CPUID vendors */ | 322 | /* CPUID vendors */ |
308 | #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 | 323 | #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 |
309 | #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 | 324 | #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 |
@@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
394 | u16 tss_selector, int idt_index, int reason, | 409 | u16 tss_selector, int idt_index, int reason, |
395 | bool has_error_code, u32 error_code); | 410 | bool has_error_code, u32 error_code); |
396 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); | 411 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); |
412 | void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); | ||
413 | void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); | ||
414 | |||
397 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ | 415 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ |
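Two of the header changes above are worth spelling out: the emulator now pulls guest registers through the new read_gpr/write_gpr callbacks instead of poking a shared regs[] array, and the execution mode becomes a proper enum. Because the enum values are ordered (real, vm86, then the protected modes), the removed X86EMUL_MODE_PROT bit mask can be replaced by an ordered comparison; a hedged sketch relying only on the enum definition above:

/* Sketch only: an "any protected mode" test no longer needs the removed
 * X86EMUL_MODE_PROT mask because real/vm86 sort below PROT16. */
static inline bool emul_mode_is_protected(enum x86emul_mode mode)
{
	return mode >= X86EMUL_MODE_PROT16;
}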
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1eaa6b056670..b2e11f452435 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -271,10 +271,24 @@ struct kvm_mmu { | |||
271 | union kvm_mmu_page_role base_role; | 271 | union kvm_mmu_page_role base_role; |
272 | bool direct_map; | 272 | bool direct_map; |
273 | 273 | ||
274 | /* | ||
275 | * Bitmap; bit set = permission fault | ||
276 | * Byte index: page fault error code [4:1] | ||
277 | * Bit index: pte permissions in ACC_* format | ||
278 | */ | ||
279 | u8 permissions[16]; | ||
280 | |||
274 | u64 *pae_root; | 281 | u64 *pae_root; |
275 | u64 *lm_root; | 282 | u64 *lm_root; |
276 | u64 rsvd_bits_mask[2][4]; | 283 | u64 rsvd_bits_mask[2][4]; |
277 | 284 | ||
285 | /* | ||
286 | * Bitmap: bit set = last pte in walk | ||
287 | * index[0:1]: level (zero-based) | ||
288 | * index[2]: pte.ps | ||
289 | */ | ||
290 | u8 last_pte_bitmap; | ||
291 | |||
278 | bool nx; | 292 | bool nx; |
279 | 293 | ||
280 | u64 pdptrs[4]; /* pae */ | 294 | u64 pdptrs[4]; /* pae */ |
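The two bitmaps added to struct kvm_mmu turn the pte permission check and the last-gpte test into precomputed table lookups. For permissions[], the comment above fixes the indexing (byte = page fault error code bits 4:1, bit = pte permissions in ACC_* format), so the fast-path test reduces to a shift and mask; a sketch under exactly those assumptions, with an illustrative helper name:

/* Sketch of the lookup implied by the permissions[] comment above:
 * pfec carries the page-fault error code, pte_access the ACC_* bits. */
static inline bool sketch_permission_fault(struct kvm_mmu *mmu,
					   unsigned pte_access, unsigned pfec)
{
	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}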
@@ -398,12 +412,15 @@ struct kvm_vcpu_arch { | |||
398 | struct x86_emulate_ctxt emulate_ctxt; | 412 | struct x86_emulate_ctxt emulate_ctxt; |
399 | bool emulate_regs_need_sync_to_vcpu; | 413 | bool emulate_regs_need_sync_to_vcpu; |
400 | bool emulate_regs_need_sync_from_vcpu; | 414 | bool emulate_regs_need_sync_from_vcpu; |
415 | int (*complete_userspace_io)(struct kvm_vcpu *vcpu); | ||
401 | 416 | ||
402 | gpa_t time; | 417 | gpa_t time; |
403 | struct pvclock_vcpu_time_info hv_clock; | 418 | struct pvclock_vcpu_time_info hv_clock; |
404 | unsigned int hw_tsc_khz; | 419 | unsigned int hw_tsc_khz; |
405 | unsigned int time_offset; | 420 | unsigned int time_offset; |
406 | struct page *time_page; | 421 | struct page *time_page; |
422 | /* set guest stopped flag in pvclock flags field */ | ||
423 | bool pvclock_set_guest_stopped_request; | ||
407 | 424 | ||
408 | struct { | 425 | struct { |
409 | u64 msr_val; | 426 | u64 msr_val; |
@@ -438,6 +455,7 @@ struct kvm_vcpu_arch { | |||
438 | unsigned long dr6; | 455 | unsigned long dr6; |
439 | unsigned long dr7; | 456 | unsigned long dr7; |
440 | unsigned long eff_db[KVM_NR_DB_REGS]; | 457 | unsigned long eff_db[KVM_NR_DB_REGS]; |
458 | unsigned long guest_debug_dr7; | ||
441 | 459 | ||
442 | u64 mcg_cap; | 460 | u64 mcg_cap; |
443 | u64 mcg_status; | 461 | u64 mcg_status; |
@@ -484,14 +502,24 @@ struct kvm_vcpu_arch { | |||
484 | }; | 502 | }; |
485 | 503 | ||
486 | struct kvm_lpage_info { | 504 | struct kvm_lpage_info { |
487 | unsigned long rmap_pde; | ||
488 | int write_count; | 505 | int write_count; |
489 | }; | 506 | }; |
490 | 507 | ||
491 | struct kvm_arch_memory_slot { | 508 | struct kvm_arch_memory_slot { |
509 | unsigned long *rmap[KVM_NR_PAGE_SIZES]; | ||
492 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; | 510 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; |
493 | }; | 511 | }; |
494 | 512 | ||
513 | struct kvm_apic_map { | ||
514 | struct rcu_head rcu; | ||
515 | u8 ldr_bits; | ||
516 | /* fields below are used to decode ldr values in different modes */ | ||
517 | u32 cid_shift, cid_mask, lid_mask; | ||
518 | struct kvm_lapic *phys_map[256]; | ||
519 | /* first index is cluster id, second is cpu id in a cluster */ | ||
520 | struct kvm_lapic *logical_map[16][16]; | ||
521 | }; | ||
522 | |||
495 | struct kvm_arch { | 523 | struct kvm_arch { |
496 | unsigned int n_used_mmu_pages; | 524 | unsigned int n_used_mmu_pages; |
497 | unsigned int n_requested_mmu_pages; | 525 | unsigned int n_requested_mmu_pages; |
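The new kvm_apic_map is the data structure behind the faster APIC interrupt delivery: apic_map_lock only serializes rebuilds of the map, while delivery code reads it under RCU and indexes phys_map[] directly (or logical_map[][] after decoding the LDR with ldr_bits/cid_shift/cid_mask/lid_mask). A minimal read-side sketch with an illustrative helper name:

/* Sketch of the intended RCU read side for the map above; readers never
 * take apic_map_lock, they only dereference the published map. */
static struct kvm_lapic *sketch_lookup_dest(struct kvm *kvm, u8 dest_id)
{
	struct kvm_apic_map *map;
	struct kvm_lapic *apic = NULL;

	rcu_read_lock();
	map = rcu_dereference(kvm->arch.apic_map);
	if (map)
		apic = map->phys_map[dest_id];
	rcu_read_unlock();

	return apic;
}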
@@ -509,6 +537,8 @@ struct kvm_arch { | |||
509 | struct kvm_ioapic *vioapic; | 537 | struct kvm_ioapic *vioapic; |
510 | struct kvm_pit *vpit; | 538 | struct kvm_pit *vpit; |
511 | int vapics_in_nmi_mode; | 539 | int vapics_in_nmi_mode; |
540 | struct mutex apic_map_lock; | ||
541 | struct kvm_apic_map *apic_map; | ||
512 | 542 | ||
513 | unsigned int tss_addr; | 543 | unsigned int tss_addr; |
514 | struct page *apic_access_page; | 544 | struct page *apic_access_page; |
@@ -602,8 +632,7 @@ struct kvm_x86_ops { | |||
602 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); | 632 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); |
603 | void (*vcpu_put)(struct kvm_vcpu *vcpu); | 633 | void (*vcpu_put)(struct kvm_vcpu *vcpu); |
604 | 634 | ||
605 | void (*set_guest_debug)(struct kvm_vcpu *vcpu, | 635 | void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); |
606 | struct kvm_guest_debug *dbg); | ||
607 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); | 636 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); |
608 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | 637 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); |
609 | u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); | 638 | u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); |
@@ -941,6 +970,7 @@ extern bool kvm_rebooting; | |||
941 | 970 | ||
942 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 971 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
943 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 972 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
973 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); | ||
944 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 974 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
945 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 975 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
946 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 976 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 2f7712e08b1e..eb3e9d85e1f1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -102,21 +102,21 @@ struct kvm_vcpu_pv_apf_data { | |||
102 | extern void kvmclock_init(void); | 102 | extern void kvmclock_init(void); |
103 | extern int kvm_register_clock(char *txt); | 103 | extern int kvm_register_clock(char *txt); |
104 | 104 | ||
105 | #ifdef CONFIG_KVM_CLOCK | 105 | #ifdef CONFIG_KVM_GUEST |
106 | bool kvm_check_and_clear_guest_paused(void); | 106 | bool kvm_check_and_clear_guest_paused(void); |
107 | #else | 107 | #else |
108 | static inline bool kvm_check_and_clear_guest_paused(void) | 108 | static inline bool kvm_check_and_clear_guest_paused(void) |
109 | { | 109 | { |
110 | return false; | 110 | return false; |
111 | } | 111 | } |
112 | #endif /* CONFIG_KVMCLOCK */ | 112 | #endif /* CONFIG_KVM_GUEST */ |
113 | 113 | ||
114 | /* This instruction is vmcall. On non-VT architectures, it will generate a | 114 | /* This instruction is vmcall. On non-VT architectures, it will generate a |
115 | * trap that we will then rewrite to the appropriate instruction. | 115 | * trap that we will then rewrite to the appropriate instruction. |
116 | */ | 116 | */ |
117 | #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" | 117 | #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" |
118 | 118 | ||
119 | /* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun | 119 | /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall |
120 | * instruction. The hypervisor may replace it with something else but only the | 120 | * instruction. The hypervisor may replace it with something else but only the |
121 | * instructions are guaranteed to be supported. | 121 | * instructions are guaranteed to be supported. |
122 | * | 122 | * |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8d7a619718b5..a48ea05157d3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o | |||
81 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | 81 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o |
82 | obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o | 82 | obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o |
83 | 83 | ||
84 | obj-$(CONFIG_KVM_GUEST) += kvm.o | 84 | obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o |
85 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | ||
86 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 85 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
87 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o | 86 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o |
88 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | 87 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c1d61ee4b4f1..b3e5e51bc907 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused) | |||
354 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | 354 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
355 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | 355 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); |
356 | kvm_pv_disable_apf(); | 356 | kvm_pv_disable_apf(); |
357 | kvm_disable_steal_time(); | ||
357 | } | 358 | } |
358 | 359 | ||
359 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | 360 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void) | |||
396 | #ifdef CONFIG_SMP | 397 | #ifdef CONFIG_SMP |
397 | static void __init kvm_smp_prepare_boot_cpu(void) | 398 | static void __init kvm_smp_prepare_boot_cpu(void) |
398 | { | 399 | { |
399 | #ifdef CONFIG_KVM_CLOCK | ||
400 | WARN_ON(kvm_register_clock("primary cpu clock")); | 400 | WARN_ON(kvm_register_clock("primary cpu clock")); |
401 | #endif | ||
402 | kvm_guest_cpu_init(); | 401 | kvm_guest_cpu_init(); |
403 | native_smp_prepare_boot_cpu(); | 402 | native_smp_prepare_boot_cpu(); |
404 | } | 403 | } |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4f165479c453..d609be046b57 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p) | |||
957 | initmem_init(); | 957 | initmem_init(); |
958 | memblock_find_dma_reserve(); | 958 | memblock_find_dma_reserve(); |
959 | 959 | ||
960 | #ifdef CONFIG_KVM_CLOCK | 960 | #ifdef CONFIG_KVM_GUEST |
961 | kvmclock_init(); | 961 | kvmclock_init(); |
962 | #endif | 962 | #endif |
963 | 963 | ||
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a28f338843ea..586f00059805 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -20,6 +20,7 @@ if VIRTUALIZATION | |||
20 | config KVM | 20 | config KVM |
21 | tristate "Kernel-based Virtual Machine (KVM) support" | 21 | tristate "Kernel-based Virtual Machine (KVM) support" |
22 | depends on HAVE_KVM | 22 | depends on HAVE_KVM |
23 | depends on HIGH_RES_TIMERS | ||
23 | # for device assignment: | 24 | # for device assignment: |
24 | depends on PCI | 25 | depends on PCI |
25 | # for TASKSTATS/TASK_DELAY_ACCT: | 26 | # for TASKSTATS/TASK_DELAY_ACCT: |
@@ -37,6 +38,7 @@ config KVM | |||
37 | select TASK_DELAY_ACCT | 38 | select TASK_DELAY_ACCT |
38 | select PERF_EVENTS | 39 | select PERF_EVENTS |
39 | select HAVE_KVM_MSI | 40 | select HAVE_KVM_MSI |
41 | select HAVE_KVM_CPU_RELAX_INTERCEPT | ||
40 | ---help--- | 42 | ---help--- |
41 | Support hosting fully virtualized guest machines using hardware | 43 | Support hosting fully virtualized guest machines using hardware |
42 | virtualization extensions. You will need a fairly recent | 44 | virtualization extensions. You will need a fairly recent |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4f579e8dcacf..04d30401c5cb 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) | |||
12 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) | 12 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) |
13 | 13 | ||
14 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | 14 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ |
15 | i8254.o timer.o cpuid.o pmu.o | 15 | i8254.o cpuid.o pmu.o |
16 | kvm-intel-y += vmx.o | 16 | kvm-intel-y += vmx.o |
17 | kvm-amd-y += svm.o | 17 | kvm-amd-y += svm.o |
18 | 18 | ||
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0595f1397b7c..ec79e773342e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
316 | } | 316 | } |
317 | case 7: { | 317 | case 7: { |
318 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 318 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
319 | /* Mask ebx against host capbability word 9 */ | 319 | /* Mask ebx against host capability word 9 */ |
320 | if (index == 0) { | 320 | if (index == 0) { |
321 | entry->ebx &= kvm_supported_word9_x86_features; | 321 | entry->ebx &= kvm_supported_word9_x86_features; |
322 | cpuid_mask(&entry->ebx, 9); | 322 | cpuid_mask(&entry->ebx, 9); |
@@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
397 | break; | 397 | break; |
398 | } | 398 | } |
399 | case KVM_CPUID_SIGNATURE: { | 399 | case KVM_CPUID_SIGNATURE: { |
400 | char signature[12] = "KVMKVMKVM\0\0"; | 400 | static const char signature[12] = "KVMKVMKVM\0\0"; |
401 | u32 *sigptr = (u32 *)signature; | 401 | const u32 *sigptr = (const u32 *)signature; |
402 | entry->eax = KVM_CPUID_FEATURES; | 402 | entry->eax = KVM_CPUID_FEATURES; |
403 | entry->ebx = sigptr[0]; | 403 | entry->ebx = sigptr[0]; |
404 | entry->ecx = sigptr[1]; | 404 | entry->ecx = sigptr[1]; |
@@ -484,10 +484,10 @@ struct kvm_cpuid_param { | |||
484 | u32 func; | 484 | u32 func; |
485 | u32 idx; | 485 | u32 idx; |
486 | bool has_leaf_count; | 486 | bool has_leaf_count; |
487 | bool (*qualifier)(struct kvm_cpuid_param *param); | 487 | bool (*qualifier)(const struct kvm_cpuid_param *param); |
488 | }; | 488 | }; |
489 | 489 | ||
490 | static bool is_centaur_cpu(struct kvm_cpuid_param *param) | 490 | static bool is_centaur_cpu(const struct kvm_cpuid_param *param) |
491 | { | 491 | { |
492 | return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; | 492 | return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; |
493 | } | 493 | } |
@@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
498 | struct kvm_cpuid_entry2 *cpuid_entries; | 498 | struct kvm_cpuid_entry2 *cpuid_entries; |
499 | int limit, nent = 0, r = -E2BIG, i; | 499 | int limit, nent = 0, r = -E2BIG, i; |
500 | u32 func; | 500 | u32 func; |
501 | static struct kvm_cpuid_param param[] = { | 501 | static const struct kvm_cpuid_param param[] = { |
502 | { .func = 0, .has_leaf_count = true }, | 502 | { .func = 0, .has_leaf_count = true }, |
503 | { .func = 0x80000000, .has_leaf_count = true }, | 503 | { .func = 0x80000000, .has_leaf_count = true }, |
504 | { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, | 504 | { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, |
@@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
517 | 517 | ||
518 | r = 0; | 518 | r = 0; |
519 | for (i = 0; i < ARRAY_SIZE(param); i++) { | 519 | for (i = 0; i < ARRAY_SIZE(param); i++) { |
520 | struct kvm_cpuid_param *ent = ¶m[i]; | 520 | const struct kvm_cpuid_param *ent = ¶m[i]; |
521 | 521 | ||
522 | if (ent->qualifier && !ent->qualifier(ent)) | 522 | if (ent->qualifier && !ent->qualifier(ent)) |
523 | continue; | 523 | continue; |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3b57a27be88..39171cb307ea 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -161,9 +161,9 @@ struct opcode { | |||
161 | u64 intercept : 8; | 161 | u64 intercept : 8; |
162 | union { | 162 | union { |
163 | int (*execute)(struct x86_emulate_ctxt *ctxt); | 163 | int (*execute)(struct x86_emulate_ctxt *ctxt); |
164 | struct opcode *group; | 164 | const struct opcode *group; |
165 | struct group_dual *gdual; | 165 | const struct group_dual *gdual; |
166 | struct gprefix *gprefix; | 166 | const struct gprefix *gprefix; |
167 | } u; | 167 | } u; |
168 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); | 168 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); |
169 | }; | 169 | }; |
@@ -202,6 +202,42 @@ struct gprefix { | |||
202 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a | 202 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a |
203 | #define EFLG_RESERVED_ONE_MASK 2 | 203 | #define EFLG_RESERVED_ONE_MASK 2 |
204 | 204 | ||
205 | static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) | ||
206 | { | ||
207 | if (!(ctxt->regs_valid & (1 << nr))) { | ||
208 | ctxt->regs_valid |= 1 << nr; | ||
209 | ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); | ||
210 | } | ||
211 | return ctxt->_regs[nr]; | ||
212 | } | ||
213 | |||
214 | static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) | ||
215 | { | ||
216 | ctxt->regs_valid |= 1 << nr; | ||
217 | ctxt->regs_dirty |= 1 << nr; | ||
218 | return &ctxt->_regs[nr]; | ||
219 | } | ||
220 | |||
221 | static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) | ||
222 | { | ||
223 | reg_read(ctxt, nr); | ||
224 | return reg_write(ctxt, nr); | ||
225 | } | ||
226 | |||
227 | static void writeback_registers(struct x86_emulate_ctxt *ctxt) | ||
228 | { | ||
229 | unsigned reg; | ||
230 | |||
231 | for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) | ||
232 | ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); | ||
233 | } | ||
234 | |||
235 | static void invalidate_registers(struct x86_emulate_ctxt *ctxt) | ||
236 | { | ||
237 | ctxt->regs_dirty = 0; | ||
238 | ctxt->regs_valid = 0; | ||
239 | } | ||
240 | |||
205 | /* | 241 | /* |
206 | * Instruction emulation: | 242 | * Instruction emulation: |
207 | * Most instructions are emulated directly via a fragment of inline assembly | 243 | * Most instructions are emulated directly via a fragment of inline assembly |
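The helpers added above give the emulator a lazy, write-back register cache: reg_read() fetches a GPR from the host side at most once per instruction, reg_write()/reg_rmw() mark it dirty, and writeback_registers() flushes only the dirty ones through ops->write_gpr(). A sketch of the intended per-instruction pattern (the wrapper function is illustrative, not part of the patch):

/* Sketch: driving the cache above around one emulated instruction;
 * only reg_rmw(), invalidate_registers() and writeback_registers()
 * are real, the wrapper and the "inc %rax" example are illustrative. */
static int sketch_emulate_one(struct x86_emulate_ctxt *ctxt)
{
	invalidate_registers(ctxt);	/* empty cache, nothing dirty */

	/* e.g. "inc %rax": one ops->read_gpr() call, RAX marked dirty */
	(*reg_rmw(ctxt, VCPU_REGS_RAX))++;

	writeback_registers(ctxt);	/* only dirty GPRs reach ops->write_gpr() */
	return X86EMUL_CONTINUE;
}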
@@ -374,8 +410,8 @@ struct gprefix { | |||
374 | #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ | 410 | #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ |
375 | do { \ | 411 | do { \ |
376 | unsigned long _tmp; \ | 412 | unsigned long _tmp; \ |
377 | ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ | 413 | ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ |
378 | ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ | 414 | ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ |
379 | \ | 415 | \ |
380 | __asm__ __volatile__ ( \ | 416 | __asm__ __volatile__ ( \ |
381 | _PRE_EFLAGS("0", "5", "1") \ | 417 | _PRE_EFLAGS("0", "5", "1") \ |
@@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in | |||
494 | 530 | ||
495 | static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) | 531 | static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) |
496 | { | 532 | { |
497 | masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); | 533 | masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); |
498 | } | 534 | } |
499 | 535 | ||
500 | static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) | 536 | static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) |
@@ -632,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
632 | 668 | ||
633 | la = seg_base(ctxt, addr.seg) + addr.ea; | 669 | la = seg_base(ctxt, addr.seg) + addr.ea; |
634 | switch (ctxt->mode) { | 670 | switch (ctxt->mode) { |
635 | case X86EMUL_MODE_REAL: | ||
636 | break; | ||
637 | case X86EMUL_MODE_PROT64: | 671 | case X86EMUL_MODE_PROT64: |
638 | if (((signed long)la << 16) >> 16 != la) | 672 | if (((signed long)la << 16) >> 16 != la) |
639 | return emulate_gp(ctxt, 0); | 673 | return emulate_gp(ctxt, 0); |
@@ -655,7 +689,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
655 | if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) | 689 | if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) |
656 | goto bad; | 690 | goto bad; |
657 | } else { | 691 | } else { |
658 | /* exapand-down segment */ | 692 | /* expand-down segment */ |
659 | if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) | 693 | if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) |
660 | goto bad; | 694 | goto bad; |
661 | lim = desc.d ? 0xffffffff : 0xffff; | 695 | lim = desc.d ? 0xffffffff : 0xffff; |
@@ -663,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
663 | goto bad; | 697 | goto bad; |
664 | } | 698 | } |
665 | cpl = ctxt->ops->cpl(ctxt); | 699 | cpl = ctxt->ops->cpl(ctxt); |
666 | rpl = sel & 3; | 700 | if (ctxt->mode == X86EMUL_MODE_REAL) |
701 | rpl = 0; | ||
702 | else | ||
703 | rpl = sel & 3; | ||
667 | cpl = max(cpl, rpl); | 704 | cpl = max(cpl, rpl); |
668 | if (!(desc.type & 8)) { | 705 | if (!(desc.type & 8)) { |
669 | /* data segment */ | 706 | /* data segment */ |
@@ -688,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
688 | return X86EMUL_CONTINUE; | 725 | return X86EMUL_CONTINUE; |
689 | bad: | 726 | bad: |
690 | if (addr.seg == VCPU_SREG_SS) | 727 | if (addr.seg == VCPU_SREG_SS) |
691 | return emulate_ss(ctxt, addr.seg); | 728 | return emulate_ss(ctxt, sel); |
692 | else | 729 | else |
693 | return emulate_gp(ctxt, addr.seg); | 730 | return emulate_gp(ctxt, sel); |
694 | } | 731 | } |
695 | 732 | ||
696 | static int linearize(struct x86_emulate_ctxt *ctxt, | 733 | static int linearize(struct x86_emulate_ctxt *ctxt, |
@@ -786,14 +823,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | |||
786 | * pointer into the block that addresses the relevant register. | 823 | * pointer into the block that addresses the relevant register. |
787 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | 824 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. |
788 | */ | 825 | */ |
789 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | 826 | static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, |
790 | int highbyte_regs) | 827 | int highbyte_regs) |
791 | { | 828 | { |
792 | void *p; | 829 | void *p; |
793 | 830 | ||
794 | p = ®s[modrm_reg]; | ||
795 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | 831 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) |
796 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | 832 | p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; |
833 | else | ||
834 | p = reg_rmw(ctxt, modrm_reg); | ||
797 | return p; | 835 | return p; |
798 | } | 836 | } |
799 | 837 | ||
@@ -871,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) | |||
871 | { | 909 | { |
872 | ctxt->ops->get_fpu(ctxt); | 910 | ctxt->ops->get_fpu(ctxt); |
873 | switch (reg) { | 911 | switch (reg) { |
874 | case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; | 912 | case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; |
875 | case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; | 913 | case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; |
876 | case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; | 914 | case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; |
877 | case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; | 915 | case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; |
878 | case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; | 916 | case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; |
879 | case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; | 917 | case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; |
880 | case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; | 918 | case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; |
881 | case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; | 919 | case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; |
882 | #ifdef CONFIG_X86_64 | 920 | #ifdef CONFIG_X86_64 |
883 | case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; | 921 | case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; |
884 | case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; | 922 | case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; |
885 | case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; | 923 | case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; |
886 | case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; | 924 | case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; |
887 | case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; | 925 | case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; |
888 | case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; | 926 | case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; |
889 | case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; | 927 | case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; |
890 | case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; | 928 | case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; |
891 | #endif | 929 | #endif |
892 | default: BUG(); | 930 | default: BUG(); |
893 | } | 931 | } |
@@ -899,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, | |||
899 | { | 937 | { |
900 | ctxt->ops->get_fpu(ctxt); | 938 | ctxt->ops->get_fpu(ctxt); |
901 | switch (reg) { | 939 | switch (reg) { |
902 | case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; | 940 | case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; |
903 | case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; | 941 | case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; |
904 | case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; | 942 | case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; |
905 | case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; | 943 | case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; |
906 | case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; | 944 | case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; |
907 | case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; | 945 | case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; |
908 | case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; | 946 | case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; |
909 | case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; | 947 | case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; |
910 | #ifdef CONFIG_X86_64 | 948 | #ifdef CONFIG_X86_64 |
911 | case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; | 949 | case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; |
912 | case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; | 950 | case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; |
913 | case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; | 951 | case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; |
914 | case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; | 952 | case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; |
915 | case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; | 953 | case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; |
916 | case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; | 954 | case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; |
917 | case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; | 955 | case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; |
918 | case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; | 956 | case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; |
919 | #endif | 957 | #endif |
920 | default: BUG(); | 958 | default: BUG(); |
921 | } | 959 | } |
@@ -982,10 +1020,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | |||
982 | 1020 | ||
983 | op->type = OP_REG; | 1021 | op->type = OP_REG; |
984 | if (ctxt->d & ByteOp) { | 1022 | if (ctxt->d & ByteOp) { |
985 | op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); | 1023 | op->addr.reg = decode_register(ctxt, reg, highbyte_regs); |
986 | op->bytes = 1; | 1024 | op->bytes = 1; |
987 | } else { | 1025 | } else { |
988 | op->addr.reg = decode_register(reg, ctxt->regs, 0); | 1026 | op->addr.reg = decode_register(ctxt, reg, 0); |
989 | op->bytes = ctxt->op_bytes; | 1027 | op->bytes = ctxt->op_bytes; |
990 | } | 1028 | } |
991 | fetch_register_operand(op); | 1029 | fetch_register_operand(op); |
@@ -1020,8 +1058,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
1020 | if (ctxt->modrm_mod == 3) { | 1058 | if (ctxt->modrm_mod == 3) { |
1021 | op->type = OP_REG; | 1059 | op->type = OP_REG; |
1022 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; | 1060 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
1023 | op->addr.reg = decode_register(ctxt->modrm_rm, | 1061 | op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp); |
1024 | ctxt->regs, ctxt->d & ByteOp); | ||
1025 | if (ctxt->d & Sse) { | 1062 | if (ctxt->d & Sse) { |
1026 | op->type = OP_XMM; | 1063 | op->type = OP_XMM; |
1027 | op->bytes = 16; | 1064 | op->bytes = 16; |
@@ -1042,10 +1079,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
1042 | op->type = OP_MEM; | 1079 | op->type = OP_MEM; |
1043 | 1080 | ||
1044 | if (ctxt->ad_bytes == 2) { | 1081 | if (ctxt->ad_bytes == 2) { |
1045 | unsigned bx = ctxt->regs[VCPU_REGS_RBX]; | 1082 | unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); |
1046 | unsigned bp = ctxt->regs[VCPU_REGS_RBP]; | 1083 | unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); |
1047 | unsigned si = ctxt->regs[VCPU_REGS_RSI]; | 1084 | unsigned si = reg_read(ctxt, VCPU_REGS_RSI); |
1048 | unsigned di = ctxt->regs[VCPU_REGS_RDI]; | 1085 | unsigned di = reg_read(ctxt, VCPU_REGS_RDI); |
1049 | 1086 | ||
1050 | /* 16-bit ModR/M decode. */ | 1087 | /* 16-bit ModR/M decode. */ |
1051 | switch (ctxt->modrm_mod) { | 1088 | switch (ctxt->modrm_mod) { |
@@ -1102,17 +1139,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
1102 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) | 1139 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) |
1103 | modrm_ea += insn_fetch(s32, ctxt); | 1140 | modrm_ea += insn_fetch(s32, ctxt); |
1104 | else { | 1141 | else { |
1105 | modrm_ea += ctxt->regs[base_reg]; | 1142 | modrm_ea += reg_read(ctxt, base_reg); |
1106 | adjust_modrm_seg(ctxt, base_reg); | 1143 | adjust_modrm_seg(ctxt, base_reg); |
1107 | } | 1144 | } |
1108 | if (index_reg != 4) | 1145 | if (index_reg != 4) |
1109 | modrm_ea += ctxt->regs[index_reg] << scale; | 1146 | modrm_ea += reg_read(ctxt, index_reg) << scale; |
1110 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { | 1147 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { |
1111 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 1148 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1112 | ctxt->rip_relative = 1; | 1149 | ctxt->rip_relative = 1; |
1113 | } else { | 1150 | } else { |
1114 | base_reg = ctxt->modrm_rm; | 1151 | base_reg = ctxt->modrm_rm; |
1115 | modrm_ea += ctxt->regs[base_reg]; | 1152 | modrm_ea += reg_read(ctxt, base_reg); |
1116 | adjust_modrm_seg(ctxt, base_reg); | 1153 | adjust_modrm_seg(ctxt, base_reg); |
1117 | } | 1154 | } |
1118 | switch (ctxt->modrm_mod) { | 1155 | switch (ctxt->modrm_mod) { |
@@ -1179,24 +1216,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1179 | int rc; | 1216 | int rc; |
1180 | struct read_cache *mc = &ctxt->mem_read; | 1217 | struct read_cache *mc = &ctxt->mem_read; |
1181 | 1218 | ||
1182 | while (size) { | 1219 | if (mc->pos < mc->end) |
1183 | int n = min(size, 8u); | 1220 | goto read_cached; |
1184 | size -= n; | ||
1185 | if (mc->pos < mc->end) | ||
1186 | goto read_cached; | ||
1187 | 1221 | ||
1188 | rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, | 1222 | WARN_ON((mc->end + size) >= sizeof(mc->data)); |
1189 | &ctxt->exception); | ||
1190 | if (rc != X86EMUL_CONTINUE) | ||
1191 | return rc; | ||
1192 | mc->end += n; | ||
1193 | 1223 | ||
1194 | read_cached: | 1224 | rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, |
1195 | memcpy(dest, mc->data + mc->pos, n); | 1225 | &ctxt->exception); |
1196 | mc->pos += n; | 1226 | if (rc != X86EMUL_CONTINUE) |
1197 | dest += n; | 1227 | return rc; |
1198 | addr += n; | 1228 | |
1199 | } | 1229 | mc->end += size; |
1230 | |||
1231 | read_cached: | ||
1232 | memcpy(dest, mc->data + mc->pos, size); | ||
1233 | mc->pos += size; | ||
1200 | return X86EMUL_CONTINUE; | 1234 | return X86EMUL_CONTINUE; |
1201 | } | 1235 | } |
1202 | 1236 | ||
@@ -1253,10 +1287,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1253 | if (rc->pos == rc->end) { /* refill pio read ahead */ | 1287 | if (rc->pos == rc->end) { /* refill pio read ahead */ |
1254 | unsigned int in_page, n; | 1288 | unsigned int in_page, n; |
1255 | unsigned int count = ctxt->rep_prefix ? | 1289 | unsigned int count = ctxt->rep_prefix ? |
1256 | address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; | 1290 | address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; |
1257 | in_page = (ctxt->eflags & EFLG_DF) ? | 1291 | in_page = (ctxt->eflags & EFLG_DF) ? |
1258 | offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : | 1292 | offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : |
1259 | PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); | 1293 | PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); |
1260 | n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, | 1294 | n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, |
1261 | count); | 1295 | count); |
1262 | if (n == 0) | 1296 | if (n == 0) |
@@ -1267,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1267 | rc->end = n * size; | 1301 | rc->end = n * size; |
1268 | } | 1302 | } |
1269 | 1303 | ||
1270 | memcpy(dest, rc->data + rc->pos, size); | 1304 | if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { |
1271 | rc->pos += size; | 1305 | ctxt->dst.data = rc->data + rc->pos; |
1306 | ctxt->dst.type = OP_MEM_STR; | ||
1307 | ctxt->dst.count = (rc->end - rc->pos) / size; | ||
1308 | rc->pos = rc->end; | ||
1309 | } else { | ||
1310 | memcpy(dest, rc->data + rc->pos, size); | ||
1311 | rc->pos += size; | ||
1312 | } | ||
1272 | return 1; | 1313 | return 1; |
1273 | } | 1314 | } |
1274 | 1315 | ||
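The pio_in_emulated() hunk above batches REP INS: when a REP prefix is present and the direction flag is clear, the destination operand is turned into an OP_MEM_STR pointing straight at the read-ahead buffer, so writeback (see the new OP_MEM_STR case further down) flushes all remaining elements with one segmented_write() instead of re-entering the emulator per element. A rough sketch of the consume step; unlike the real code it copies into a caller buffer rather than aliasing the cache, and the names are illustrative:

	#include <string.h>

	struct pio_cache {
		unsigned char data[1024];
		unsigned int pos, end;        /* consumed / filled, in bytes */
	};

	/*
	 * Drain the PIO read-ahead buffer.  With a REP prefix and the
	 * direction flag clear the whole buffer is handed out at once, so
	 * the caller can do a single multi-element memory write and advance
	 * RDI/RCX by the returned count; otherwise one element per call.
	 */
	static unsigned int pio_in_consume(struct pio_cache *rc, void *dest,
					   unsigned int size, int rep, int df)
	{
		unsigned int count;

		if (rep && !df) {
			count = (rc->end - rc->pos) / size;
			memcpy(dest, rc->data + rc->pos, count * size);
			rc->pos = rc->end;
		} else {
			count = 1;
			memcpy(dest, rc->data + rc->pos, size);
			rc->pos += size;
		}
		return count;
	}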
@@ -1291,7 +1332,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1291 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | 1332 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, |
1292 | u16 selector, struct desc_ptr *dt) | 1333 | u16 selector, struct desc_ptr *dt) |
1293 | { | 1334 | { |
1294 | struct x86_emulate_ops *ops = ctxt->ops; | 1335 | const struct x86_emulate_ops *ops = ctxt->ops; |
1295 | 1336 | ||
1296 | if (selector & 1 << 2) { | 1337 | if (selector & 1 << 2) { |
1297 | struct desc_struct desc; | 1338 | struct desc_struct desc; |
@@ -1355,19 +1396,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1355 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ | 1396 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ |
1356 | ulong desc_addr; | 1397 | ulong desc_addr; |
1357 | int ret; | 1398 | int ret; |
1399 | u16 dummy; | ||
1358 | 1400 | ||
1359 | memset(&seg_desc, 0, sizeof seg_desc); | 1401 | memset(&seg_desc, 0, sizeof seg_desc); |
1360 | 1402 | ||
1361 | if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) | 1403 | if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) |
1362 | || ctxt->mode == X86EMUL_MODE_REAL) { | 1404 | || ctxt->mode == X86EMUL_MODE_REAL) { |
1363 | /* set real mode segment descriptor */ | 1405 | /* set real mode segment descriptor */ |
1406 | ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); | ||
1364 | set_desc_base(&seg_desc, selector << 4); | 1407 | set_desc_base(&seg_desc, selector << 4); |
1365 | set_desc_limit(&seg_desc, 0xffff); | ||
1366 | seg_desc.type = 3; | ||
1367 | seg_desc.p = 1; | ||
1368 | seg_desc.s = 1; | ||
1369 | if (ctxt->mode == X86EMUL_MODE_VM86) | ||
1370 | seg_desc.dpl = 3; | ||
1371 | goto load; | 1408 | goto load; |
1372 | } | 1409 | } |
1373 | 1410 | ||
@@ -1396,7 +1433,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1396 | err_code = selector & 0xfffc; | 1433 | err_code = selector & 0xfffc; |
1397 | err_vec = GP_VECTOR; | 1434 | err_vec = GP_VECTOR; |
1398 | 1435 | ||
1399 | /* can't load system descriptor into segment selecor */ | 1436 | /* can't load system descriptor into segment selector */ |
1400 | if (seg <= VCPU_SREG_GS && !seg_desc.s) | 1437 | if (seg <= VCPU_SREG_GS && !seg_desc.s) |
1401 | goto exception; | 1438 | goto exception; |
1402 | 1439 | ||
@@ -1516,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt) | |||
1516 | if (rc != X86EMUL_CONTINUE) | 1553 | if (rc != X86EMUL_CONTINUE) |
1517 | return rc; | 1554 | return rc; |
1518 | break; | 1555 | break; |
1556 | case OP_MEM_STR: | ||
1557 | rc = segmented_write(ctxt, | ||
1558 | ctxt->dst.addr.mem, | ||
1559 | ctxt->dst.data, | ||
1560 | ctxt->dst.bytes * ctxt->dst.count); | ||
1561 | if (rc != X86EMUL_CONTINUE) | ||
1562 | return rc; | ||
1563 | break; | ||
1519 | case OP_XMM: | 1564 | case OP_XMM: |
1520 | write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); | 1565 | write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); |
1521 | break; | 1566 | break; |
@@ -1536,7 +1581,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) | |||
1536 | struct segmented_address addr; | 1581 | struct segmented_address addr; |
1537 | 1582 | ||
1538 | rsp_increment(ctxt, -bytes); | 1583 | rsp_increment(ctxt, -bytes); |
1539 | addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); | 1584 | addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); |
1540 | addr.seg = VCPU_SREG_SS; | 1585 | addr.seg = VCPU_SREG_SS; |
1541 | 1586 | ||
1542 | return segmented_write(ctxt, addr, data, bytes); | 1587 | return segmented_write(ctxt, addr, data, bytes); |
@@ -1555,7 +1600,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, | |||
1555 | int rc; | 1600 | int rc; |
1556 | struct segmented_address addr; | 1601 | struct segmented_address addr; |
1557 | 1602 | ||
1558 | addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); | 1603 | addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); |
1559 | addr.seg = VCPU_SREG_SS; | 1604 | addr.seg = VCPU_SREG_SS; |
1560 | rc = segmented_read(ctxt, addr, dest, len); | 1605 | rc = segmented_read(ctxt, addr, dest, len); |
1561 | if (rc != X86EMUL_CONTINUE) | 1606 | if (rc != X86EMUL_CONTINUE) |
@@ -1623,26 +1668,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt) | |||
1623 | int rc; | 1668 | int rc; |
1624 | unsigned frame_size = ctxt->src.val; | 1669 | unsigned frame_size = ctxt->src.val; |
1625 | unsigned nesting_level = ctxt->src2.val & 31; | 1670 | unsigned nesting_level = ctxt->src2.val & 31; |
1671 | ulong rbp; | ||
1626 | 1672 | ||
1627 | if (nesting_level) | 1673 | if (nesting_level) |
1628 | return X86EMUL_UNHANDLEABLE; | 1674 | return X86EMUL_UNHANDLEABLE; |
1629 | 1675 | ||
1630 | rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); | 1676 | rbp = reg_read(ctxt, VCPU_REGS_RBP); |
1677 | rc = push(ctxt, &rbp, stack_size(ctxt)); | ||
1631 | if (rc != X86EMUL_CONTINUE) | 1678 | if (rc != X86EMUL_CONTINUE) |
1632 | return rc; | 1679 | return rc; |
1633 | assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], | 1680 | assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), |
1634 | stack_mask(ctxt)); | 1681 | stack_mask(ctxt)); |
1635 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], | 1682 | assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), |
1636 | ctxt->regs[VCPU_REGS_RSP] - frame_size, | 1683 | reg_read(ctxt, VCPU_REGS_RSP) - frame_size, |
1637 | stack_mask(ctxt)); | 1684 | stack_mask(ctxt)); |
1638 | return X86EMUL_CONTINUE; | 1685 | return X86EMUL_CONTINUE; |
1639 | } | 1686 | } |
1640 | 1687 | ||
1641 | static int em_leave(struct x86_emulate_ctxt *ctxt) | 1688 | static int em_leave(struct x86_emulate_ctxt *ctxt) |
1642 | { | 1689 | { |
1643 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], | 1690 | assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP), |
1644 | stack_mask(ctxt)); | 1691 | stack_mask(ctxt)); |
1645 | return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); | 1692 | return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes); |
1646 | } | 1693 | } |
1647 | 1694 | ||
1648 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) | 1695 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) |
@@ -1670,13 +1717,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) | |||
1670 | 1717 | ||
1671 | static int em_pusha(struct x86_emulate_ctxt *ctxt) | 1718 | static int em_pusha(struct x86_emulate_ctxt *ctxt) |
1672 | { | 1719 | { |
1673 | unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; | 1720 | unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); |
1674 | int rc = X86EMUL_CONTINUE; | 1721 | int rc = X86EMUL_CONTINUE; |
1675 | int reg = VCPU_REGS_RAX; | 1722 | int reg = VCPU_REGS_RAX; |
1676 | 1723 | ||
1677 | while (reg <= VCPU_REGS_RDI) { | 1724 | while (reg <= VCPU_REGS_RDI) { |
1678 | (reg == VCPU_REGS_RSP) ? | 1725 | (reg == VCPU_REGS_RSP) ? |
1679 | (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); | 1726 | (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg)); |
1680 | 1727 | ||
1681 | rc = em_push(ctxt); | 1728 | rc = em_push(ctxt); |
1682 | if (rc != X86EMUL_CONTINUE) | 1729 | if (rc != X86EMUL_CONTINUE) |
@@ -1705,7 +1752,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) | |||
1705 | --reg; | 1752 | --reg; |
1706 | } | 1753 | } |
1707 | 1754 | ||
1708 | rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); | 1755 | rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); |
1709 | if (rc != X86EMUL_CONTINUE) | 1756 | if (rc != X86EMUL_CONTINUE) |
1710 | break; | 1757 | break; |
1711 | --reg; | 1758 | --reg; |
@@ -1713,9 +1760,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) | |||
1713 | return rc; | 1760 | return rc; |
1714 | } | 1761 | } |
1715 | 1762 | ||
1716 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) | 1763 | static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) |
1717 | { | 1764 | { |
1718 | struct x86_emulate_ops *ops = ctxt->ops; | 1765 | const struct x86_emulate_ops *ops = ctxt->ops; |
1719 | int rc; | 1766 | int rc; |
1720 | struct desc_ptr dt; | 1767 | struct desc_ptr dt; |
1721 | gva_t cs_addr; | 1768 | gva_t cs_addr; |
@@ -1762,11 +1809,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) | |||
1762 | return rc; | 1809 | return rc; |
1763 | } | 1810 | } |
1764 | 1811 | ||
1812 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) | ||
1813 | { | ||
1814 | int rc; | ||
1815 | |||
1816 | invalidate_registers(ctxt); | ||
1817 | rc = __emulate_int_real(ctxt, irq); | ||
1818 | if (rc == X86EMUL_CONTINUE) | ||
1819 | writeback_registers(ctxt); | ||
1820 | return rc; | ||
1821 | } | ||
1822 | |||
1765 | static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) | 1823 | static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) |
1766 | { | 1824 | { |
1767 | switch(ctxt->mode) { | 1825 | switch(ctxt->mode) { |
1768 | case X86EMUL_MODE_REAL: | 1826 | case X86EMUL_MODE_REAL: |
1769 | return emulate_int_real(ctxt, irq); | 1827 | return __emulate_int_real(ctxt, irq); |
1770 | case X86EMUL_MODE_VM86: | 1828 | case X86EMUL_MODE_VM86: |
1771 | case X86EMUL_MODE_PROT16: | 1829 | case X86EMUL_MODE_PROT16: |
1772 | case X86EMUL_MODE_PROT32: | 1830 | case X86EMUL_MODE_PROT32: |
@@ -1973,14 +2031,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) | |||
1973 | { | 2031 | { |
1974 | u64 old = ctxt->dst.orig_val64; | 2032 | u64 old = ctxt->dst.orig_val64; |
1975 | 2033 | ||
1976 | if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || | 2034 | if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || |
1977 | ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { | 2035 | ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { |
1978 | ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | 2036 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); |
1979 | ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | 2037 | *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); |
1980 | ctxt->eflags &= ~EFLG_ZF; | 2038 | ctxt->eflags &= ~EFLG_ZF; |
1981 | } else { | 2039 | } else { |
1982 | ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | | 2040 | ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | |
1983 | (u32) ctxt->regs[VCPU_REGS_RBX]; | 2041 | (u32) reg_read(ctxt, VCPU_REGS_RBX); |
1984 | 2042 | ||
1985 | ctxt->eflags |= EFLG_ZF; | 2043 | ctxt->eflags |= EFLG_ZF; |
1986 | } | 2044 | } |
@@ -2016,7 +2074,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) | |||
2016 | { | 2074 | { |
2017 | /* Save real source value, then compare EAX against destination. */ | 2075 | /* Save real source value, then compare EAX against destination. */ |
2018 | ctxt->src.orig_val = ctxt->src.val; | 2076 | ctxt->src.orig_val = ctxt->src.val; |
2019 | ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; | 2077 | ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); |
2020 | emulate_2op_SrcV(ctxt, "cmp"); | 2078 | emulate_2op_SrcV(ctxt, "cmp"); |
2021 | 2079 | ||
2022 | if (ctxt->eflags & EFLG_ZF) { | 2080 | if (ctxt->eflags & EFLG_ZF) { |
@@ -2025,7 +2083,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) | |||
2025 | } else { | 2083 | } else { |
2026 | /* Failure: write the value we saw to EAX. */ | 2084 | /* Failure: write the value we saw to EAX. */ |
2027 | ctxt->dst.type = OP_REG; | 2085 | ctxt->dst.type = OP_REG; |
2028 | ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; | 2086 | ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); |
2029 | } | 2087 | } |
2030 | return X86EMUL_CONTINUE; | 2088 | return X86EMUL_CONTINUE; |
2031 | } | 2089 | } |
@@ -2050,12 +2108,6 @@ static void | |||
2050 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 2108 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
2051 | struct desc_struct *cs, struct desc_struct *ss) | 2109 | struct desc_struct *cs, struct desc_struct *ss) |
2052 | { | 2110 | { |
2053 | u16 selector; | ||
2054 | |||
2055 | memset(cs, 0, sizeof(struct desc_struct)); | ||
2056 | ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); | ||
2057 | memset(ss, 0, sizeof(struct desc_struct)); | ||
2058 | |||
2059 | cs->l = 0; /* will be adjusted later */ | 2111 | cs->l = 0; /* will be adjusted later */ |
2060 | set_desc_base(cs, 0); /* flat segment */ | 2112 | set_desc_base(cs, 0); /* flat segment */ |
2061 | cs->g = 1; /* 4kb granularity */ | 2113 | cs->g = 1; /* 4kb granularity */ |
@@ -2065,6 +2117,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | |||
2065 | cs->dpl = 0; /* will be adjusted later */ | 2117 | cs->dpl = 0; /* will be adjusted later */ |
2066 | cs->p = 1; | 2118 | cs->p = 1; |
2067 | cs->d = 1; | 2119 | cs->d = 1; |
2120 | cs->avl = 0; | ||
2068 | 2121 | ||
2069 | set_desc_base(ss, 0); /* flat segment */ | 2122 | set_desc_base(ss, 0); /* flat segment */ |
2070 | set_desc_limit(ss, 0xfffff); /* 4GB limit */ | 2123 | set_desc_limit(ss, 0xfffff); /* 4GB limit */ |
@@ -2074,6 +2127,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | |||
2074 | ss->d = 1; /* 32bit stack segment */ | 2127 | ss->d = 1; /* 32bit stack segment */ |
2075 | ss->dpl = 0; | 2128 | ss->dpl = 0; |
2076 | ss->p = 1; | 2129 | ss->p = 1; |
2130 | ss->l = 0; | ||
2131 | ss->avl = 0; | ||
2077 | } | 2132 | } |
2078 | 2133 | ||
2079 | static bool vendor_intel(struct x86_emulate_ctxt *ctxt) | 2134 | static bool vendor_intel(struct x86_emulate_ctxt *ctxt) |
@@ -2089,7 +2144,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) | |||
2089 | 2144 | ||
2090 | static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) | 2145 | static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) |
2091 | { | 2146 | { |
2092 | struct x86_emulate_ops *ops = ctxt->ops; | 2147 | const struct x86_emulate_ops *ops = ctxt->ops; |
2093 | u32 eax, ebx, ecx, edx; | 2148 | u32 eax, ebx, ecx, edx; |
2094 | 2149 | ||
2095 | /* | 2150 | /* |
@@ -2133,7 +2188,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) | |||
2133 | 2188 | ||
2134 | static int em_syscall(struct x86_emulate_ctxt *ctxt) | 2189 | static int em_syscall(struct x86_emulate_ctxt *ctxt) |
2135 | { | 2190 | { |
2136 | struct x86_emulate_ops *ops = ctxt->ops; | 2191 | const struct x86_emulate_ops *ops = ctxt->ops; |
2137 | struct desc_struct cs, ss; | 2192 | struct desc_struct cs, ss; |
2138 | u64 msr_data; | 2193 | u64 msr_data; |
2139 | u16 cs_sel, ss_sel; | 2194 | u16 cs_sel, ss_sel; |
@@ -2165,10 +2220,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) | |||
2165 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 2220 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2166 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 2221 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2167 | 2222 | ||
2168 | ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; | 2223 | *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; |
2169 | if (efer & EFER_LMA) { | 2224 | if (efer & EFER_LMA) { |
2170 | #ifdef CONFIG_X86_64 | 2225 | #ifdef CONFIG_X86_64 |
2171 | ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | 2226 | *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; |
2172 | 2227 | ||
2173 | ops->get_msr(ctxt, | 2228 | ops->get_msr(ctxt, |
2174 | ctxt->mode == X86EMUL_MODE_PROT64 ? | 2229 | ctxt->mode == X86EMUL_MODE_PROT64 ? |
@@ -2191,7 +2246,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) | |||
2191 | 2246 | ||
2192 | static int em_sysenter(struct x86_emulate_ctxt *ctxt) | 2247 | static int em_sysenter(struct x86_emulate_ctxt *ctxt) |
2193 | { | 2248 | { |
2194 | struct x86_emulate_ops *ops = ctxt->ops; | 2249 | const struct x86_emulate_ops *ops = ctxt->ops; |
2195 | struct desc_struct cs, ss; | 2250 | struct desc_struct cs, ss; |
2196 | u64 msr_data; | 2251 | u64 msr_data; |
2197 | u16 cs_sel, ss_sel; | 2252 | u16 cs_sel, ss_sel; |
@@ -2228,6 +2283,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) | |||
2228 | if (msr_data == 0x0) | 2283 | if (msr_data == 0x0) |
2229 | return emulate_gp(ctxt, 0); | 2284 | return emulate_gp(ctxt, 0); |
2230 | break; | 2285 | break; |
2286 | default: | ||
2287 | break; | ||
2231 | } | 2288 | } |
2232 | 2289 | ||
2233 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 2290 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
@@ -2247,14 +2304,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) | |||
2247 | ctxt->_eip = msr_data; | 2304 | ctxt->_eip = msr_data; |
2248 | 2305 | ||
2249 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); | 2306 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); |
2250 | ctxt->regs[VCPU_REGS_RSP] = msr_data; | 2307 | *reg_write(ctxt, VCPU_REGS_RSP) = msr_data; |
2251 | 2308 | ||
2252 | return X86EMUL_CONTINUE; | 2309 | return X86EMUL_CONTINUE; |
2253 | } | 2310 | } |
2254 | 2311 | ||
2255 | static int em_sysexit(struct x86_emulate_ctxt *ctxt) | 2312 | static int em_sysexit(struct x86_emulate_ctxt *ctxt) |
2256 | { | 2313 | { |
2257 | struct x86_emulate_ops *ops = ctxt->ops; | 2314 | const struct x86_emulate_ops *ops = ctxt->ops; |
2258 | struct desc_struct cs, ss; | 2315 | struct desc_struct cs, ss; |
2259 | u64 msr_data; | 2316 | u64 msr_data; |
2260 | int usermode; | 2317 | int usermode; |
@@ -2297,8 +2354,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) | |||
2297 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 2354 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2298 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 2355 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2299 | 2356 | ||
2300 | ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; | 2357 | ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); |
2301 | ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; | 2358 | *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); |
2302 | 2359 | ||
2303 | return X86EMUL_CONTINUE; | 2360 | return X86EMUL_CONTINUE; |
2304 | } | 2361 | } |
@@ -2317,7 +2374,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) | |||
2317 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | 2374 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, |
2318 | u16 port, u16 len) | 2375 | u16 port, u16 len) |
2319 | { | 2376 | { |
2320 | struct x86_emulate_ops *ops = ctxt->ops; | 2377 | const struct x86_emulate_ops *ops = ctxt->ops; |
2321 | struct desc_struct tr_seg; | 2378 | struct desc_struct tr_seg; |
2322 | u32 base3; | 2379 | u32 base3; |
2323 | int r; | 2380 | int r; |
@@ -2367,14 +2424,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | |||
2367 | { | 2424 | { |
2368 | tss->ip = ctxt->_eip; | 2425 | tss->ip = ctxt->_eip; |
2369 | tss->flag = ctxt->eflags; | 2426 | tss->flag = ctxt->eflags; |
2370 | tss->ax = ctxt->regs[VCPU_REGS_RAX]; | 2427 | tss->ax = reg_read(ctxt, VCPU_REGS_RAX); |
2371 | tss->cx = ctxt->regs[VCPU_REGS_RCX]; | 2428 | tss->cx = reg_read(ctxt, VCPU_REGS_RCX); |
2372 | tss->dx = ctxt->regs[VCPU_REGS_RDX]; | 2429 | tss->dx = reg_read(ctxt, VCPU_REGS_RDX); |
2373 | tss->bx = ctxt->regs[VCPU_REGS_RBX]; | 2430 | tss->bx = reg_read(ctxt, VCPU_REGS_RBX); |
2374 | tss->sp = ctxt->regs[VCPU_REGS_RSP]; | 2431 | tss->sp = reg_read(ctxt, VCPU_REGS_RSP); |
2375 | tss->bp = ctxt->regs[VCPU_REGS_RBP]; | 2432 | tss->bp = reg_read(ctxt, VCPU_REGS_RBP); |
2376 | tss->si = ctxt->regs[VCPU_REGS_RSI]; | 2433 | tss->si = reg_read(ctxt, VCPU_REGS_RSI); |
2377 | tss->di = ctxt->regs[VCPU_REGS_RDI]; | 2434 | tss->di = reg_read(ctxt, VCPU_REGS_RDI); |
2378 | 2435 | ||
2379 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2436 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2380 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2437 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2390,14 +2447,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2390 | 2447 | ||
2391 | ctxt->_eip = tss->ip; | 2448 | ctxt->_eip = tss->ip; |
2392 | ctxt->eflags = tss->flag | 2; | 2449 | ctxt->eflags = tss->flag | 2; |
2393 | ctxt->regs[VCPU_REGS_RAX] = tss->ax; | 2450 | *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax; |
2394 | ctxt->regs[VCPU_REGS_RCX] = tss->cx; | 2451 | *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx; |
2395 | ctxt->regs[VCPU_REGS_RDX] = tss->dx; | 2452 | *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx; |
2396 | ctxt->regs[VCPU_REGS_RBX] = tss->bx; | 2453 | *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx; |
2397 | ctxt->regs[VCPU_REGS_RSP] = tss->sp; | 2454 | *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp; |
2398 | ctxt->regs[VCPU_REGS_RBP] = tss->bp; | 2455 | *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp; |
2399 | ctxt->regs[VCPU_REGS_RSI] = tss->si; | 2456 | *reg_write(ctxt, VCPU_REGS_RSI) = tss->si; |
2400 | ctxt->regs[VCPU_REGS_RDI] = tss->di; | 2457 | *reg_write(ctxt, VCPU_REGS_RDI) = tss->di; |
2401 | 2458 | ||
2402 | /* | 2459 | /* |
2403 | * SDM says that segment selectors are loaded before segment | 2460 | * SDM says that segment selectors are loaded before segment |
@@ -2410,7 +2467,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2410 | set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); | 2467 | set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); |
2411 | 2468 | ||
2412 | /* | 2469 | /* |
2413 | * Now load segment descriptors. If fault happenes at this stage | 2470 | * Now load segment descriptors. If fault happens at this stage |
2414 | * it is handled in a context of new task | 2471 | * it is handled in a context of new task |
2415 | */ | 2472 | */ |
2416 | ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); | 2473 | ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); |
@@ -2436,7 +2493,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2436 | u16 tss_selector, u16 old_tss_sel, | 2493 | u16 tss_selector, u16 old_tss_sel, |
2437 | ulong old_tss_base, struct desc_struct *new_desc) | 2494 | ulong old_tss_base, struct desc_struct *new_desc) |
2438 | { | 2495 | { |
2439 | struct x86_emulate_ops *ops = ctxt->ops; | 2496 | const struct x86_emulate_ops *ops = ctxt->ops; |
2440 | struct tss_segment_16 tss_seg; | 2497 | struct tss_segment_16 tss_seg; |
2441 | int ret; | 2498 | int ret; |
2442 | u32 new_tss_base = get_desc_base(new_desc); | 2499 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2482,14 +2539,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | |||
2482 | tss->cr3 = ctxt->ops->get_cr(ctxt, 3); | 2539 | tss->cr3 = ctxt->ops->get_cr(ctxt, 3); |
2483 | tss->eip = ctxt->_eip; | 2540 | tss->eip = ctxt->_eip; |
2484 | tss->eflags = ctxt->eflags; | 2541 | tss->eflags = ctxt->eflags; |
2485 | tss->eax = ctxt->regs[VCPU_REGS_RAX]; | 2542 | tss->eax = reg_read(ctxt, VCPU_REGS_RAX); |
2486 | tss->ecx = ctxt->regs[VCPU_REGS_RCX]; | 2543 | tss->ecx = reg_read(ctxt, VCPU_REGS_RCX); |
2487 | tss->edx = ctxt->regs[VCPU_REGS_RDX]; | 2544 | tss->edx = reg_read(ctxt, VCPU_REGS_RDX); |
2488 | tss->ebx = ctxt->regs[VCPU_REGS_RBX]; | 2545 | tss->ebx = reg_read(ctxt, VCPU_REGS_RBX); |
2489 | tss->esp = ctxt->regs[VCPU_REGS_RSP]; | 2546 | tss->esp = reg_read(ctxt, VCPU_REGS_RSP); |
2490 | tss->ebp = ctxt->regs[VCPU_REGS_RBP]; | 2547 | tss->ebp = reg_read(ctxt, VCPU_REGS_RBP); |
2491 | tss->esi = ctxt->regs[VCPU_REGS_RSI]; | 2548 | tss->esi = reg_read(ctxt, VCPU_REGS_RSI); |
2492 | tss->edi = ctxt->regs[VCPU_REGS_RDI]; | 2549 | tss->edi = reg_read(ctxt, VCPU_REGS_RDI); |
2493 | 2550 | ||
2494 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2551 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2495 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2552 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2511,14 +2568,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2511 | ctxt->eflags = tss->eflags | 2; | 2568 | ctxt->eflags = tss->eflags | 2; |
2512 | 2569 | ||
2513 | /* General purpose registers */ | 2570 | /* General purpose registers */ |
2514 | ctxt->regs[VCPU_REGS_RAX] = tss->eax; | 2571 | *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax; |
2515 | ctxt->regs[VCPU_REGS_RCX] = tss->ecx; | 2572 | *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx; |
2516 | ctxt->regs[VCPU_REGS_RDX] = tss->edx; | 2573 | *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx; |
2517 | ctxt->regs[VCPU_REGS_RBX] = tss->ebx; | 2574 | *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx; |
2518 | ctxt->regs[VCPU_REGS_RSP] = tss->esp; | 2575 | *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp; |
2519 | ctxt->regs[VCPU_REGS_RBP] = tss->ebp; | 2576 | *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp; |
2520 | ctxt->regs[VCPU_REGS_RSI] = tss->esi; | 2577 | *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi; |
2521 | ctxt->regs[VCPU_REGS_RDI] = tss->edi; | 2578 | *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi; |
2522 | 2579 | ||
2523 | /* | 2580 | /* |
2524 | * SDM says that segment selectors are loaded before segment | 2581 | * SDM says that segment selectors are loaded before segment |
@@ -2583,7 +2640,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2583 | u16 tss_selector, u16 old_tss_sel, | 2640 | u16 tss_selector, u16 old_tss_sel, |
2584 | ulong old_tss_base, struct desc_struct *new_desc) | 2641 | ulong old_tss_base, struct desc_struct *new_desc) |
2585 | { | 2642 | { |
2586 | struct x86_emulate_ops *ops = ctxt->ops; | 2643 | const struct x86_emulate_ops *ops = ctxt->ops; |
2587 | struct tss_segment_32 tss_seg; | 2644 | struct tss_segment_32 tss_seg; |
2588 | int ret; | 2645 | int ret; |
2589 | u32 new_tss_base = get_desc_base(new_desc); | 2646 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2627,7 +2684,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2627 | u16 tss_selector, int idt_index, int reason, | 2684 | u16 tss_selector, int idt_index, int reason, |
2628 | bool has_error_code, u32 error_code) | 2685 | bool has_error_code, u32 error_code) |
2629 | { | 2686 | { |
2630 | struct x86_emulate_ops *ops = ctxt->ops; | 2687 | const struct x86_emulate_ops *ops = ctxt->ops; |
2631 | struct desc_struct curr_tss_desc, next_tss_desc; | 2688 | struct desc_struct curr_tss_desc, next_tss_desc; |
2632 | int ret; | 2689 | int ret; |
2633 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); | 2690 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); |
@@ -2652,7 +2709,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2652 | * | 2709 | * |
2653 | * 1. jmp/call/int to task gate: Check against DPL of the task gate | 2710 | * 1. jmp/call/int to task gate: Check against DPL of the task gate |
2654 | * 2. Exception/IRQ/iret: No check is performed | 2711 | * 2. Exception/IRQ/iret: No check is performed |
2655 | * 3. jmp/call to TSS: Check agains DPL of the TSS | 2712 | * 3. jmp/call to TSS: Check against DPL of the TSS |
2656 | */ | 2713 | */ |
2657 | if (reason == TASK_SWITCH_GATE) { | 2714 | if (reason == TASK_SWITCH_GATE) { |
2658 | if (idt_index != -1) { | 2715 | if (idt_index != -1) { |
@@ -2693,7 +2750,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2693 | ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; | 2750 | ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; |
2694 | 2751 | ||
2695 | /* set back link to prev task only if NT bit is set in eflags | 2752 | /* set back link to prev task only if NT bit is set in eflags |
2696 | note that old_tss_sel is not used afetr this point */ | 2753 | note that old_tss_sel is not used after this point */ |
2697 | if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) | 2754 | if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) |
2698 | old_tss_sel = 0xffff; | 2755 | old_tss_sel = 0xffff; |
2699 | 2756 | ||
@@ -2733,26 +2790,28 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2733 | { | 2790 | { |
2734 | int rc; | 2791 | int rc; |
2735 | 2792 | ||
2793 | invalidate_registers(ctxt); | ||
2736 | ctxt->_eip = ctxt->eip; | 2794 | ctxt->_eip = ctxt->eip; |
2737 | ctxt->dst.type = OP_NONE; | 2795 | ctxt->dst.type = OP_NONE; |
2738 | 2796 | ||
2739 | rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, | 2797 | rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, |
2740 | has_error_code, error_code); | 2798 | has_error_code, error_code); |
2741 | 2799 | ||
2742 | if (rc == X86EMUL_CONTINUE) | 2800 | if (rc == X86EMUL_CONTINUE) { |
2743 | ctxt->eip = ctxt->_eip; | 2801 | ctxt->eip = ctxt->_eip; |
2802 | writeback_registers(ctxt); | ||
2803 | } | ||
2744 | 2804 | ||
2745 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 2805 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
2746 | } | 2806 | } |
2747 | 2807 | ||
2748 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, | 2808 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, |
2749 | int reg, struct operand *op) | 2809 | struct operand *op) |
2750 | { | 2810 | { |
2751 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2811 | int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; |
2752 | 2812 | ||
2753 | register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); | 2813 | register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); |
2754 | op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); | 2814 | op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); |
2755 | op->addr.mem.seg = seg; | ||
2756 | } | 2815 | } |
2757 | 2816 | ||
2758 | static int em_das(struct x86_emulate_ctxt *ctxt) | 2817 | static int em_das(struct x86_emulate_ctxt *ctxt) |
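With the batched string writeback, string_addr_inc() can no longer assume one element per pass: it now advances RSI or RDI by op->count elements in the direction given by EFLAGS.DF, and the segment argument is gone because the operand's segment is already fixed at decode time (ES for DstDI, the segment override for SrcSI). Ignoring the address-size masking that register_address_increment() applies, the arithmetic reduces to:

	/* Sketch: advance a string pointer by count elements of 'bytes' each,
	 * honouring the direction flag; the real code also masks the result
	 * to the effective address size. */
	static unsigned long string_ptr_advance(unsigned long ptr,
						unsigned int count,
						unsigned int bytes, int df_set)
	{
		long delta = (long)count * bytes;

		return df_set ? ptr - delta : ptr + delta;
	}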
@@ -2927,7 +2986,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) | |||
2927 | { | 2986 | { |
2928 | ctxt->dst.type = OP_REG; | 2987 | ctxt->dst.type = OP_REG; |
2929 | ctxt->dst.bytes = ctxt->src.bytes; | 2988 | ctxt->dst.bytes = ctxt->src.bytes; |
2930 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; | 2989 | ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); |
2931 | ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); | 2990 | ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); |
2932 | 2991 | ||
2933 | return X86EMUL_CONTINUE; | 2992 | return X86EMUL_CONTINUE; |
@@ -2938,8 +2997,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2938 | u64 tsc = 0; | 2997 | u64 tsc = 0; |
2939 | 2998 | ||
2940 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); | 2999 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); |
2941 | ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; | 3000 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc; |
2942 | ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; | 3001 | *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32; |
2943 | return X86EMUL_CONTINUE; | 3002 | return X86EMUL_CONTINUE; |
2944 | } | 3003 | } |
2945 | 3004 | ||
@@ -2947,10 +3006,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) | |||
2947 | { | 3006 | { |
2948 | u64 pmc; | 3007 | u64 pmc; |
2949 | 3008 | ||
2950 | if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) | 3009 | if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc)) |
2951 | return emulate_gp(ctxt, 0); | 3010 | return emulate_gp(ctxt, 0); |
2952 | ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; | 3011 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc; |
2953 | ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; | 3012 | *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32; |
2954 | return X86EMUL_CONTINUE; | 3013 | return X86EMUL_CONTINUE; |
2955 | } | 3014 | } |
2956 | 3015 | ||
@@ -2992,9 +3051,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) | |||
2992 | { | 3051 | { |
2993 | u64 msr_data; | 3052 | u64 msr_data; |
2994 | 3053 | ||
2995 | msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] | 3054 | msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) |
2996 | | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); | 3055 | | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); |
2997 | if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) | 3056 | if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) |
2998 | return emulate_gp(ctxt, 0); | 3057 | return emulate_gp(ctxt, 0); |
2999 | 3058 | ||
3000 | return X86EMUL_CONTINUE; | 3059 | return X86EMUL_CONTINUE; |
@@ -3004,11 +3063,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) | |||
3004 | { | 3063 | { |
3005 | u64 msr_data; | 3064 | u64 msr_data; |
3006 | 3065 | ||
3007 | if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) | 3066 | if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) |
3008 | return emulate_gp(ctxt, 0); | 3067 | return emulate_gp(ctxt, 0); |
3009 | 3068 | ||
3010 | ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; | 3069 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; |
3011 | ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; | 3070 | *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32; |
3012 | return X86EMUL_CONTINUE; | 3071 | return X86EMUL_CONTINUE; |
3013 | } | 3072 | } |
3014 | 3073 | ||
@@ -3188,8 +3247,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) | |||
3188 | 3247 | ||
3189 | static int em_loop(struct x86_emulate_ctxt *ctxt) | 3248 | static int em_loop(struct x86_emulate_ctxt *ctxt) |
3190 | { | 3249 | { |
3191 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); | 3250 | register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); |
3192 | if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && | 3251 | if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && |
3193 | (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) | 3252 | (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) |
3194 | jmp_rel(ctxt, ctxt->src.val); | 3253 | jmp_rel(ctxt, ctxt->src.val); |
3195 | 3254 | ||
@@ -3198,7 +3257,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) | |||
3198 | 3257 | ||
3199 | static int em_jcxz(struct x86_emulate_ctxt *ctxt) | 3258 | static int em_jcxz(struct x86_emulate_ctxt *ctxt) |
3200 | { | 3259 | { |
3201 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) | 3260 | if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) |
3202 | jmp_rel(ctxt, ctxt->src.val); | 3261 | jmp_rel(ctxt, ctxt->src.val); |
3203 | 3262 | ||
3204 | return X86EMUL_CONTINUE; | 3263 | return X86EMUL_CONTINUE; |
@@ -3286,20 +3345,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) | |||
3286 | { | 3345 | { |
3287 | u32 eax, ebx, ecx, edx; | 3346 | u32 eax, ebx, ecx, edx; |
3288 | 3347 | ||
3289 | eax = ctxt->regs[VCPU_REGS_RAX]; | 3348 | eax = reg_read(ctxt, VCPU_REGS_RAX); |
3290 | ecx = ctxt->regs[VCPU_REGS_RCX]; | 3349 | ecx = reg_read(ctxt, VCPU_REGS_RCX); |
3291 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); | 3350 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); |
3292 | ctxt->regs[VCPU_REGS_RAX] = eax; | 3351 | *reg_write(ctxt, VCPU_REGS_RAX) = eax; |
3293 | ctxt->regs[VCPU_REGS_RBX] = ebx; | 3352 | *reg_write(ctxt, VCPU_REGS_RBX) = ebx; |
3294 | ctxt->regs[VCPU_REGS_RCX] = ecx; | 3353 | *reg_write(ctxt, VCPU_REGS_RCX) = ecx; |
3295 | ctxt->regs[VCPU_REGS_RDX] = edx; | 3354 | *reg_write(ctxt, VCPU_REGS_RDX) = edx; |
3296 | return X86EMUL_CONTINUE; | 3355 | return X86EMUL_CONTINUE; |
3297 | } | 3356 | } |
3298 | 3357 | ||
3299 | static int em_lahf(struct x86_emulate_ctxt *ctxt) | 3358 | static int em_lahf(struct x86_emulate_ctxt *ctxt) |
3300 | { | 3359 | { |
3301 | ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; | 3360 | *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; |
3302 | ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; | 3361 | *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8; |
3303 | return X86EMUL_CONTINUE; | 3362 | return X86EMUL_CONTINUE; |
3304 | } | 3363 | } |
3305 | 3364 | ||
@@ -3456,7 +3515,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) | |||
3456 | 3515 | ||
3457 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) | 3516 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) |
3458 | { | 3517 | { |
3459 | u64 rax = ctxt->regs[VCPU_REGS_RAX]; | 3518 | u64 rax = reg_read(ctxt, VCPU_REGS_RAX); |
3460 | 3519 | ||
3461 | /* Valid physical address? */ | 3520 | /* Valid physical address? */ |
3462 | if (rax & 0xffff000000000000ULL) | 3521 | if (rax & 0xffff000000000000ULL) |
@@ -3478,7 +3537,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
3478 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | 3537 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) |
3479 | { | 3538 | { |
3480 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); | 3539 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); |
3481 | u64 rcx = ctxt->regs[VCPU_REGS_RCX]; | 3540 | u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); |
3482 | 3541 | ||
3483 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || | 3542 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || |
3484 | (rcx > 3)) | 3543 | (rcx > 3)) |
@@ -3531,13 +3590,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) | |||
3531 | I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ | 3590 | I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ |
3532 | I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) | 3591 | I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) |
3533 | 3592 | ||
3534 | static struct opcode group7_rm1[] = { | 3593 | static const struct opcode group7_rm1[] = { |
3535 | DI(SrcNone | Priv, monitor), | 3594 | DI(SrcNone | Priv, monitor), |
3536 | DI(SrcNone | Priv, mwait), | 3595 | DI(SrcNone | Priv, mwait), |
3537 | N, N, N, N, N, N, | 3596 | N, N, N, N, N, N, |
3538 | }; | 3597 | }; |
3539 | 3598 | ||
3540 | static struct opcode group7_rm3[] = { | 3599 | static const struct opcode group7_rm3[] = { |
3541 | DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), | 3600 | DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), |
3542 | II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), | 3601 | II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), |
3543 | DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), | 3602 | DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), |
@@ -3548,13 +3607,13 @@ static struct opcode group7_rm3[] = { | |||
3548 | DIP(SrcNone | Prot | Priv, invlpga, check_svme), | 3607 | DIP(SrcNone | Prot | Priv, invlpga, check_svme), |
3549 | }; | 3608 | }; |
3550 | 3609 | ||
3551 | static struct opcode group7_rm7[] = { | 3610 | static const struct opcode group7_rm7[] = { |
3552 | N, | 3611 | N, |
3553 | DIP(SrcNone, rdtscp, check_rdtsc), | 3612 | DIP(SrcNone, rdtscp, check_rdtsc), |
3554 | N, N, N, N, N, N, | 3613 | N, N, N, N, N, N, |
3555 | }; | 3614 | }; |
3556 | 3615 | ||
3557 | static struct opcode group1[] = { | 3616 | static const struct opcode group1[] = { |
3558 | I(Lock, em_add), | 3617 | I(Lock, em_add), |
3559 | I(Lock | PageTable, em_or), | 3618 | I(Lock | PageTable, em_or), |
3560 | I(Lock, em_adc), | 3619 | I(Lock, em_adc), |
@@ -3565,11 +3624,11 @@ static struct opcode group1[] = { | |||
3565 | I(0, em_cmp), | 3624 | I(0, em_cmp), |
3566 | }; | 3625 | }; |
3567 | 3626 | ||
3568 | static struct opcode group1A[] = { | 3627 | static const struct opcode group1A[] = { |
3569 | I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, | 3628 | I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, |
3570 | }; | 3629 | }; |
3571 | 3630 | ||
3572 | static struct opcode group3[] = { | 3631 | static const struct opcode group3[] = { |
3573 | I(DstMem | SrcImm, em_test), | 3632 | I(DstMem | SrcImm, em_test), |
3574 | I(DstMem | SrcImm, em_test), | 3633 | I(DstMem | SrcImm, em_test), |
3575 | I(DstMem | SrcNone | Lock, em_not), | 3634 | I(DstMem | SrcNone | Lock, em_not), |
@@ -3580,13 +3639,13 @@ static struct opcode group3[] = { | |||
3580 | I(SrcMem, em_idiv_ex), | 3639 | I(SrcMem, em_idiv_ex), |
3581 | }; | 3640 | }; |
3582 | 3641 | ||
3583 | static struct opcode group4[] = { | 3642 | static const struct opcode group4[] = { |
3584 | I(ByteOp | DstMem | SrcNone | Lock, em_grp45), | 3643 | I(ByteOp | DstMem | SrcNone | Lock, em_grp45), |
3585 | I(ByteOp | DstMem | SrcNone | Lock, em_grp45), | 3644 | I(ByteOp | DstMem | SrcNone | Lock, em_grp45), |
3586 | N, N, N, N, N, N, | 3645 | N, N, N, N, N, N, |
3587 | }; | 3646 | }; |
3588 | 3647 | ||
3589 | static struct opcode group5[] = { | 3648 | static const struct opcode group5[] = { |
3590 | I(DstMem | SrcNone | Lock, em_grp45), | 3649 | I(DstMem | SrcNone | Lock, em_grp45), |
3591 | I(DstMem | SrcNone | Lock, em_grp45), | 3650 | I(DstMem | SrcNone | Lock, em_grp45), |
3592 | I(SrcMem | Stack, em_grp45), | 3651 | I(SrcMem | Stack, em_grp45), |
@@ -3596,7 +3655,7 @@ static struct opcode group5[] = { | |||
3596 | I(SrcMem | Stack, em_grp45), N, | 3655 | I(SrcMem | Stack, em_grp45), N, |
3597 | }; | 3656 | }; |
3598 | 3657 | ||
3599 | static struct opcode group6[] = { | 3658 | static const struct opcode group6[] = { |
3600 | DI(Prot, sldt), | 3659 | DI(Prot, sldt), |
3601 | DI(Prot, str), | 3660 | DI(Prot, str), |
3602 | II(Prot | Priv | SrcMem16, em_lldt, lldt), | 3661 | II(Prot | Priv | SrcMem16, em_lldt, lldt), |
@@ -3604,7 +3663,7 @@ static struct opcode group6[] = { | |||
3604 | N, N, N, N, | 3663 | N, N, N, N, |
3605 | }; | 3664 | }; |
3606 | 3665 | ||
3607 | static struct group_dual group7 = { { | 3666 | static const struct group_dual group7 = { { |
3608 | II(Mov | DstMem | Priv, em_sgdt, sgdt), | 3667 | II(Mov | DstMem | Priv, em_sgdt, sgdt), |
3609 | II(Mov | DstMem | Priv, em_sidt, sidt), | 3668 | II(Mov | DstMem | Priv, em_sidt, sidt), |
3610 | II(SrcMem | Priv, em_lgdt, lgdt), | 3669 | II(SrcMem | Priv, em_lgdt, lgdt), |
@@ -3621,7 +3680,7 @@ static struct group_dual group7 = { { | |||
3621 | EXT(0, group7_rm7), | 3680 | EXT(0, group7_rm7), |
3622 | } }; | 3681 | } }; |
3623 | 3682 | ||
3624 | static struct opcode group8[] = { | 3683 | static const struct opcode group8[] = { |
3625 | N, N, N, N, | 3684 | N, N, N, N, |
3626 | I(DstMem | SrcImmByte, em_bt), | 3685 | I(DstMem | SrcImmByte, em_bt), |
3627 | I(DstMem | SrcImmByte | Lock | PageTable, em_bts), | 3686 | I(DstMem | SrcImmByte | Lock | PageTable, em_bts), |
@@ -3629,26 +3688,26 @@ static struct opcode group8[] = { | |||
3629 | I(DstMem | SrcImmByte | Lock | PageTable, em_btc), | 3688 | I(DstMem | SrcImmByte | Lock | PageTable, em_btc), |
3630 | }; | 3689 | }; |
3631 | 3690 | ||
3632 | static struct group_dual group9 = { { | 3691 | static const struct group_dual group9 = { { |
3633 | N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, | 3692 | N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, |
3634 | }, { | 3693 | }, { |
3635 | N, N, N, N, N, N, N, N, | 3694 | N, N, N, N, N, N, N, N, |
3636 | } }; | 3695 | } }; |
3637 | 3696 | ||
3638 | static struct opcode group11[] = { | 3697 | static const struct opcode group11[] = { |
3639 | I(DstMem | SrcImm | Mov | PageTable, em_mov), | 3698 | I(DstMem | SrcImm | Mov | PageTable, em_mov), |
3640 | X7(D(Undefined)), | 3699 | X7(D(Undefined)), |
3641 | }; | 3700 | }; |
3642 | 3701 | ||
3643 | static struct gprefix pfx_0f_6f_0f_7f = { | 3702 | static const struct gprefix pfx_0f_6f_0f_7f = { |
3644 | I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), | 3703 | I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), |
3645 | }; | 3704 | }; |
3646 | 3705 | ||
3647 | static struct gprefix pfx_vmovntpx = { | 3706 | static const struct gprefix pfx_vmovntpx = { |
3648 | I(0, em_mov), N, N, N, | 3707 | I(0, em_mov), N, N, N, |
3649 | }; | 3708 | }; |
3650 | 3709 | ||
3651 | static struct opcode opcode_table[256] = { | 3710 | static const struct opcode opcode_table[256] = { |
3652 | /* 0x00 - 0x07 */ | 3711 | /* 0x00 - 0x07 */ |
3653 | I6ALU(Lock, em_add), | 3712 | I6ALU(Lock, em_add), |
3654 | I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), | 3713 | I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), |
@@ -3689,7 +3748,7 @@ static struct opcode opcode_table[256] = { | |||
3689 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), | 3748 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), |
3690 | I(SrcImmByte | Mov | Stack, em_push), | 3749 | I(SrcImmByte | Mov | Stack, em_push), |
3691 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), | 3750 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), |
3692 | I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ | 3751 | I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ |
3693 | I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ | 3752 | I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ |
3694 | /* 0x70 - 0x7F */ | 3753 | /* 0x70 - 0x7F */ |
3695 | X16(D(SrcImmByte)), | 3754 | X16(D(SrcImmByte)), |
@@ -3765,7 +3824,7 @@ static struct opcode opcode_table[256] = { | |||
3765 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), | 3824 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), |
3766 | }; | 3825 | }; |
3767 | 3826 | ||
3768 | static struct opcode twobyte_table[256] = { | 3827 | static const struct opcode twobyte_table[256] = { |
3769 | /* 0x00 - 0x0F */ | 3828 | /* 0x00 - 0x0F */ |
3770 | G(0, group6), GD(0, &group7), N, N, | 3829 | G(0, group6), GD(0, &group7), N, N, |
3771 | N, I(ImplicitOps | VendorSpecific, em_syscall), | 3830 | N, I(ImplicitOps | VendorSpecific, em_syscall), |
@@ -3936,7 +3995,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
3936 | case OpAcc: | 3995 | case OpAcc: |
3937 | op->type = OP_REG; | 3996 | op->type = OP_REG; |
3938 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; | 3997 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3939 | op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; | 3998 | op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); |
3940 | fetch_register_operand(op); | 3999 | fetch_register_operand(op); |
3941 | op->orig_val = op->val; | 4000 | op->orig_val = op->val; |
3942 | break; | 4001 | break; |
@@ -3944,19 +4003,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
3944 | op->type = OP_MEM; | 4003 | op->type = OP_MEM; |
3945 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; | 4004 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3946 | op->addr.mem.ea = | 4005 | op->addr.mem.ea = |
3947 | register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); | 4006 | register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); |
3948 | op->addr.mem.seg = VCPU_SREG_ES; | 4007 | op->addr.mem.seg = VCPU_SREG_ES; |
3949 | op->val = 0; | 4008 | op->val = 0; |
4009 | op->count = 1; | ||
3950 | break; | 4010 | break; |
3951 | case OpDX: | 4011 | case OpDX: |
3952 | op->type = OP_REG; | 4012 | op->type = OP_REG; |
3953 | op->bytes = 2; | 4013 | op->bytes = 2; |
3954 | op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; | 4014 | op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); |
3955 | fetch_register_operand(op); | 4015 | fetch_register_operand(op); |
3956 | break; | 4016 | break; |
3957 | case OpCL: | 4017 | case OpCL: |
3958 | op->bytes = 1; | 4018 | op->bytes = 1; |
3959 | op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; | 4019 | op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; |
3960 | break; | 4020 | break; |
3961 | case OpImmByte: | 4021 | case OpImmByte: |
3962 | rc = decode_imm(ctxt, op, 1, true); | 4022 | rc = decode_imm(ctxt, op, 1, true); |
@@ -3987,9 +4047,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
3987 | op->type = OP_MEM; | 4047 | op->type = OP_MEM; |
3988 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; | 4048 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3989 | op->addr.mem.ea = | 4049 | op->addr.mem.ea = |
3990 | register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); | 4050 | register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); |
3991 | op->addr.mem.seg = seg_override(ctxt); | 4051 | op->addr.mem.seg = seg_override(ctxt); |
3992 | op->val = 0; | 4052 | op->val = 0; |
4053 | op->count = 1; | ||
3993 | break; | 4054 | break; |
3994 | case OpImmFAddr: | 4055 | case OpImmFAddr: |
3995 | op->type = OP_IMM; | 4056 | op->type = OP_IMM; |
@@ -4293,9 +4354,10 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, | |||
4293 | read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); | 4354 | read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); |
4294 | } | 4355 | } |
4295 | 4356 | ||
4357 | |||
4296 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | 4358 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) |
4297 | { | 4359 | { |
4298 | struct x86_emulate_ops *ops = ctxt->ops; | 4360 | const struct x86_emulate_ops *ops = ctxt->ops; |
4299 | int rc = X86EMUL_CONTINUE; | 4361 | int rc = X86EMUL_CONTINUE; |
4300 | int saved_dst_type = ctxt->dst.type; | 4362 | int saved_dst_type = ctxt->dst.type; |
4301 | 4363 | ||
@@ -4356,7 +4418,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
4356 | } | 4418 | } |
4357 | 4419 | ||
4358 | /* Instruction can only be executed in protected mode */ | 4420 | /* Instruction can only be executed in protected mode */ |
4359 | if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { | 4421 | if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { |
4360 | rc = emulate_ud(ctxt); | 4422 | rc = emulate_ud(ctxt); |
4361 | goto done; | 4423 | goto done; |
4362 | } | 4424 | } |
@@ -4377,7 +4439,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
4377 | 4439 | ||
4378 | if (ctxt->rep_prefix && (ctxt->d & String)) { | 4440 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
4379 | /* All REP prefixes have the same first termination condition */ | 4441 | /* All REP prefixes have the same first termination condition */ |
4380 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { | 4442 | if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { |
4381 | ctxt->eip = ctxt->_eip; | 4443 | ctxt->eip = ctxt->_eip; |
4382 | goto done; | 4444 | goto done; |
4383 | } | 4445 | } |
@@ -4450,7 +4512,7 @@ special_insn: | |||
4450 | ctxt->dst.val = ctxt->src.addr.mem.ea; | 4512 | ctxt->dst.val = ctxt->src.addr.mem.ea; |
4451 | break; | 4513 | break; |
4452 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ | 4514 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
4453 | if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) | 4515 | if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) |
4454 | break; | 4516 | break; |
4455 | rc = em_xchg(ctxt); | 4517 | rc = em_xchg(ctxt); |
4456 | break; | 4518 | break; |
@@ -4478,7 +4540,7 @@ special_insn: | |||
4478 | rc = em_grp2(ctxt); | 4540 | rc = em_grp2(ctxt); |
4479 | break; | 4541 | break; |
4480 | case 0xd2 ... 0xd3: /* Grp2 */ | 4542 | case 0xd2 ... 0xd3: /* Grp2 */ |
4481 | ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; | 4543 | ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); |
4482 | rc = em_grp2(ctxt); | 4544 | rc = em_grp2(ctxt); |
4483 | break; | 4545 | break; |
4484 | case 0xe9: /* jmp rel */ | 4546 | case 0xe9: /* jmp rel */ |
@@ -4524,23 +4586,27 @@ writeback: | |||
4524 | ctxt->dst.type = saved_dst_type; | 4586 | ctxt->dst.type = saved_dst_type; |
4525 | 4587 | ||
4526 | if ((ctxt->d & SrcMask) == SrcSI) | 4588 | if ((ctxt->d & SrcMask) == SrcSI) |
4527 | string_addr_inc(ctxt, seg_override(ctxt), | 4589 | string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src); |
4528 | VCPU_REGS_RSI, &ctxt->src); | ||
4529 | 4590 | ||
4530 | if ((ctxt->d & DstMask) == DstDI) | 4591 | if ((ctxt->d & DstMask) == DstDI) |
4531 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, | 4592 | string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); |
4532 | &ctxt->dst); | ||
4533 | 4593 | ||
4534 | if (ctxt->rep_prefix && (ctxt->d & String)) { | 4594 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
4595 | unsigned int count; | ||
4535 | struct read_cache *r = &ctxt->io_read; | 4596 | struct read_cache *r = &ctxt->io_read; |
4536 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); | 4597 | if ((ctxt->d & SrcMask) == SrcSI) |
4598 | count = ctxt->src.count; | ||
4599 | else | ||
4600 | count = ctxt->dst.count; | ||
4601 | register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), | ||
4602 | -count); | ||
4537 | 4603 | ||
4538 | if (!string_insn_completed(ctxt)) { | 4604 | if (!string_insn_completed(ctxt)) { |
4539 | /* | 4605 | /* |
4540 | * Re-enter guest when pio read ahead buffer is empty | 4606 | * Re-enter guest when pio read ahead buffer is empty |
4541 | * or, if it is not used, after each 1024 iteration. | 4607 | * or, if it is not used, after each 1024 iteration. |
4542 | */ | 4608 | */ |
4543 | if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && | 4609 | if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) && |
4544 | (r->end == 0 || r->end != r->pos)) { | 4610 | (r->end == 0 || r->end != r->pos)) { |
4545 | /* | 4611 | /* |
4546 | * Reset read cache. Usually happens before | 4612 | * Reset read cache. Usually happens before |
@@ -4548,6 +4614,7 @@ writeback: | |||
4548 | * we have to do it here. | 4614 | * we have to do it here. |
4549 | */ | 4615 | */ |
4550 | ctxt->mem_read.end = 0; | 4616 | ctxt->mem_read.end = 0; |
4617 | writeback_registers(ctxt); | ||
4551 | return EMULATION_RESTART; | 4618 | return EMULATION_RESTART; |
4552 | } | 4619 | } |
4553 | goto done; /* skip rip writeback */ | 4620 | goto done; /* skip rip writeback */ |
@@ -4562,6 +4629,9 @@ done: | |||
4562 | if (rc == X86EMUL_INTERCEPTED) | 4629 | if (rc == X86EMUL_INTERCEPTED) |
4563 | return EMULATION_INTERCEPTED; | 4630 | return EMULATION_INTERCEPTED; |
4564 | 4631 | ||
4632 | if (rc == X86EMUL_CONTINUE) | ||
4633 | writeback_registers(ctxt); | ||
4634 | |||
4565 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 4635 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
4566 | 4636 | ||
4567 | twobyte_insn: | 4637 | twobyte_insn: |
@@ -4634,3 +4704,13 @@ twobyte_insn: | |||
4634 | cannot_emulate: | 4704 | cannot_emulate: |
4635 | return EMULATION_FAILED; | 4705 | return EMULATION_FAILED; |
4636 | } | 4706 | } |
4707 | |||
4708 | void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt) | ||
4709 | { | ||
4710 | invalidate_registers(ctxt); | ||
4711 | } | ||
4712 | |||
4713 | void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) | ||
4714 | { | ||
4715 | writeback_registers(ctxt); | ||
4716 | } | ||
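Everything in the emulate.c diff that swaps ctxt->regs[...] for reg_read()/reg_write()/reg_rmw() is building a lazily synchronised GPR cache: registers are pulled from the vcpu only on first use and pushed back only if dirtied, and the entry points (x86_emulate_insn(), emulator_task_switch(), emulate_int_real(), plus the two exported helpers just above) bracket emulation with invalidate_registers()/writeback_registers(). A minimal userspace sketch of that accessor pattern; the bitmask layout and callback shape here are assumptions for illustration, not the kernel's actual definitions:

	#include <stdint.h>

	#define NR_EMU_REGS 16

	struct emu_ctxt {
		unsigned long regs[NR_EMU_REGS];
		uint32_t regs_valid;                   /* bit n: regs[n] is fresh */
		uint32_t regs_dirty;                   /* bit n: needs write-back */
		unsigned long (*get_gpr)(int reg);     /* backend callbacks, e.g. */
		void (*set_gpr)(int reg, unsigned long val); /* into the vcpu     */
	};

	static unsigned long emu_reg_read(struct emu_ctxt *c, int reg)
	{
		if (!(c->regs_valid & (1u << reg))) {
			c->regs[reg] = c->get_gpr(reg);
			c->regs_valid |= 1u << reg;
		}
		return c->regs[reg];
	}

	static unsigned long *emu_reg_write(struct emu_ctxt *c, int reg)
	{
		c->regs_valid |= 1u << reg;    /* about to be overwritten entirely */
		c->regs_dirty |= 1u << reg;
		return &c->regs[reg];
	}

	static unsigned long *emu_reg_rmw(struct emu_ctxt *c, int reg)
	{
		emu_reg_read(c, reg);          /* make sure the old value is cached */
		c->regs_dirty |= 1u << reg;
		return &c->regs[reg];
	}

	static void emu_writeback_registers(struct emu_ctxt *c)
	{
		int reg;

		for (reg = 0; reg < NR_EMU_REGS; reg++)
			if (c->regs_dirty & (1u << reg))
				c->set_gpr(reg, c->regs[reg]);
		c->regs_dirty = 0;
	}

	static void emu_invalidate_registers(struct emu_ctxt *c)
	{
		c->regs_valid = 0;             /* force a re-read from the backend */
		c->regs_dirty = 0;
	}

The point of the pattern is that a typical emulated instruction touches only a couple of GPRs, so the vcpu-to-emulator copy shrinks from all sixteen registers to just the ones actually read or written.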
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index adba28f88d1a..11300d2fa714 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) | |||
108 | ktime_t remaining; | 108 | ktime_t remaining; |
109 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | 109 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; |
110 | 110 | ||
111 | if (!ps->pit_timer.period) | 111 | if (!ps->period) |
112 | return 0; | 112 | return 0; |
113 | 113 | ||
114 | /* | 114 | /* |
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) | |||
120 | * itself with the initial count and continues counting | 120 | * itself with the initial count and continues counting |
121 | * from there. | 121 | * from there. |
122 | */ | 122 | */ |
123 | remaining = hrtimer_get_remaining(&ps->pit_timer.timer); | 123 | remaining = hrtimer_get_remaining(&ps->timer); |
124 | elapsed = ps->pit_timer.period - ktime_to_ns(remaining); | 124 | elapsed = ps->period - ktime_to_ns(remaining); |
125 | elapsed = mod_64(elapsed, ps->pit_timer.period); | 125 | elapsed = mod_64(elapsed, ps->period); |
126 | 126 | ||
127 | return elapsed; | 127 | return elapsed; |
128 | } | 128 | } |
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
238 | int value; | 238 | int value; |
239 | 239 | ||
240 | spin_lock(&ps->inject_lock); | 240 | spin_lock(&ps->inject_lock); |
241 | value = atomic_dec_return(&ps->pit_timer.pending); | 241 | value = atomic_dec_return(&ps->pending); |
242 | if (value < 0) | 242 | if (value < 0) |
243 | /* spurious acks can be generated if, for example, the | 243 | /* spurious acks can be generated if, for example, the |
244 | * PIC is being reset. Handle it gracefully here | 244 | * PIC is being reset. Handle it gracefully here |
245 | */ | 245 | */ |
246 | atomic_inc(&ps->pit_timer.pending); | 246 | atomic_inc(&ps->pending); |
247 | else if (value > 0) | 247 | else if (value > 0) |
248 | /* in this case, we had multiple outstanding pit interrupts | 248 | /* in this case, we had multiple outstanding pit interrupts |
249 | * that we needed to inject. Reinject | 249 | * that we needed to inject. Reinject |
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | |||
261 | if (!kvm_vcpu_is_bsp(vcpu) || !pit) | 261 | if (!kvm_vcpu_is_bsp(vcpu) || !pit) |
262 | return; | 262 | return; |
263 | 263 | ||
264 | timer = &pit->pit_state.pit_timer.timer; | 264 | timer = &pit->pit_state.timer; |
265 | if (hrtimer_cancel(timer)) | 265 | if (hrtimer_cancel(timer)) |
266 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 266 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
267 | } | 267 | } |
268 | 268 | ||
269 | static void destroy_pit_timer(struct kvm_pit *pit) | 269 | static void destroy_pit_timer(struct kvm_pit *pit) |
270 | { | 270 | { |
271 | hrtimer_cancel(&pit->pit_state.pit_timer.timer); | 271 | hrtimer_cancel(&pit->pit_state.timer); |
272 | flush_kthread_work(&pit->expired); | 272 | flush_kthread_work(&pit->expired); |
273 | } | 273 | } |
274 | 274 | ||
275 | static bool kpit_is_periodic(struct kvm_timer *ktimer) | ||
276 | { | ||
277 | struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, | ||
278 | pit_timer); | ||
279 | return ps->is_periodic; | ||
280 | } | ||
281 | |||
282 | static struct kvm_timer_ops kpit_ops = { | ||
283 | .is_periodic = kpit_is_periodic, | ||
284 | }; | ||
285 | |||
286 | static void pit_do_work(struct kthread_work *work) | 275 | static void pit_do_work(struct kthread_work *work) |
287 | { | 276 | { |
288 | struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); | 277 | struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); |
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work) | |||
322 | 311 | ||
323 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | 312 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) |
324 | { | 313 | { |
325 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | 314 | struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); |
326 | struct kvm_pit *pt = ktimer->kvm->arch.vpit; | 315 | struct kvm_pit *pt = ps->kvm->arch.vpit; |
327 | 316 | ||
328 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | 317 | if (ps->reinject || !atomic_read(&ps->pending)) { |
329 | atomic_inc(&ktimer->pending); | 318 | atomic_inc(&ps->pending); |
330 | queue_kthread_work(&pt->worker, &pt->expired); | 319 | queue_kthread_work(&pt->worker, &pt->expired); |
331 | } | 320 | } |
332 | 321 | ||
333 | if (ktimer->t_ops->is_periodic(ktimer)) { | 322 | if (ps->is_periodic) { |
334 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | 323 | hrtimer_add_expires_ns(&ps->timer, ps->period); |
335 | return HRTIMER_RESTART; | 324 | return HRTIMER_RESTART; |
336 | } else | 325 | } else |
337 | return HRTIMER_NORESTART; | 326 | return HRTIMER_NORESTART; |
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | |||
340 | static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) | 329 | static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) |
341 | { | 330 | { |
342 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | 331 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; |
343 | struct kvm_timer *pt = &ps->pit_timer; | ||
344 | s64 interval; | 332 | s64 interval; |
345 | 333 | ||
346 | if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) | 334 | if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) |
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) | |||
351 | pr_debug("create pit timer, interval is %llu nsec\n", interval); | 339 | pr_debug("create pit timer, interval is %llu nsec\n", interval); |
352 | 340 | ||
353 | /* TODO The new value only takes effect after the counter is retriggered */ | 341 | /* TODO The new value only takes effect after the counter is retriggered */ |
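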
354 | hrtimer_cancel(&pt->timer); | 342 | hrtimer_cancel(&ps->timer); |
355 | flush_kthread_work(&ps->pit->expired); | 343 | flush_kthread_work(&ps->pit->expired); |
356 | pt->period = interval; | 344 | ps->period = interval; |
357 | ps->is_periodic = is_period; | 345 | ps->is_periodic = is_period; |
358 | 346 | ||
359 | pt->timer.function = pit_timer_fn; | 347 | ps->timer.function = pit_timer_fn; |
360 | pt->t_ops = &kpit_ops; | 348 | ps->kvm = ps->pit->kvm; |
361 | pt->kvm = ps->pit->kvm; | ||
362 | 349 | ||
363 | atomic_set(&pt->pending, 0); | 350 | atomic_set(&ps->pending, 0); |
364 | ps->irq_ack = 1; | 351 | ps->irq_ack = 1; |
365 | 352 | ||
366 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), | 353 | hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), |
367 | HRTIMER_MODE_ABS); | 354 | HRTIMER_MODE_ABS); |
368 | } | 355 | } |
369 | 356 | ||
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
639 | } | 626 | } |
640 | mutex_unlock(&pit->pit_state.lock); | 627 | mutex_unlock(&pit->pit_state.lock); |
641 | 628 | ||
642 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | 629 | atomic_set(&pit->pit_state.pending, 0); |
643 | pit->pit_state.irq_ack = 1; | 630 | pit->pit_state.irq_ack = 1; |
644 | } | 631 | } |
645 | 632 | ||
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) | |||
648 | struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); | 635 | struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); |
649 | 636 | ||
650 | if (!mask) { | 637 | if (!mask) { |
651 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | 638 | atomic_set(&pit->pit_state.pending, 0); |
652 | pit->pit_state.irq_ack = 1; | 639 | pit->pit_state.irq_ack = 1; |
653 | } | 640 | } |
654 | } | 641 | } |
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) | |||
706 | 693 | ||
707 | pit_state = &pit->pit_state; | 694 | pit_state = &pit->pit_state; |
708 | pit_state->pit = pit; | 695 | pit_state->pit = pit; |
709 | hrtimer_init(&pit_state->pit_timer.timer, | 696 | hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
710 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
711 | pit_state->irq_ack_notifier.gsi = 0; | 697 | pit_state->irq_ack_notifier.gsi = 0; |
712 | pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; | 698 | pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; |
713 | kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); | 699 | kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); |
714 | pit_state->pit_timer.reinject = true; | 700 | pit_state->reinject = true; |
715 | mutex_unlock(&pit->pit_state.lock); | 701 | mutex_unlock(&pit->pit_state.lock); |
716 | 702 | ||
717 | kvm_pit_reset(pit); | 703 | kvm_pit_reset(pit); |
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm) | |||
761 | kvm_unregister_irq_ack_notifier(kvm, | 747 | kvm_unregister_irq_ack_notifier(kvm, |
762 | &kvm->arch.vpit->pit_state.irq_ack_notifier); | 748 | &kvm->arch.vpit->pit_state.irq_ack_notifier); |
763 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 749 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
764 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; | 750 | timer = &kvm->arch.vpit->pit_state.timer; |
765 | hrtimer_cancel(timer); | 751 | hrtimer_cancel(timer); |
766 | flush_kthread_work(&kvm->arch.vpit->expired); | 752 | flush_kthread_work(&kvm->arch.vpit->expired); |
767 | kthread_stop(kvm->arch.vpit->worker_task); | 753 | kthread_stop(kvm->arch.vpit->worker_task); |
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index fdf40425ea1d..dd1b16b611b0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state { | |||
24 | struct kvm_kpit_state { | 24 | struct kvm_kpit_state { |
25 | struct kvm_kpit_channel_state channels[3]; | 25 | struct kvm_kpit_channel_state channels[3]; |
26 | u32 flags; | 26 | u32 flags; |
27 | struct kvm_timer pit_timer; | ||
28 | bool is_periodic; | 27 | bool is_periodic; |
28 | s64 period; /* unit: ns */ | ||
29 | struct hrtimer timer; | ||
30 | atomic_t pending; /* accumulated triggered timers */ | ||
31 | bool reinject; | ||
32 | struct kvm *kvm; | ||
29 | u32 speaker_data_on; | 33 | u32 speaker_data_on; |
30 | struct mutex lock; | 34 | struct mutex lock; |
31 | struct kvm_pit *pit; | 35 | struct kvm_pit *pit; |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 9fc9aa7ac703..848206df0967 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s) | |||
190 | 190 | ||
191 | int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) | 191 | int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) |
192 | { | 192 | { |
193 | int ret = -1; | 193 | int ret, irq_level; |
194 | |||
195 | BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); | ||
194 | 196 | ||
195 | pic_lock(s); | 197 | pic_lock(s); |
196 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 198 | irq_level = __kvm_irq_line_state(&s->irq_states[irq], |
197 | int irq_level = __kvm_irq_line_state(&s->irq_states[irq], | 199 | irq_source_id, level); |
198 | irq_source_id, level); | 200 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); |
199 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); | 201 | pic_update_irq(s); |
200 | pic_update_irq(s); | 202 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, |
201 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, | 203 | s->pics[irq >> 3].imr, ret == 0); |
202 | s->pics[irq >> 3].imr, ret == 0); | ||
203 | } | ||
204 | pic_unlock(s); | 204 | pic_unlock(s); |
205 | 205 | ||
206 | return ret; | 206 | return ret; |
@@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
275 | { | 275 | { |
276 | int irq, i; | 276 | int irq, i; |
277 | struct kvm_vcpu *vcpu; | 277 | struct kvm_vcpu *vcpu; |
278 | u8 irr = s->irr, isr = s->imr; | 278 | u8 edge_irr = s->irr & ~s->elcr; |
279 | bool found = false; | 279 | bool found = false; |
280 | 280 | ||
281 | s->last_irr = 0; | 281 | s->last_irr = 0; |
282 | s->irr = 0; | 282 | s->irr &= s->elcr; |
283 | s->imr = 0; | 283 | s->imr = 0; |
284 | s->isr = 0; | ||
285 | s->priority_add = 0; | 284 | s->priority_add = 0; |
286 | s->irq_base = 0; | ||
287 | s->read_reg_select = 0; | ||
288 | s->poll = 0; | ||
289 | s->special_mask = 0; | 285 | s->special_mask = 0; |
290 | s->init_state = 0; | 286 | s->read_reg_select = 0; |
291 | s->auto_eoi = 0; | 287 | if (!s->init4) { |
292 | s->rotate_on_auto_eoi = 0; | 288 | s->special_fully_nested_mode = 0; |
293 | s->special_fully_nested_mode = 0; | 289 | s->auto_eoi = 0; |
294 | s->init4 = 0; | 290 | } |
291 | s->init_state = 1; | ||
295 | 292 | ||
296 | kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) | 293 | kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) |
297 | if (kvm_apic_accept_pic_intr(vcpu)) { | 294 | if (kvm_apic_accept_pic_intr(vcpu)) { |
@@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
304 | return; | 301 | return; |
305 | 302 | ||
306 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) | 303 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) |
307 | if (irr & (1 << irq) || isr & (1 << irq)) | 304 | if (edge_irr & (1 << irq)) |
308 | pic_clear_isr(s, irq); | 305 | pic_clear_isr(s, irq); |
309 | } | 306 | } |
310 | 307 | ||
@@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
316 | addr &= 1; | 313 | addr &= 1; |
317 | if (addr == 0) { | 314 | if (addr == 0) { |
318 | if (val & 0x10) { | 315 | if (val & 0x10) { |
319 | u8 edge_irr = s->irr & ~s->elcr; | ||
320 | int i; | ||
321 | bool found = false; | ||
322 | struct kvm_vcpu *vcpu; | ||
323 | |||
324 | s->init4 = val & 1; | 316 | s->init4 = val & 1; |
325 | s->last_irr = 0; | ||
326 | s->irr &= s->elcr; | ||
327 | s->imr = 0; | ||
328 | s->priority_add = 0; | ||
329 | s->special_mask = 0; | ||
330 | s->read_reg_select = 0; | ||
331 | if (!s->init4) { | ||
332 | s->special_fully_nested_mode = 0; | ||
333 | s->auto_eoi = 0; | ||
334 | } | ||
335 | s->init_state = 1; | ||
336 | if (val & 0x02) | 317 | if (val & 0x02) |
337 | pr_pic_unimpl("single mode not supported"); | 318 | pr_pic_unimpl("single mode not supported"); |
338 | if (val & 0x08) | 319 | if (val & 0x08) |
339 | pr_pic_unimpl( | 320 | pr_pic_unimpl( |
340 | "level sensitive irq not supported"); | 321 | "level sensitive irq not supported"); |
341 | 322 | kvm_pic_reset(s); | |
342 | kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) | ||
343 | if (kvm_apic_accept_pic_intr(vcpu)) { | ||
344 | found = true; | ||
345 | break; | ||
346 | } | ||
347 | |||
348 | |||
349 | if (found) | ||
350 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) | ||
351 | if (edge_irr & (1 << irq)) | ||
352 | pic_clear_isr(s, irq); | ||
353 | } else if (val & 0x08) { | 323 | } else if (val & 0x08) { |
354 | if (val & 0x04) | 324 | if (val & 0x04) |
355 | s->poll = 1; | 325 | s->poll = 1; |
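The pic_ioport_write() hunk above replaces the open-coded ICW1 handling with a call to kvm_pic_reset(), and the reset itself (earlier in this file) now clears only edge-triggered requests: pins marked level-triggered in ELCR keep their pending IRR bits so they can be re-sampled. A tiny standalone illustration of that masking, with example register values only:

#include <stdint.h>
#include <assert.h>

int main(void)
{
        uint8_t irr  = 0xA5;    /* pending requests before reset (example value) */
        uint8_t elcr = 0x0F;    /* pins 0-3 level triggered, 4-7 edge triggered  */

        uint8_t edge_irr = irr & ~elcr; /* edge requests: dropped, and their ISR bits cleared */
        irr &= elcr;                    /* level requests: left pending for re-sampling       */

        assert(edge_irr == 0xA0);
        assert(irr == 0x05);
        return 0;
}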
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2086f2bfba33..2d03568e9498 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -70,7 +70,7 @@ struct kvm_pic { | |||
70 | struct kvm_io_device dev_slave; | 70 | struct kvm_io_device dev_slave; |
71 | struct kvm_io_device dev_eclr; | 71 | struct kvm_io_device dev_eclr; |
72 | void (*ack_notifier)(void *opaque, int irq); | 72 | void (*ack_notifier)(void *opaque, int irq); |
73 | unsigned long irq_states[16]; | 73 | unsigned long irq_states[PIC_NUM_PINS]; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | 76 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); |
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h deleted file mode 100644 index 497dbaa366d4..000000000000 --- a/arch/x86/kvm/kvm_timer.h +++ /dev/null | |||
@@ -1,18 +0,0 @@ | |||
1 | |||
2 | struct kvm_timer { | ||
3 | struct hrtimer timer; | ||
4 | s64 period; /* unit: ns */ | ||
5 | u32 timer_mode_mask; | ||
6 | u64 tscdeadline; | ||
7 | atomic_t pending; /* accumulated triggered timers */ | ||
8 | bool reinject; | ||
9 | struct kvm_timer_ops *t_ops; | ||
10 | struct kvm *kvm; | ||
11 | struct kvm_vcpu *vcpu; | ||
12 | }; | ||
13 | |||
14 | struct kvm_timer_ops { | ||
15 | bool (*is_periodic)(struct kvm_timer *); | ||
16 | }; | ||
17 | |||
18 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); | ||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ce878788a39f..c6e6b721b6ee 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <asm/current.h> | 34 | #include <asm/current.h> |
35 | #include <asm/apicdef.h> | 35 | #include <asm/apicdef.h> |
36 | #include <linux/atomic.h> | 36 | #include <linux/atomic.h> |
37 | #include <linux/jump_label.h> | ||
37 | #include "kvm_cache_regs.h" | 38 | #include "kvm_cache_regs.h" |
38 | #include "irq.h" | 39 | #include "irq.h" |
39 | #include "trace.h" | 40 | #include "trace.h" |
@@ -65,6 +66,7 @@ | |||
65 | #define APIC_DEST_NOSHORT 0x0 | 66 | #define APIC_DEST_NOSHORT 0x0 |
66 | #define APIC_DEST_MASK 0x800 | 67 | #define APIC_DEST_MASK 0x800 |
67 | #define MAX_APIC_VECTOR 256 | 68 | #define MAX_APIC_VECTOR 256 |
69 | #define APIC_VECTORS_PER_REG 32 | ||
68 | 70 | ||
69 | #define VEC_POS(v) ((v) & (32 - 1)) | 71 | #define VEC_POS(v) ((v) & (32 - 1)) |
70 | #define REG_POS(v) (((v) >> 5) << 4) | 72 | #define REG_POS(v) (((v) >> 5) << 4) |
@@ -72,11 +74,6 @@ | |||
72 | static unsigned int min_timer_period_us = 500; | 74 | static unsigned int min_timer_period_us = 500; |
73 | module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); | 75 | module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); |
74 | 76 | ||
75 | static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) | ||
76 | { | ||
77 | return *((u32 *) (apic->regs + reg_off)); | ||
78 | } | ||
79 | |||
80 | static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) | 77 | static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) |
81 | { | 78 | { |
82 | *((u32 *) (apic->regs + reg_off)) = val; | 79 | *((u32 *) (apic->regs + reg_off)) = val; |
@@ -117,19 +114,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) | |||
117 | return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | 114 | return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); |
118 | } | 115 | } |
119 | 116 | ||
120 | static inline int apic_hw_enabled(struct kvm_lapic *apic) | 117 | struct static_key_deferred apic_hw_disabled __read_mostly; |
121 | { | 118 | struct static_key_deferred apic_sw_disabled __read_mostly; |
122 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; | ||
123 | } | ||
124 | 119 | ||
125 | static inline int apic_sw_enabled(struct kvm_lapic *apic) | 120 | static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) |
126 | { | 121 | { |
127 | return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; | 122 | if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { |
123 | if (val & APIC_SPIV_APIC_ENABLED) | ||
124 | static_key_slow_dec_deferred(&apic_sw_disabled); | ||
125 | else | ||
126 | static_key_slow_inc(&apic_sw_disabled.key); | ||
127 | } | ||
128 | apic_set_reg(apic, APIC_SPIV, val); | ||
128 | } | 129 | } |
129 | 130 | ||
130 | static inline int apic_enabled(struct kvm_lapic *apic) | 131 | static inline int apic_enabled(struct kvm_lapic *apic) |
131 | { | 132 | { |
132 | return apic_sw_enabled(apic) && apic_hw_enabled(apic); | 133 | return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); |
133 | } | 134 | } |
134 | 135 | ||
135 | #define LVT_MASK \ | 136 | #define LVT_MASK \ |
@@ -139,36 +140,135 @@ static inline int apic_enabled(struct kvm_lapic *apic) | |||
139 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ | 140 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ |
140 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) | 141 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) |
141 | 142 | ||
143 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | ||
144 | { | ||
145 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
146 | } | ||
147 | |||
142 | static inline int kvm_apic_id(struct kvm_lapic *apic) | 148 | static inline int kvm_apic_id(struct kvm_lapic *apic) |
143 | { | 149 | { |
144 | return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; | 150 | return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; |
151 | } | ||
152 | |||
153 | static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) | ||
154 | { | ||
155 | u16 cid; | ||
156 | ldr >>= 32 - map->ldr_bits; | ||
157 | cid = (ldr >> map->cid_shift) & map->cid_mask; | ||
158 | |||
159 | BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); | ||
160 | |||
161 | return cid; | ||
162 | } | ||
163 | |||
164 | static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) | ||
165 | { | ||
166 | ldr >>= (32 - map->ldr_bits); | ||
167 | return ldr & map->lid_mask; | ||
168 | } | ||
169 | |||
170 | static void recalculate_apic_map(struct kvm *kvm) | ||
171 | { | ||
172 | struct kvm_apic_map *new, *old = NULL; | ||
173 | struct kvm_vcpu *vcpu; | ||
174 | int i; | ||
175 | |||
176 | new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); | ||
177 | |||
178 | mutex_lock(&kvm->arch.apic_map_lock); | ||
179 | |||
180 | if (!new) | ||
181 | goto out; | ||
182 | |||
183 | new->ldr_bits = 8; | ||
184 | /* flat mode is default */ | ||
185 | new->cid_shift = 8; | ||
186 | new->cid_mask = 0; | ||
187 | new->lid_mask = 0xff; | ||
188 | |||
189 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
190 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
191 | u16 cid, lid; | ||
192 | u32 ldr; | ||
193 | |||
194 | if (!kvm_apic_present(vcpu)) | ||
195 | continue; | ||
196 | |||
197 | /* | ||
198 | * All APICs have to be configured in the same mode by an OS. | ||
199 | * We take advantage of this while building logical id lookup | ||
200 | * table. After reset APICs are in xapic/flat mode, so if we | ||
201 | * find apic with different setting we assume this is the mode | ||
202 | * OS wants all apics to be in; build lookup table accordingly. | ||
203 | */ | ||
204 | if (apic_x2apic_mode(apic)) { | ||
205 | new->ldr_bits = 32; | ||
206 | new->cid_shift = 16; | ||
207 | new->cid_mask = new->lid_mask = 0xffff; | ||
208 | } else if (kvm_apic_sw_enabled(apic) && | ||
209 | !new->cid_mask /* flat mode */ && | ||
210 | kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { | ||
211 | new->cid_shift = 4; | ||
212 | new->cid_mask = 0xf; | ||
213 | new->lid_mask = 0xf; | ||
214 | } | ||
215 | |||
216 | new->phys_map[kvm_apic_id(apic)] = apic; | ||
217 | |||
218 | ldr = kvm_apic_get_reg(apic, APIC_LDR); | ||
219 | cid = apic_cluster_id(new, ldr); | ||
220 | lid = apic_logical_id(new, ldr); | ||
221 | |||
222 | if (lid) | ||
223 | new->logical_map[cid][ffs(lid) - 1] = apic; | ||
224 | } | ||
225 | out: | ||
226 | old = rcu_dereference_protected(kvm->arch.apic_map, | ||
227 | lockdep_is_held(&kvm->arch.apic_map_lock)); | ||
228 | rcu_assign_pointer(kvm->arch.apic_map, new); | ||
229 | mutex_unlock(&kvm->arch.apic_map_lock); | ||
230 | |||
231 | if (old) | ||
232 | kfree_rcu(old, rcu); | ||
233 | } | ||
234 | |||
235 | static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) | ||
236 | { | ||
237 | apic_set_reg(apic, APIC_ID, id << 24); | ||
238 | recalculate_apic_map(apic->vcpu->kvm); | ||
239 | } | ||
240 | |||
241 | static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) | ||
242 | { | ||
243 | apic_set_reg(apic, APIC_LDR, id); | ||
244 | recalculate_apic_map(apic->vcpu->kvm); | ||
145 | } | 245 | } |
146 | 246 | ||
147 | static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) | 247 | static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) |
148 | { | 248 | { |
149 | return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); | 249 | return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); |
150 | } | 250 | } |
151 | 251 | ||
152 | static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) | 252 | static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) |
153 | { | 253 | { |
154 | return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; | 254 | return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; |
155 | } | 255 | } |
156 | 256 | ||
157 | static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) | 257 | static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) |
158 | { | 258 | { |
159 | return ((apic_get_reg(apic, APIC_LVTT) & | 259 | return ((kvm_apic_get_reg(apic, APIC_LVTT) & |
160 | apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); | 260 | apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); |
161 | } | 261 | } |
162 | 262 | ||
163 | static inline int apic_lvtt_period(struct kvm_lapic *apic) | 263 | static inline int apic_lvtt_period(struct kvm_lapic *apic) |
164 | { | 264 | { |
165 | return ((apic_get_reg(apic, APIC_LVTT) & | 265 | return ((kvm_apic_get_reg(apic, APIC_LVTT) & |
166 | apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); | 266 | apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); |
167 | } | 267 | } |
168 | 268 | ||
169 | static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) | 269 | static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) |
170 | { | 270 | { |
171 | return ((apic_get_reg(apic, APIC_LVTT) & | 271 | return ((kvm_apic_get_reg(apic, APIC_LVTT) & |
172 | apic->lapic_timer.timer_mode_mask) == | 272 | apic->lapic_timer.timer_mode_mask) == |
173 | APIC_LVT_TIMER_TSCDEADLINE); | 273 | APIC_LVT_TIMER_TSCDEADLINE); |
174 | } | 274 | } |
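recalculate_apic_map() above builds a per-VM lookup table so interrupt delivery can resolve a destination without scanning every vCPU. The cluster/logical split of an LDR value is easy to miss in the bit-twiddling, so here is a worked example using the xAPIC cluster-mode parameters the function picks (the struct and the LDR value are only for illustration):

#include <stdint.h>
#include <stdio.h>

struct map_params {
        uint8_t  ldr_bits;
        uint8_t  cid_shift;
        uint32_t cid_mask;
        uint32_t lid_mask;
};

static uint16_t cluster_id(const struct map_params *m, uint32_t ldr)
{
        ldr >>= 32 - m->ldr_bits;
        return (ldr >> m->cid_shift) & m->cid_mask;
}

static uint16_t logical_id(const struct map_params *m, uint32_t ldr)
{
        ldr >>= 32 - m->ldr_bits;
        return ldr & m->lid_mask;
}

int main(void)
{
        /* xAPIC cluster mode parameters, as chosen by recalculate_apic_map() */
        struct map_params cluster = {
                .ldr_bits = 8, .cid_shift = 4, .cid_mask = 0xf, .lid_mask = 0xf,
        };
        uint32_t ldr = 0x52000000;      /* example LDR: cluster 5, member bit 1 */

        printf("cid=%u lid=0x%x\n", cluster_id(&cluster, ldr),
               logical_id(&cluster, ldr));      /* prints cid=5 lid=0x2 */
        return 0;
}

With those results the APIC would be stored at logical_map[5][1], since ffs(0x2) - 1 == 1; x2APIC mode works the same way with ldr_bits = 32 and a 16-bit cluster id.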
@@ -184,7 +284,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) | |||
184 | struct kvm_cpuid_entry2 *feat; | 284 | struct kvm_cpuid_entry2 *feat; |
185 | u32 v = APIC_VERSION; | 285 | u32 v = APIC_VERSION; |
186 | 286 | ||
187 | if (!irqchip_in_kernel(vcpu->kvm)) | 287 | if (!kvm_vcpu_has_lapic(vcpu)) |
188 | return; | 288 | return; |
189 | 289 | ||
190 | feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); | 290 | feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); |
@@ -193,12 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) | |||
193 | apic_set_reg(apic, APIC_LVR, v); | 293 | apic_set_reg(apic, APIC_LVR, v); |
194 | } | 294 | } |
195 | 295 | ||
196 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | 296 | static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { |
197 | { | ||
198 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
199 | } | ||
200 | |||
201 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | ||
202 | LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ | 297 | LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ |
203 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | 298 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ |
204 | LVT_MASK | APIC_MODE_MASK, /* LVTPC */ | 299 | LVT_MASK | APIC_MODE_MASK, /* LVTPC */ |
@@ -208,25 +303,30 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | |||
208 | 303 | ||
209 | static int find_highest_vector(void *bitmap) | 304 | static int find_highest_vector(void *bitmap) |
210 | { | 305 | { |
211 | u32 *word = bitmap; | 306 | int vec; |
212 | int word_offset = MAX_APIC_VECTOR >> 5; | 307 | u32 *reg; |
213 | 308 | ||
214 | while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) | 309 | for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; |
215 | continue; | 310 | vec >= 0; vec -= APIC_VECTORS_PER_REG) { |
311 | reg = bitmap + REG_POS(vec); | ||
312 | if (*reg) | ||
313 | return fls(*reg) - 1 + vec; | ||
314 | } | ||
216 | 315 | ||
217 | if (likely(!word_offset && !word[0])) | 316 | return -1; |
218 | return -1; | ||
219 | else | ||
220 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); | ||
221 | } | 317 | } |
222 | 318 | ||
223 | static u8 count_vectors(void *bitmap) | 319 | static u8 count_vectors(void *bitmap) |
224 | { | 320 | { |
225 | u32 *word = bitmap; | 321 | int vec; |
226 | int word_offset; | 322 | u32 *reg; |
227 | u8 count = 0; | 323 | u8 count = 0; |
228 | for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) | 324 | |
229 | count += hweight32(word[word_offset << 2]); | 325 | for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { |
326 | reg = bitmap + REG_POS(vec); | ||
327 | count += hweight32(*reg); | ||
328 | } | ||
329 | |||
230 | return count; | 330 | return count; |
231 | } | 331 | } |
232 | 332 | ||
@@ -285,7 +385,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) | |||
285 | 385 | ||
286 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | 386 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) |
287 | { | 387 | { |
288 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
289 | int highest_irr; | 388 | int highest_irr; |
290 | 389 | ||
291 | /* This may race with setting of irr in __apic_accept_irq() and | 390 | /* This may race with setting of irr in __apic_accept_irq() and |
@@ -293,9 +392,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | |||
293 | * will cause vmexit immediately and the value will be recalculated | 392 | * will cause vmexit immediately and the value will be recalculated |
294 | * on the next vmentry. | 393 | * on the next vmentry. |
295 | */ | 394 | */ |
296 | if (!apic) | 395 | if (!kvm_vcpu_has_lapic(vcpu)) |
297 | return 0; | 396 | return 0; |
298 | highest_irr = apic_find_highest_irr(apic); | 397 | highest_irr = apic_find_highest_irr(vcpu->arch.apic); |
299 | 398 | ||
300 | return highest_irr; | 399 | return highest_irr; |
301 | } | 400 | } |
@@ -378,8 +477,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) | |||
378 | u32 tpr, isrv, ppr, old_ppr; | 477 | u32 tpr, isrv, ppr, old_ppr; |
379 | int isr; | 478 | int isr; |
380 | 479 | ||
381 | old_ppr = apic_get_reg(apic, APIC_PROCPRI); | 480 | old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); |
382 | tpr = apic_get_reg(apic, APIC_TASKPRI); | 481 | tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); |
383 | isr = apic_find_highest_isr(apic); | 482 | isr = apic_find_highest_isr(apic); |
384 | isrv = (isr != -1) ? isr : 0; | 483 | isrv = (isr != -1) ? isr : 0; |
385 | 484 | ||
@@ -415,13 +514,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | |||
415 | u32 logical_id; | 514 | u32 logical_id; |
416 | 515 | ||
417 | if (apic_x2apic_mode(apic)) { | 516 | if (apic_x2apic_mode(apic)) { |
418 | logical_id = apic_get_reg(apic, APIC_LDR); | 517 | logical_id = kvm_apic_get_reg(apic, APIC_LDR); |
419 | return logical_id & mda; | 518 | return logical_id & mda; |
420 | } | 519 | } |
421 | 520 | ||
422 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); | 521 | logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); |
423 | 522 | ||
424 | switch (apic_get_reg(apic, APIC_DFR)) { | 523 | switch (kvm_apic_get_reg(apic, APIC_DFR)) { |
425 | case APIC_DFR_FLAT: | 524 | case APIC_DFR_FLAT: |
426 | if (logical_id & mda) | 525 | if (logical_id & mda) |
427 | result = 1; | 526 | result = 1; |
@@ -433,7 +532,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | |||
433 | break; | 532 | break; |
434 | default: | 533 | default: |
435 | apic_debug("Bad DFR vcpu %d: %08x\n", | 534 | apic_debug("Bad DFR vcpu %d: %08x\n", |
436 | apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); | 535 | apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); |
437 | break; | 536 | break; |
438 | } | 537 | } |
439 | 538 | ||
@@ -478,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | |||
478 | return result; | 577 | return result; |
479 | } | 578 | } |
480 | 579 | ||
580 | bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | ||
581 | struct kvm_lapic_irq *irq, int *r) | ||
582 | { | ||
583 | struct kvm_apic_map *map; | ||
584 | unsigned long bitmap = 1; | ||
585 | struct kvm_lapic **dst; | ||
586 | int i; | ||
587 | bool ret = false; | ||
588 | |||
589 | *r = -1; | ||
590 | |||
591 | if (irq->shorthand == APIC_DEST_SELF) { | ||
592 | *r = kvm_apic_set_irq(src->vcpu, irq); | ||
593 | return true; | ||
594 | } | ||
595 | |||
596 | if (irq->shorthand) | ||
597 | return false; | ||
598 | |||
599 | rcu_read_lock(); | ||
600 | map = rcu_dereference(kvm->arch.apic_map); | ||
601 | |||
602 | if (!map) | ||
603 | goto out; | ||
604 | |||
605 | if (irq->dest_mode == 0) { /* physical mode */ | ||
606 | if (irq->delivery_mode == APIC_DM_LOWEST || | ||
607 | irq->dest_id == 0xff) | ||
608 | goto out; | ||
609 | dst = &map->phys_map[irq->dest_id & 0xff]; | ||
610 | } else { | ||
611 | u32 mda = irq->dest_id << (32 - map->ldr_bits); | ||
612 | |||
613 | dst = map->logical_map[apic_cluster_id(map, mda)]; | ||
614 | |||
615 | bitmap = apic_logical_id(map, mda); | ||
616 | |||
617 | if (irq->delivery_mode == APIC_DM_LOWEST) { | ||
618 | int l = -1; | ||
619 | for_each_set_bit(i, &bitmap, 16) { | ||
620 | if (!dst[i]) | ||
621 | continue; | ||
622 | if (l < 0) | ||
623 | l = i; | ||
624 | else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) | ||
625 | l = i; | ||
626 | } | ||
627 | |||
628 | bitmap = (l >= 0) ? 1 << l : 0; | ||
629 | } | ||
630 | } | ||
631 | |||
632 | for_each_set_bit(i, &bitmap, 16) { | ||
633 | if (!dst[i]) | ||
634 | continue; | ||
635 | if (*r < 0) | ||
636 | *r = 0; | ||
637 | *r += kvm_apic_set_irq(dst[i]->vcpu, irq); | ||
638 | } | ||
639 | |||
640 | ret = true; | ||
641 | out: | ||
642 | rcu_read_unlock(); | ||
643 | return ret; | ||
644 | } | ||
645 | |||
481 | /* | 646 | /* |
482 | * Add a pending IRQ into lapic. | 647 | * Add a pending IRQ into lapic. |
483 | * Return 1 if successfully added and 0 if discarded. | 648 | * Return 1 if successfully added and 0 if discarded. |
@@ -591,7 +756,7 @@ static int apic_set_eoi(struct kvm_lapic *apic) | |||
591 | apic_clear_isr(vector, apic); | 756 | apic_clear_isr(vector, apic); |
592 | apic_update_ppr(apic); | 757 | apic_update_ppr(apic); |
593 | 758 | ||
594 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && | 759 | if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && |
595 | kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { | 760 | kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { |
596 | int trigger_mode; | 761 | int trigger_mode; |
597 | if (apic_test_vector(vector, apic->regs + APIC_TMR)) | 762 | if (apic_test_vector(vector, apic->regs + APIC_TMR)) |
@@ -606,8 +771,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) | |||
606 | 771 | ||
607 | static void apic_send_ipi(struct kvm_lapic *apic) | 772 | static void apic_send_ipi(struct kvm_lapic *apic) |
608 | { | 773 | { |
609 | u32 icr_low = apic_get_reg(apic, APIC_ICR); | 774 | u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); |
610 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); | 775 | u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); |
611 | struct kvm_lapic_irq irq; | 776 | struct kvm_lapic_irq irq; |
612 | 777 | ||
613 | irq.vector = icr_low & APIC_VECTOR_MASK; | 778 | irq.vector = icr_low & APIC_VECTOR_MASK; |
@@ -642,7 +807,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) | |||
642 | ASSERT(apic != NULL); | 807 | ASSERT(apic != NULL); |
643 | 808 | ||
644 | /* if initial count is 0, current count should also be 0 */ | 809 | /* if initial count is 0, current count should also be 0 */ |
645 | if (apic_get_reg(apic, APIC_TMICT) == 0) | 810 | if (kvm_apic_get_reg(apic, APIC_TMICT) == 0) |
646 | return 0; | 811 | return 0; |
647 | 812 | ||
648 | remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); | 813 | remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); |
@@ -696,13 +861,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | |||
696 | 861 | ||
697 | val = apic_get_tmcct(apic); | 862 | val = apic_get_tmcct(apic); |
698 | break; | 863 | break; |
699 | 864 | case APIC_PROCPRI: | |
865 | apic_update_ppr(apic); | ||
866 | val = kvm_apic_get_reg(apic, offset); | ||
867 | break; | ||
700 | case APIC_TASKPRI: | 868 | case APIC_TASKPRI: |
701 | report_tpr_access(apic, false); | 869 | report_tpr_access(apic, false); |
702 | /* fall thru */ | 870 | /* fall thru */ |
703 | default: | 871 | default: |
704 | apic_update_ppr(apic); | 872 | val = kvm_apic_get_reg(apic, offset); |
705 | val = apic_get_reg(apic, offset); | ||
706 | break; | 873 | break; |
707 | } | 874 | } |
708 | 875 | ||
@@ -719,7 +886,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, | |||
719 | { | 886 | { |
720 | unsigned char alignment = offset & 0xf; | 887 | unsigned char alignment = offset & 0xf; |
721 | u32 result; | 888 | u32 result; |
722 | /* this bitmask has a bit cleared for each reserver register */ | 889 | /* this bitmask has a bit cleared for each reserved register */ |
723 | static const u64 rmask = 0x43ff01ffffffe70cULL; | 890 | static const u64 rmask = 0x43ff01ffffffe70cULL; |
724 | 891 | ||
725 | if ((alignment + len) > 4) { | 892 | if ((alignment + len) > 4) { |
@@ -754,7 +921,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, | |||
754 | 921 | ||
755 | static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) | 922 | static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) |
756 | { | 923 | { |
757 | return apic_hw_enabled(apic) && | 924 | return kvm_apic_hw_enabled(apic) && |
758 | addr >= apic->base_address && | 925 | addr >= apic->base_address && |
759 | addr < apic->base_address + LAPIC_MMIO_LENGTH; | 926 | addr < apic->base_address + LAPIC_MMIO_LENGTH; |
760 | } | 927 | } |
@@ -777,7 +944,7 @@ static void update_divide_count(struct kvm_lapic *apic) | |||
777 | { | 944 | { |
778 | u32 tmp1, tmp2, tdcr; | 945 | u32 tmp1, tmp2, tdcr; |
779 | 946 | ||
780 | tdcr = apic_get_reg(apic, APIC_TDCR); | 947 | tdcr = kvm_apic_get_reg(apic, APIC_TDCR); |
781 | tmp1 = tdcr & 0xf; | 948 | tmp1 = tdcr & 0xf; |
782 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; | 949 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; |
783 | apic->divide_count = 0x1 << (tmp2 & 0x7); | 950 | apic->divide_count = 0x1 << (tmp2 & 0x7); |
@@ -792,9 +959,9 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
792 | atomic_set(&apic->lapic_timer.pending, 0); | 959 | atomic_set(&apic->lapic_timer.pending, 0); |
793 | 960 | ||
794 | if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { | 961 | if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { |
795 | /* lapic timer in oneshot or peroidic mode */ | 962 | /* lapic timer in oneshot or periodic mode */ |
796 | now = apic->lapic_timer.timer.base->get_time(); | 963 | now = apic->lapic_timer.timer.base->get_time(); |
797 | apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) | 964 | apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) |
798 | * APIC_BUS_CYCLE_NS * apic->divide_count; | 965 | * APIC_BUS_CYCLE_NS * apic->divide_count; |
799 | 966 | ||
800 | if (!apic->lapic_timer.period) | 967 | if (!apic->lapic_timer.period) |
@@ -826,7 +993,7 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
826 | "timer initial count 0x%x, period %lldns, " | 993 | "timer initial count 0x%x, period %lldns, " |
827 | "expire @ 0x%016" PRIx64 ".\n", __func__, | 994 | "expire @ 0x%016" PRIx64 ".\n", __func__, |
828 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), | 995 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), |
829 | apic_get_reg(apic, APIC_TMICT), | 996 | kvm_apic_get_reg(apic, APIC_TMICT), |
830 | apic->lapic_timer.period, | 997 | apic->lapic_timer.period, |
831 | ktime_to_ns(ktime_add_ns(now, | 998 | ktime_to_ns(ktime_add_ns(now, |
832 | apic->lapic_timer.period))); | 999 | apic->lapic_timer.period))); |
@@ -858,7 +1025,7 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
858 | 1025 | ||
859 | static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) | 1026 | static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) |
860 | { | 1027 | { |
861 | int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); | 1028 | int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); |
862 | 1029 | ||
863 | if (apic_lvt_nmi_mode(lvt0_val)) { | 1030 | if (apic_lvt_nmi_mode(lvt0_val)) { |
864 | if (!nmi_wd_enabled) { | 1031 | if (!nmi_wd_enabled) { |
@@ -879,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) | |||
879 | switch (reg) { | 1046 | switch (reg) { |
880 | case APIC_ID: /* Local APIC ID */ | 1047 | case APIC_ID: /* Local APIC ID */ |
881 | if (!apic_x2apic_mode(apic)) | 1048 | if (!apic_x2apic_mode(apic)) |
882 | apic_set_reg(apic, APIC_ID, val); | 1049 | kvm_apic_set_id(apic, val >> 24); |
883 | else | 1050 | else |
884 | ret = 1; | 1051 | ret = 1; |
885 | break; | 1052 | break; |
@@ -895,29 +1062,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) | |||
895 | 1062 | ||
896 | case APIC_LDR: | 1063 | case APIC_LDR: |
897 | if (!apic_x2apic_mode(apic)) | 1064 | if (!apic_x2apic_mode(apic)) |
898 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | 1065 | kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); |
899 | else | 1066 | else |
900 | ret = 1; | 1067 | ret = 1; |
901 | break; | 1068 | break; |
902 | 1069 | ||
903 | case APIC_DFR: | 1070 | case APIC_DFR: |
904 | if (!apic_x2apic_mode(apic)) | 1071 | if (!apic_x2apic_mode(apic)) { |
905 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | 1072 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); |
906 | else | 1073 | recalculate_apic_map(apic->vcpu->kvm); |
1074 | } else | ||
907 | ret = 1; | 1075 | ret = 1; |
908 | break; | 1076 | break; |
909 | 1077 | ||
910 | case APIC_SPIV: { | 1078 | case APIC_SPIV: { |
911 | u32 mask = 0x3ff; | 1079 | u32 mask = 0x3ff; |
912 | if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) | 1080 | if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) |
913 | mask |= APIC_SPIV_DIRECTED_EOI; | 1081 | mask |= APIC_SPIV_DIRECTED_EOI; |
914 | apic_set_reg(apic, APIC_SPIV, val & mask); | 1082 | apic_set_spiv(apic, val & mask); |
915 | if (!(val & APIC_SPIV_APIC_ENABLED)) { | 1083 | if (!(val & APIC_SPIV_APIC_ENABLED)) { |
916 | int i; | 1084 | int i; |
917 | u32 lvt_val; | 1085 | u32 lvt_val; |
918 | 1086 | ||
919 | for (i = 0; i < APIC_LVT_NUM; i++) { | 1087 | for (i = 0; i < APIC_LVT_NUM; i++) { |
920 | lvt_val = apic_get_reg(apic, | 1088 | lvt_val = kvm_apic_get_reg(apic, |
921 | APIC_LVTT + 0x10 * i); | 1089 | APIC_LVTT + 0x10 * i); |
922 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, | 1090 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, |
923 | lvt_val | APIC_LVT_MASKED); | 1091 | lvt_val | APIC_LVT_MASKED); |
@@ -946,7 +1114,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) | |||
946 | case APIC_LVT1: | 1114 | case APIC_LVT1: |
947 | case APIC_LVTERR: | 1115 | case APIC_LVTERR: |
948 | /* TODO: Check vector */ | 1116 | /* TODO: Check vector */ |
949 | if (!apic_sw_enabled(apic)) | 1117 | if (!kvm_apic_sw_enabled(apic)) |
950 | val |= APIC_LVT_MASKED; | 1118 | val |= APIC_LVT_MASKED; |
951 | 1119 | ||
952 | val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; | 1120 | val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; |
@@ -955,12 +1123,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) | |||
955 | break; | 1123 | break; |
956 | 1124 | ||
957 | case APIC_LVTT: | 1125 | case APIC_LVTT: |
958 | if ((apic_get_reg(apic, APIC_LVTT) & | 1126 | if ((kvm_apic_get_reg(apic, APIC_LVTT) & |
959 | apic->lapic_timer.timer_mode_mask) != | 1127 | apic->lapic_timer.timer_mode_mask) != |
960 | (val & apic->lapic_timer.timer_mode_mask)) | 1128 | (val & apic->lapic_timer.timer_mode_mask)) |
961 | hrtimer_cancel(&apic->lapic_timer.timer); | 1129 | hrtimer_cancel(&apic->lapic_timer.timer); |
962 | 1130 | ||
963 | if (!apic_sw_enabled(apic)) | 1131 | if (!kvm_apic_sw_enabled(apic)) |
964 | val |= APIC_LVT_MASKED; | 1132 | val |= APIC_LVT_MASKED; |
965 | val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); | 1133 | val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); |
966 | apic_set_reg(apic, APIC_LVTT, val); | 1134 | apic_set_reg(apic, APIC_LVTT, val); |
@@ -1039,24 +1207,30 @@ static int apic_mmio_write(struct kvm_io_device *this, | |||
1039 | 1207 | ||
1040 | void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) | 1208 | void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) |
1041 | { | 1209 | { |
1042 | struct kvm_lapic *apic = vcpu->arch.apic; | 1210 | if (kvm_vcpu_has_lapic(vcpu)) |
1043 | |||
1044 | if (apic) | ||
1045 | apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); | 1211 | apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); |
1046 | } | 1212 | } |
1047 | EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); | 1213 | EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); |
1048 | 1214 | ||
1049 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | 1215 | void kvm_free_lapic(struct kvm_vcpu *vcpu) |
1050 | { | 1216 | { |
1217 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1218 | |||
1051 | if (!vcpu->arch.apic) | 1219 | if (!vcpu->arch.apic) |
1052 | return; | 1220 | return; |
1053 | 1221 | ||
1054 | hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); | 1222 | hrtimer_cancel(&apic->lapic_timer.timer); |
1223 | |||
1224 | if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) | ||
1225 | static_key_slow_dec_deferred(&apic_hw_disabled); | ||
1226 | |||
1227 | if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) | ||
1228 | static_key_slow_dec_deferred(&apic_sw_disabled); | ||
1055 | 1229 | ||
1056 | if (vcpu->arch.apic->regs) | 1230 | if (apic->regs) |
1057 | free_page((unsigned long)vcpu->arch.apic->regs); | 1231 | free_page((unsigned long)apic->regs); |
1058 | 1232 | ||
1059 | kfree(vcpu->arch.apic); | 1233 | kfree(apic); |
1060 | } | 1234 | } |
1061 | 1235 | ||
1062 | /* | 1236 | /* |
@@ -1068,10 +1242,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) | |||
1068 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) | 1242 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) |
1069 | { | 1243 | { |
1070 | struct kvm_lapic *apic = vcpu->arch.apic; | 1244 | struct kvm_lapic *apic = vcpu->arch.apic; |
1071 | if (!apic) | ||
1072 | return 0; | ||
1073 | 1245 | ||
1074 | if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) | 1246 | if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || |
1247 | apic_lvtt_period(apic)) | ||
1075 | return 0; | 1248 | return 0; |
1076 | 1249 | ||
1077 | return apic->lapic_timer.tscdeadline; | 1250 | return apic->lapic_timer.tscdeadline; |
@@ -1080,10 +1253,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) | |||
1080 | void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) | 1253 | void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) |
1081 | { | 1254 | { |
1082 | struct kvm_lapic *apic = vcpu->arch.apic; | 1255 | struct kvm_lapic *apic = vcpu->arch.apic; |
1083 | if (!apic) | ||
1084 | return; | ||
1085 | 1256 | ||
1086 | if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) | 1257 | if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || |
1258 | apic_lvtt_period(apic)) | ||
1087 | return; | 1259 | return; |
1088 | 1260 | ||
1089 | hrtimer_cancel(&apic->lapic_timer.timer); | 1261 | hrtimer_cancel(&apic->lapic_timer.timer); |
@@ -1095,20 +1267,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
1095 | { | 1267 | { |
1096 | struct kvm_lapic *apic = vcpu->arch.apic; | 1268 | struct kvm_lapic *apic = vcpu->arch.apic; |
1097 | 1269 | ||
1098 | if (!apic) | 1270 | if (!kvm_vcpu_has_lapic(vcpu)) |
1099 | return; | 1271 | return; |
1272 | |||
1100 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | 1273 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) |
1101 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); | 1274 | | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); |
1102 | } | 1275 | } |
1103 | 1276 | ||
1104 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | 1277 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) |
1105 | { | 1278 | { |
1106 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1107 | u64 tpr; | 1279 | u64 tpr; |
1108 | 1280 | ||
1109 | if (!apic) | 1281 | if (!kvm_vcpu_has_lapic(vcpu)) |
1110 | return 0; | 1282 | return 0; |
1111 | tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); | 1283 | |
1284 | tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); | ||
1112 | 1285 | ||
1113 | return (tpr & 0xf0) >> 4; | 1286 | return (tpr & 0xf0) >> 4; |
1114 | } | 1287 | } |
@@ -1123,6 +1296,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
1123 | return; | 1296 | return; |
1124 | } | 1297 | } |
1125 | 1298 | ||
1299 | /* update jump label if enable bit changes */ | ||
1300 | if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { | ||
1301 | if (value & MSR_IA32_APICBASE_ENABLE) | ||
1302 | static_key_slow_dec_deferred(&apic_hw_disabled); | ||
1303 | else | ||
1304 | static_key_slow_inc(&apic_hw_disabled.key); | ||
1305 | recalculate_apic_map(vcpu->kvm); | ||
1306 | } | ||
1307 | |||
1126 | if (!kvm_vcpu_is_bsp(apic->vcpu)) | 1308 | if (!kvm_vcpu_is_bsp(apic->vcpu)) |
1127 | value &= ~MSR_IA32_APICBASE_BSP; | 1309 | value &= ~MSR_IA32_APICBASE_BSP; |
1128 | 1310 | ||
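This hunk makes kvm_lapic_set_base() keep the apic_hw_disabled jump label in sync with the enable bit, so the common case - every APIC hardware-enabled - can skip the per-vCPU test entirely. A userspace analog of the counting pattern (a plain integer where the kernel uses a static key, and with invented names):

#include <stdbool.h>
#include <stdint.h>

#define MSR_IA32_APICBASE_ENABLE (1u << 11)

static int apic_hw_disabled;    /* how many vCPUs have the enable bit clear */

struct vcpu {
        uint64_t apic_base;
};

static bool apic_hw_enabled(struct vcpu *v)
{
        if (apic_hw_disabled)   /* slow path: some APIC is disabled somewhere */
                return v->apic_base & MSR_IA32_APICBASE_ENABLE;
        return true;            /* fast path: no APIC is disabled, skip the test */
}

static void apic_base_changed(struct vcpu *v, uint64_t new_base)
{
        /* keep the counter in sync whenever the enable bit flips */
        if ((v->apic_base ^ new_base) & MSR_IA32_APICBASE_ENABLE) {
                if (new_base & MSR_IA32_APICBASE_ENABLE)
                        apic_hw_disabled--;
                else
                        apic_hw_disabled++;
        }
        v->apic_base = new_base;
}

int main(void)
{
        struct vcpu v = { .apic_base = MSR_IA32_APICBASE_ENABLE };

        apic_base_changed(&v, 0);                               /* guest disables its APIC */
        apic_base_changed(&v, MSR_IA32_APICBASE_ENABLE);        /* and turns it back on    */
        return apic_hw_enabled(&v) ? 0 : 1;
}

The apic_sw_disabled key added near the top of the file is maintained the same way from apic_set_spiv().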
@@ -1130,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
1130 | if (apic_x2apic_mode(apic)) { | 1312 | if (apic_x2apic_mode(apic)) { |
1131 | u32 id = kvm_apic_id(apic); | 1313 | u32 id = kvm_apic_id(apic); |
1132 | u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); | 1314 | u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); |
1133 | apic_set_reg(apic, APIC_LDR, ldr); | 1315 | kvm_apic_set_ldr(apic, ldr); |
1134 | } | 1316 | } |
1135 | apic->base_address = apic->vcpu->arch.apic_base & | 1317 | apic->base_address = apic->vcpu->arch.apic_base & |
1136 | MSR_IA32_APICBASE_BASE; | 1318 | MSR_IA32_APICBASE_BASE; |
@@ -1155,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1155 | /* Stop the timer in case it's a reset to an active apic */ | 1337 | /* Stop the timer in case it's a reset to an active apic */ |
1156 | hrtimer_cancel(&apic->lapic_timer.timer); | 1338 | hrtimer_cancel(&apic->lapic_timer.timer); |
1157 | 1339 | ||
1158 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | 1340 | kvm_apic_set_id(apic, vcpu->vcpu_id); |
1159 | kvm_apic_set_version(apic->vcpu); | 1341 | kvm_apic_set_version(apic->vcpu); |
1160 | 1342 | ||
1161 | for (i = 0; i < APIC_LVT_NUM; i++) | 1343 | for (i = 0; i < APIC_LVT_NUM; i++) |
@@ -1164,9 +1346,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1164 | SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); | 1346 | SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); |
1165 | 1347 | ||
1166 | apic_set_reg(apic, APIC_DFR, 0xffffffffU); | 1348 | apic_set_reg(apic, APIC_DFR, 0xffffffffU); |
1167 | apic_set_reg(apic, APIC_SPIV, 0xff); | 1349 | apic_set_spiv(apic, 0xff); |
1168 | apic_set_reg(apic, APIC_TASKPRI, 0); | 1350 | apic_set_reg(apic, APIC_TASKPRI, 0); |
1169 | apic_set_reg(apic, APIC_LDR, 0); | 1351 | kvm_apic_set_ldr(apic, 0); |
1170 | apic_set_reg(apic, APIC_ESR, 0); | 1352 | apic_set_reg(apic, APIC_ESR, 0); |
1171 | apic_set_reg(apic, APIC_ICR, 0); | 1353 | apic_set_reg(apic, APIC_ICR, 0); |
1172 | apic_set_reg(apic, APIC_ICR2, 0); | 1354 | apic_set_reg(apic, APIC_ICR2, 0); |
@@ -1183,7 +1365,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1183 | update_divide_count(apic); | 1365 | update_divide_count(apic); |
1184 | atomic_set(&apic->lapic_timer.pending, 0); | 1366 | atomic_set(&apic->lapic_timer.pending, 0); |
1185 | if (kvm_vcpu_is_bsp(vcpu)) | 1367 | if (kvm_vcpu_is_bsp(vcpu)) |
1186 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 1368 | kvm_lapic_set_base(vcpu, |
1369 | vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); | ||
1187 | vcpu->arch.pv_eoi.msr_val = 0; | 1370 | vcpu->arch.pv_eoi.msr_val = 0; |
1188 | apic_update_ppr(apic); | 1371 | apic_update_ppr(apic); |
1189 | 1372 | ||
@@ -1196,45 +1379,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1196 | vcpu->arch.apic_base, apic->base_address); | 1379 | vcpu->arch.apic_base, apic->base_address); |
1197 | } | 1380 | } |
1198 | 1381 | ||
1199 | bool kvm_apic_present(struct kvm_vcpu *vcpu) | ||
1200 | { | ||
1201 | return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); | ||
1202 | } | ||
1203 | |||
1204 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | ||
1205 | { | ||
1206 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); | ||
1207 | } | ||
1208 | |||
1209 | /* | 1382 | /* |
1210 | *---------------------------------------------------------------------- | 1383 | *---------------------------------------------------------------------- |
1211 | * timer interface | 1384 | * timer interface |
1212 | *---------------------------------------------------------------------- | 1385 | *---------------------------------------------------------------------- |
1213 | */ | 1386 | */ |
1214 | 1387 | ||
1215 | static bool lapic_is_periodic(struct kvm_timer *ktimer) | 1388 | static bool lapic_is_periodic(struct kvm_lapic *apic) |
1216 | { | 1389 | { |
1217 | struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, | ||
1218 | lapic_timer); | ||
1219 | return apic_lvtt_period(apic); | 1390 | return apic_lvtt_period(apic); |
1220 | } | 1391 | } |
1221 | 1392 | ||
1222 | int apic_has_pending_timer(struct kvm_vcpu *vcpu) | 1393 | int apic_has_pending_timer(struct kvm_vcpu *vcpu) |
1223 | { | 1394 | { |
1224 | struct kvm_lapic *lapic = vcpu->arch.apic; | 1395 | struct kvm_lapic *apic = vcpu->arch.apic; |
1225 | 1396 | ||
1226 | if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) | 1397 | if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && |
1227 | return atomic_read(&lapic->lapic_timer.pending); | 1398 | apic_lvt_enabled(apic, APIC_LVTT)) |
1399 | return atomic_read(&apic->lapic_timer.pending); | ||
1228 | 1400 | ||
1229 | return 0; | 1401 | return 0; |
1230 | } | 1402 | } |
1231 | 1403 | ||
1232 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) | 1404 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) |
1233 | { | 1405 | { |
1234 | u32 reg = apic_get_reg(apic, lvt_type); | 1406 | u32 reg = kvm_apic_get_reg(apic, lvt_type); |
1235 | int vector, mode, trig_mode; | 1407 | int vector, mode, trig_mode; |
1236 | 1408 | ||
1237 | if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { | 1409 | if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { |
1238 | vector = reg & APIC_VECTOR_MASK; | 1410 | vector = reg & APIC_VECTOR_MASK; |
1239 | mode = reg & APIC_MODE_MASK; | 1411 | mode = reg & APIC_MODE_MASK; |
1240 | trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; | 1412 | trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; |
@@ -1251,15 +1423,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) | |||
1251 | kvm_apic_local_deliver(apic, APIC_LVT0); | 1423 | kvm_apic_local_deliver(apic, APIC_LVT0); |
1252 | } | 1424 | } |
1253 | 1425 | ||
1254 | static struct kvm_timer_ops lapic_timer_ops = { | ||
1255 | .is_periodic = lapic_is_periodic, | ||
1256 | }; | ||
1257 | |||
1258 | static const struct kvm_io_device_ops apic_mmio_ops = { | 1426 | static const struct kvm_io_device_ops apic_mmio_ops = { |
1259 | .read = apic_mmio_read, | 1427 | .read = apic_mmio_read, |
1260 | .write = apic_mmio_write, | 1428 | .write = apic_mmio_write, |
1261 | }; | 1429 | }; |
1262 | 1430 | ||
1431 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | ||
1432 | { | ||
1433 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | ||
1434 | struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); | ||
1435 | struct kvm_vcpu *vcpu = apic->vcpu; | ||
1436 | wait_queue_head_t *q = &vcpu->wq; | ||
1437 | |||
1438 | /* | ||
1439 | * There is a race window between reading and incrementing, but we do | ||
1440 | * not care about potentially losing timer events in the !reinject | ||
1441 | * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked | ||
1442 | * in vcpu_enter_guest. | ||
1443 | */ | ||
1444 | if (!atomic_read(&ktimer->pending)) { | ||
1445 | atomic_inc(&ktimer->pending); | ||
1446 | /* FIXME: this code should not know anything about vcpus */ | ||
1447 | kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); | ||
1448 | } | ||
1449 | |||
1450 | if (waitqueue_active(q)) | ||
1451 | wake_up_interruptible(q); | ||
1452 | |||
1453 | if (lapic_is_periodic(apic)) { | ||
1454 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | ||
1455 | return HRTIMER_RESTART; | ||
1456 | } else | ||
1457 | return HRTIMER_NORESTART; | ||
1458 | } | ||
1459 | |||
1263 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | 1460 | int kvm_create_lapic(struct kvm_vcpu *vcpu) |
1264 | { | 1461 | { |
1265 | struct kvm_lapic *apic; | 1462 | struct kvm_lapic *apic; |
@@ -1283,14 +1480,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) | |||
1283 | 1480 | ||
1284 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, | 1481 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, |
1285 | HRTIMER_MODE_ABS); | 1482 | HRTIMER_MODE_ABS); |
1286 | apic->lapic_timer.timer.function = kvm_timer_fn; | 1483 | apic->lapic_timer.timer.function = apic_timer_fn; |
1287 | apic->lapic_timer.t_ops = &lapic_timer_ops; | ||
1288 | apic->lapic_timer.kvm = vcpu->kvm; | ||
1289 | apic->lapic_timer.vcpu = vcpu; | ||
1290 | 1484 | ||
1291 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | 1485 | /* |
1292 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | 1486 | * APIC is created enabled. This will prevent kvm_lapic_set_base from |
1487 | * thinking that APIC state has changed. | ||
1488 | */ | ||
1489 | vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; | ||
1490 | kvm_lapic_set_base(vcpu, | ||
1491 | APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); | ||
1293 | 1492 | ||
1493 | static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ | ||
1294 | kvm_lapic_reset(vcpu); | 1494 | kvm_lapic_reset(vcpu); |
1295 | kvm_iodevice_init(&apic->dev, &apic_mmio_ops); | 1495 | kvm_iodevice_init(&apic->dev, &apic_mmio_ops); |
1296 | 1496 | ||
@@ -1306,23 +1506,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | |||
1306 | struct kvm_lapic *apic = vcpu->arch.apic; | 1506 | struct kvm_lapic *apic = vcpu->arch.apic; |
1307 | int highest_irr; | 1507 | int highest_irr; |
1308 | 1508 | ||
1309 | if (!apic || !apic_enabled(apic)) | 1509 | if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) |
1310 | return -1; | 1510 | return -1; |
1311 | 1511 | ||
1312 | apic_update_ppr(apic); | 1512 | apic_update_ppr(apic); |
1313 | highest_irr = apic_find_highest_irr(apic); | 1513 | highest_irr = apic_find_highest_irr(apic); |
1314 | if ((highest_irr == -1) || | 1514 | if ((highest_irr == -1) || |
1315 | ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) | 1515 | ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) |
1316 | return -1; | 1516 | return -1; |
1317 | return highest_irr; | 1517 | return highest_irr; |
1318 | } | 1518 | } |
1319 | 1519 | ||
1320 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | 1520 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) |
1321 | { | 1521 | { |
1322 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | 1522 | u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); |
1323 | int r = 0; | 1523 | int r = 0; |
1324 | 1524 | ||
1325 | if (!apic_hw_enabled(vcpu->arch.apic)) | 1525 | if (!kvm_apic_hw_enabled(vcpu->arch.apic)) |
1326 | r = 1; | 1526 | r = 1; |
1327 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | 1527 | if ((lvt0 & APIC_LVT_MASKED) == 0 && |
1328 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) | 1528 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) |
@@ -1334,7 +1534,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | |||
1334 | { | 1534 | { |
1335 | struct kvm_lapic *apic = vcpu->arch.apic; | 1535 | struct kvm_lapic *apic = vcpu->arch.apic; |
1336 | 1536 | ||
1337 | if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { | 1537 | if (!kvm_vcpu_has_lapic(vcpu)) |
1538 | return; | ||
1539 | |||
1540 | if (atomic_read(&apic->lapic_timer.pending) > 0) { | ||
1338 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) | 1541 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) |
1339 | atomic_dec(&apic->lapic_timer.pending); | 1542 | atomic_dec(&apic->lapic_timer.pending); |
1340 | } | 1543 | } |
@@ -1354,12 +1557,17 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) | |||
1354 | return vector; | 1557 | return vector; |
1355 | } | 1558 | } |
1356 | 1559 | ||
1357 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | 1560 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, |
1561 | struct kvm_lapic_state *s) | ||
1358 | { | 1562 | { |
1359 | struct kvm_lapic *apic = vcpu->arch.apic; | 1563 | struct kvm_lapic *apic = vcpu->arch.apic; |
1360 | 1564 | ||
1361 | apic->base_address = vcpu->arch.apic_base & | 1565 | kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); |
1362 | MSR_IA32_APICBASE_BASE; | 1566 | /* set SPIV separately to get count of SW disabled APICs right */ |
1567 | apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); | ||
1568 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | ||
1569 | /* call kvm_apic_set_id() to put apic into apic_map */ | ||
1570 | kvm_apic_set_id(apic, kvm_apic_id(apic)); | ||
1363 | kvm_apic_set_version(vcpu); | 1571 | kvm_apic_set_version(vcpu); |
1364 | 1572 | ||
1365 | apic_update_ppr(apic); | 1573 | apic_update_ppr(apic); |
@@ -1374,13 +1582,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1374 | 1582 | ||
1375 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | 1583 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) |
1376 | { | 1584 | { |
1377 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1378 | struct hrtimer *timer; | 1585 | struct hrtimer *timer; |
1379 | 1586 | ||
1380 | if (!apic) | 1587 | if (!kvm_vcpu_has_lapic(vcpu)) |
1381 | return; | 1588 | return; |
1382 | 1589 | ||
1383 | timer = &apic->lapic_timer.timer; | 1590 | timer = &vcpu->arch.apic->lapic_timer.timer; |
1384 | if (hrtimer_cancel(timer)) | 1591 | if (hrtimer_cancel(timer)) |
1385 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 1592 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
1386 | } | 1593 | } |
@@ -1478,7 +1685,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) | |||
1478 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) | 1685 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) |
1479 | return; | 1686 | return; |
1480 | 1687 | ||
1481 | tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; | 1688 | tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; |
1482 | max_irr = apic_find_highest_irr(apic); | 1689 | max_irr = apic_find_highest_irr(apic); |
1483 | if (max_irr < 0) | 1690 | if (max_irr < 0) |
1484 | max_irr = 0; | 1691 | max_irr = 0; |
@@ -1537,7 +1744,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) | |||
1537 | { | 1744 | { |
1538 | struct kvm_lapic *apic = vcpu->arch.apic; | 1745 | struct kvm_lapic *apic = vcpu->arch.apic; |
1539 | 1746 | ||
1540 | if (!irqchip_in_kernel(vcpu->kvm)) | 1747 | if (!kvm_vcpu_has_lapic(vcpu)) |
1541 | return 1; | 1748 | return 1; |
1542 | 1749 | ||
1543 | /* if this is ICR write vector before command */ | 1750 | /* if this is ICR write vector before command */ |
@@ -1551,7 +1758,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) | |||
1551 | struct kvm_lapic *apic = vcpu->arch.apic; | 1758 | struct kvm_lapic *apic = vcpu->arch.apic; |
1552 | u32 low, high = 0; | 1759 | u32 low, high = 0; |
1553 | 1760 | ||
1554 | if (!irqchip_in_kernel(vcpu->kvm)) | 1761 | if (!kvm_vcpu_has_lapic(vcpu)) |
1555 | return 1; | 1762 | return 1; |
1556 | 1763 | ||
1557 | if (apic_reg_read(apic, reg, 4, &low)) | 1764 | if (apic_reg_read(apic, reg, 4, &low)) |
@@ -1576,3 +1783,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) | |||
1576 | return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, | 1783 | return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, |
1577 | addr); | 1784 | addr); |
1578 | } | 1785 | } |
1786 | |||
1787 | void kvm_lapic_init(void) | ||
1788 | { | ||
1789 | /* do not patch jump label more than once per second */ | ||
1790 | jump_label_rate_limit(&apic_hw_disabled, HZ); | ||
1791 | jump_label_rate_limit(&apic_sw_disabled, HZ); | ||
1792 | } | ||
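The two deferred static keys registered here count how many local APICs are hardware- or software-disabled; while both counts stay at zero the checks in lapic.h compile down to fall-through branches, and the HZ rate limit keeps a guest that toggles SPIV in a tight loop from forcing a jump-label repatch on every flip. As a rough sketch of how such a deferred key is typically balanced (the actual inc/dec sites sit outside this hunk, so the names below are purely illustrative):

	/* illustrative only -- not part of this patch */
	#include <linux/jump_label.h>

	static struct static_key_deferred example_sw_disabled;

	static void example_set_spiv(bool sw_enabled)
	{
		if (sw_enabled)
			/* deferred: text is repatched at most once per rate-limit period */
			static_key_slow_dec_deferred(&example_sw_disabled);
		else
			static_key_slow_inc(&example_sw_disabled.key);
	}

	static bool example_sw_is_disabled(void)
	{
		/* compiles to a straight-line "false" while the count is zero */
		return static_key_false(&example_sw_disabled.key);
	}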
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4af5405ae1e2..e5ebf9f3571f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -2,10 +2,17 @@ | |||
2 | #define __KVM_X86_LAPIC_H | 2 | #define __KVM_X86_LAPIC_H |
3 | 3 | ||
4 | #include "iodev.h" | 4 | #include "iodev.h" |
5 | #include "kvm_timer.h" | ||
6 | 5 | ||
7 | #include <linux/kvm_host.h> | 6 | #include <linux/kvm_host.h> |
8 | 7 | ||
8 | struct kvm_timer { | ||
9 | struct hrtimer timer; | ||
10 | s64 period; /* unit: ns */ | ||
11 | u32 timer_mode_mask; | ||
12 | u64 tscdeadline; | ||
13 | atomic_t pending; /* accumulated triggered timers */ | ||
14 | }; | ||
15 | |||
9 | struct kvm_lapic { | 16 | struct kvm_lapic { |
10 | unsigned long base_address; | 17 | unsigned long base_address; |
11 | struct kvm_io_device dev; | 18 | struct kvm_io_device dev; |
@@ -45,11 +52,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | |||
45 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); | 52 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); |
46 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); | 53 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); |
47 | 54 | ||
55 | bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | ||
56 | struct kvm_lapic_irq *irq, int *r); | ||
57 | |||
48 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | 58 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); |
49 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | 59 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); |
50 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | 60 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, |
51 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | 61 | struct kvm_lapic_state *s); |
52 | bool kvm_apic_present(struct kvm_vcpu *vcpu); | ||
53 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | 62 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); |
54 | 63 | ||
55 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); | 64 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); |
@@ -71,4 +80,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) | |||
71 | } | 80 | } |
72 | 81 | ||
73 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); | 82 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); |
83 | void kvm_lapic_init(void); | ||
84 | |||
85 | static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) | ||
86 | { | ||
87 | return *((u32 *) (apic->regs + reg_off)); | ||
88 | } | ||
89 | |||
90 | extern struct static_key kvm_no_apic_vcpu; | ||
91 | |||
92 | static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) | ||
93 | { | ||
94 | if (static_key_false(&kvm_no_apic_vcpu)) | ||
95 | return vcpu->arch.apic; | ||
96 | return true; | ||
97 | } | ||
98 | |||
99 | extern struct static_key_deferred apic_hw_disabled; | ||
100 | |||
101 | static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) | ||
102 | { | ||
103 | if (static_key_false(&apic_hw_disabled.key)) | ||
104 | return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; | ||
105 | return MSR_IA32_APICBASE_ENABLE; | ||
106 | } | ||
107 | |||
108 | extern struct static_key_deferred apic_sw_disabled; | ||
109 | |||
110 | static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) | ||
111 | { | ||
112 | if (static_key_false(&apic_sw_disabled.key)) | ||
113 | return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; | ||
114 | return APIC_SPIV_APIC_ENABLED; | ||
115 | } | ||
116 | |||
117 | static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) | ||
118 | { | ||
119 | return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); | ||
120 | } | ||
121 | |||
122 | static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | ||
123 | { | ||
124 | return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); | ||
125 | } | ||
126 | |||
74 | #endif | 127 | #endif |
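These new inlines replace the open-coded NULL checks and register reads that used to be scattered through lapic.c and x86.c. kvm_vcpu_has_lapic() is the notable one: the pointer test is hidden behind the kvm_no_apic_vcpu static key, presumably bumped wherever a vcpu is created without an in-kernel APIC, so in the common all-APIC configuration the helper folds to a constant true. A hypothetical caller, only to show the intended shape:

	/* hypothetical example, not from the patch */
	static u32 example_read_tpr(struct kvm_vcpu *vcpu)
	{
		if (!kvm_vcpu_has_lapic(vcpu))	/* usually patched out entirely */
			return 0;
		return kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
	}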
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7fbd0d273ea8..d289fee1ffb8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep) | |||
556 | return 0; | 556 | return 0; |
557 | 557 | ||
558 | pfn = spte_to_pfn(old_spte); | 558 | pfn = spte_to_pfn(old_spte); |
559 | |||
560 | /* | ||
561 | * KVM does not hold a refcount on the page used by | ||
562 | * the kvm mmu; before reclaiming the page, we should | ||
563 | * unmap it from the mmu first. | ||
564 | */ | ||
565 | WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); | ||
566 | |||
559 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 567 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) |
560 | kvm_set_pfn_accessed(pfn); | 568 | kvm_set_pfn_accessed(pfn); |
561 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) | 569 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) |
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) | |||
960 | static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, | 968 | static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, |
961 | struct kvm_memory_slot *slot) | 969 | struct kvm_memory_slot *slot) |
962 | { | 970 | { |
963 | struct kvm_lpage_info *linfo; | 971 | unsigned long idx; |
964 | |||
965 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
966 | return &slot->rmap[gfn - slot->base_gfn]; | ||
967 | 972 | ||
968 | linfo = lpage_info_slot(gfn, slot, level); | 973 | idx = gfn_to_index(gfn, slot->base_gfn, level); |
969 | return &linfo->rmap_pde; | 974 | return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; |
970 | } | 975 | } |
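The old rmap/rmap_pde split becomes a per-level array indexed by gfn_to_index(). Assuming the usual x86 definition, where each level above 4K covers 9 more gfn bits, the index is simply the gfn's offset from the slot base at that level's granularity; a worked example under that assumption:

	/*
	 * gfn_to_index(gfn, base, level) assumed to be
	 *     (gfn >> ((level - 1) * 9)) - (base >> ((level - 1) * 9))
	 *
	 * base_gfn = 0x1000, gfn = 0x1abc:
	 *   level 1 (4K): idx = 0x1abc - 0x1000                 = 0xabc
	 *   level 2 (2M): idx = (0x1abc >> 9)  - (0x1000 >> 9)  = 0xd - 0x8 = 5
	 *   level 3 (1G): idx = (0x1abc >> 18) - (0x1000 >> 18) = 0
	 *
	 * so &slot->arch.rmap[level - 1][idx] heads the rmap chain for the
	 * large page containing this gfn at that level.
	 */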
971 | 976 | ||
972 | /* | 977 | /* |
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
1173 | unsigned long *rmapp; | 1178 | unsigned long *rmapp; |
1174 | 1179 | ||
1175 | while (mask) { | 1180 | while (mask) { |
1176 | rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; | 1181 | rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), |
1182 | PT_PAGE_TABLE_LEVEL, slot); | ||
1177 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); | 1183 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); |
1178 | 1184 | ||
1179 | /* clear the first set bit */ | 1185 | /* clear the first set bit */ |
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
1200 | } | 1206 | } |
1201 | 1207 | ||
1202 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1208 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1203 | unsigned long data) | 1209 | struct kvm_memory_slot *slot, unsigned long data) |
1204 | { | 1210 | { |
1205 | u64 *sptep; | 1211 | u64 *sptep; |
1206 | struct rmap_iterator iter; | 1212 | struct rmap_iterator iter; |
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1218 | } | 1224 | } |
1219 | 1225 | ||
1220 | static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1226 | static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1221 | unsigned long data) | 1227 | struct kvm_memory_slot *slot, unsigned long data) |
1222 | { | 1228 | { |
1223 | u64 *sptep; | 1229 | u64 *sptep; |
1224 | struct rmap_iterator iter; | 1230 | struct rmap_iterator iter; |
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1259 | return 0; | 1265 | return 0; |
1260 | } | 1266 | } |
1261 | 1267 | ||
1262 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | 1268 | static int kvm_handle_hva_range(struct kvm *kvm, |
1263 | unsigned long data, | 1269 | unsigned long start, |
1264 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, | 1270 | unsigned long end, |
1265 | unsigned long data)) | 1271 | unsigned long data, |
1272 | int (*handler)(struct kvm *kvm, | ||
1273 | unsigned long *rmapp, | ||
1274 | struct kvm_memory_slot *slot, | ||
1275 | unsigned long data)) | ||
1266 | { | 1276 | { |
1267 | int j; | 1277 | int j; |
1268 | int ret; | 1278 | int ret = 0; |
1269 | int retval = 0; | ||
1270 | struct kvm_memslots *slots; | 1279 | struct kvm_memslots *slots; |
1271 | struct kvm_memory_slot *memslot; | 1280 | struct kvm_memory_slot *memslot; |
1272 | 1281 | ||
1273 | slots = kvm_memslots(kvm); | 1282 | slots = kvm_memslots(kvm); |
1274 | 1283 | ||
1275 | kvm_for_each_memslot(memslot, slots) { | 1284 | kvm_for_each_memslot(memslot, slots) { |
1276 | unsigned long start = memslot->userspace_addr; | 1285 | unsigned long hva_start, hva_end; |
1277 | unsigned long end; | 1286 | gfn_t gfn_start, gfn_end; |
1278 | 1287 | ||
1279 | end = start + (memslot->npages << PAGE_SHIFT); | 1288 | hva_start = max(start, memslot->userspace_addr); |
1280 | if (hva >= start && hva < end) { | 1289 | hva_end = min(end, memslot->userspace_addr + |
1281 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 1290 | (memslot->npages << PAGE_SHIFT)); |
1282 | gfn_t gfn = memslot->base_gfn + gfn_offset; | 1291 | if (hva_start >= hva_end) |
1292 | continue; | ||
1293 | /* | ||
1294 | * {gfn(page) | page intersects with [hva_start, hva_end)} = | ||
1295 | * {gfn_start, gfn_start+1, ..., gfn_end-1}. | ||
1296 | */ | ||
1297 | gfn_start = hva_to_gfn_memslot(hva_start, memslot); | ||
1298 | gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); | ||
1283 | 1299 | ||
1284 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 1300 | for (j = PT_PAGE_TABLE_LEVEL; |
1301 | j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { | ||
1302 | unsigned long idx, idx_end; | ||
1303 | unsigned long *rmapp; | ||
1285 | 1304 | ||
1286 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 1305 | /* |
1287 | struct kvm_lpage_info *linfo; | 1306 | * {idx(page_j) | page_j intersects with |
1307 | * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. | ||
1308 | */ | ||
1309 | idx = gfn_to_index(gfn_start, memslot->base_gfn, j); | ||
1310 | idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); | ||
1288 | 1311 | ||
1289 | linfo = lpage_info_slot(gfn, memslot, | 1312 | rmapp = __gfn_to_rmap(gfn_start, j, memslot); |
1290 | PT_DIRECTORY_LEVEL + j); | 1313 | |
1291 | ret |= handler(kvm, &linfo->rmap_pde, data); | 1314 | for (; idx <= idx_end; ++idx) |
1292 | } | 1315 | ret |= handler(kvm, rmapp++, memslot, data); |
1293 | trace_kvm_age_page(hva, memslot, ret); | ||
1294 | retval |= ret; | ||
1295 | } | 1316 | } |
1296 | } | 1317 | } |
1297 | 1318 | ||
1298 | return retval; | 1319 | return ret; |
1320 | } | ||
1321 | |||
1322 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | ||
1323 | unsigned long data, | ||
1324 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, | ||
1325 | struct kvm_memory_slot *slot, | ||
1326 | unsigned long data)) | ||
1327 | { | ||
1328 | return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); | ||
1299 | } | 1329 | } |
1300 | 1330 | ||
1301 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) | 1331 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) |
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) | |||
1303 | return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); | 1333 | return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); |
1304 | } | 1334 | } |
1305 | 1335 | ||
1336 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) | ||
1337 | { | ||
1338 | return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); | ||
1339 | } | ||
1340 | |||
1306 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 1341 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
1307 | { | 1342 | { |
1308 | kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); | 1343 | kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); |
1309 | } | 1344 | } |
1310 | 1345 | ||
1311 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1346 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1312 | unsigned long data) | 1347 | struct kvm_memory_slot *slot, unsigned long data) |
1313 | { | 1348 | { |
1314 | u64 *sptep; | 1349 | u64 *sptep; |
1315 | struct rmap_iterator uninitialized_var(iter); | 1350 | struct rmap_iterator uninitialized_var(iter); |
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1323 | * This has some overhead, but not as much as the cost of swapping | 1358 | * This has some overhead, but not as much as the cost of swapping |
1324 | * out actively used pages or breaking up actively used hugepages. | 1359 | * out actively used pages or breaking up actively used hugepages. |
1325 | */ | 1360 | */ |
1326 | if (!shadow_accessed_mask) | 1361 | if (!shadow_accessed_mask) { |
1327 | return kvm_unmap_rmapp(kvm, rmapp, data); | 1362 | young = kvm_unmap_rmapp(kvm, rmapp, slot, data); |
1363 | goto out; | ||
1364 | } | ||
1328 | 1365 | ||
1329 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 1366 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
1330 | sptep = rmap_get_next(&iter)) { | 1367 | sptep = rmap_get_next(&iter)) { |
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1336 | (unsigned long *)sptep); | 1373 | (unsigned long *)sptep); |
1337 | } | 1374 | } |
1338 | } | 1375 | } |
1339 | 1376 | out: | |
1377 | /* @data has hva passed to kvm_age_hva(). */ | ||
1378 | trace_kvm_age_page(data, slot, young); | ||
1340 | return young; | 1379 | return young; |
1341 | } | 1380 | } |
1342 | 1381 | ||
1343 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1382 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1344 | unsigned long data) | 1383 | struct kvm_memory_slot *slot, unsigned long data) |
1345 | { | 1384 | { |
1346 | u64 *sptep; | 1385 | u64 *sptep; |
1347 | struct rmap_iterator iter; | 1386 | struct rmap_iterator iter; |
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
1379 | 1418 | ||
1380 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | 1419 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
1381 | 1420 | ||
1382 | kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); | 1421 | kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); |
1383 | kvm_flush_remote_tlbs(vcpu->kvm); | 1422 | kvm_flush_remote_tlbs(vcpu->kvm); |
1384 | } | 1423 | } |
1385 | 1424 | ||
1386 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) | 1425 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) |
1387 | { | 1426 | { |
1388 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); | 1427 | return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); |
1389 | } | 1428 | } |
1390 | 1429 | ||
1391 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | 1430 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) |
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2457 | rmap_recycle(vcpu, sptep, gfn); | 2496 | rmap_recycle(vcpu, sptep, gfn); |
2458 | } | 2497 | } |
2459 | } | 2498 | } |
2460 | kvm_release_pfn_clean(pfn); | 2499 | |
2500 | if (!is_error_pfn(pfn)) | ||
2501 | kvm_release_pfn_clean(pfn); | ||
2461 | } | 2502 | } |
2462 | 2503 | ||
2463 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | 2504 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) |
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2469 | bool no_dirty_log) | 2510 | bool no_dirty_log) |
2470 | { | 2511 | { |
2471 | struct kvm_memory_slot *slot; | 2512 | struct kvm_memory_slot *slot; |
2472 | unsigned long hva; | ||
2473 | 2513 | ||
2474 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | 2514 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); |
2475 | if (!slot) { | 2515 | if (!slot) |
2476 | get_page(fault_page); | 2516 | return KVM_PFN_ERR_FAULT; |
2477 | return page_to_pfn(fault_page); | ||
2478 | } | ||
2479 | 2517 | ||
2480 | hva = gfn_to_hva_memslot(slot, gfn); | 2518 | return gfn_to_pfn_memslot_atomic(slot, gfn); |
2481 | |||
2482 | return hva_to_pfn_atomic(vcpu->kvm, hva); | ||
2483 | } | 2519 | } |
2484 | 2520 | ||
2485 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | 2521 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, |
@@ -2580,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2580 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, | 2616 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, |
2581 | iterator.level - 1, | 2617 | iterator.level - 1, |
2582 | 1, ACC_ALL, iterator.sptep); | 2618 | 1, ACC_ALL, iterator.sptep); |
2583 | if (!sp) { | ||
2584 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
2585 | kvm_release_pfn_clean(pfn); | ||
2586 | return -ENOMEM; | ||
2587 | } | ||
2588 | 2619 | ||
2589 | mmu_spte_set(iterator.sptep, | 2620 | mmu_spte_set(iterator.sptep, |
2590 | __pa(sp->spt) | 2621 | __pa(sp->spt) |
@@ -2611,8 +2642,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * | |||
2611 | 2642 | ||
2612 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) | 2643 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) |
2613 | { | 2644 | { |
2614 | kvm_release_pfn_clean(pfn); | 2645 | /* |
2615 | if (is_hwpoison_pfn(pfn)) { | 2646 | * Do not cache the mmio info caused by writing the readonly gfn |
2647 | * into the spte; otherwise a read access on the readonly gfn can | ||
2648 | * also cause an mmio page fault and be treated as mmio access. | ||
2649 | * Return 1 to tell kvm to emulate it. | ||
2650 | */ | ||
2651 | if (pfn == KVM_PFN_ERR_RO_FAULT) | ||
2652 | return 1; | ||
2653 | |||
2654 | if (pfn == KVM_PFN_ERR_HWPOISON) { | ||
2616 | kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); | 2655 | kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); |
2617 | return 0; | 2656 | return 0; |
2618 | } | 2657 | } |
@@ -3236,8 +3275,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | |||
3236 | if (!async) | 3275 | if (!async) |
3237 | return false; /* *pfn has correct page already */ | 3276 | return false; /* *pfn has correct page already */ |
3238 | 3277 | ||
3239 | put_page(pfn_to_page(*pfn)); | ||
3240 | |||
3241 | if (!prefault && can_do_async_pf(vcpu)) { | 3278 | if (!prefault && can_do_async_pf(vcpu)) { |
3242 | trace_kvm_try_async_get_page(gva, gfn); | 3279 | trace_kvm_try_async_get_page(gva, gfn); |
3243 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | 3280 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { |
@@ -3371,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) | |||
3371 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; | 3408 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
3372 | } | 3409 | } |
3373 | 3410 | ||
3411 | static inline void protect_clean_gpte(unsigned *access, unsigned gpte) | ||
3412 | { | ||
3413 | unsigned mask; | ||
3414 | |||
3415 | BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); | ||
3416 | |||
3417 | mask = (unsigned)~ACC_WRITE_MASK; | ||
3418 | /* Allow write access to dirty gptes */ | ||
3419 | mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; | ||
3420 | *access &= mask; | ||
3421 | } | ||
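The shift lines the guest dirty bit up with the writable bit (the BUILD_BUG_ON pins ACC_WRITE_MASK to the same position), so a clean gpte silently loses write permission with no branch. Worked out with the mmu.h values above (writable = bit 1, dirty = bit 6):

	/*
	 *   mask  = ~ACC_WRITE_MASK
	 *   mask |= (gpte >> (6 - 1)) & PT_WRITABLE_MASK
	 *
	 * dirty gpte (bit 6 set):   (gpte >> 5) has bit 1 set, mask = ~0,
	 *                           so *access keeps ACC_WRITE_MASK
	 * clean gpte (bit 6 clear): mask stays ~ACC_WRITE_MASK,
	 *                           so *access drops ACC_WRITE_MASK
	 */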
3422 | |||
3374 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | 3423 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, |
3375 | int *nr_present) | 3424 | int *nr_present) |
3376 | { | 3425 | { |
@@ -3388,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | |||
3388 | return false; | 3437 | return false; |
3389 | } | 3438 | } |
3390 | 3439 | ||
3440 | static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) | ||
3441 | { | ||
3442 | unsigned access; | ||
3443 | |||
3444 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
3445 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
3446 | |||
3447 | return access; | ||
3448 | } | ||
3449 | |||
3450 | static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) | ||
3451 | { | ||
3452 | unsigned index; | ||
3453 | |||
3454 | index = level - 1; | ||
3455 | index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); | ||
3456 | return mmu->last_pte_bitmap & (1 << index); | ||
3457 | } | ||
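The index packs the level into bits 0-1 and the gpte's PS bit into bit 2, so one AND against a precomputed byte replaces the old chain of level and PSE checks. For example:

	/*
	 *   4K pte,  level 1:            index = 0               -> map bit 0
	 *   4K pte,  level 1, PAT set:   index = 0 | (1 << 2) = 4 -> map bit 4
	 *     (bit 7 of a 4K pte is PAT, not PS, which is why the bitmap
	 *      built below also sets bit 4 unconditionally)
	 *   2M pde,  level 2, PS set:    index = 1 | (1 << 2) = 5 -> map bit 5
	 *   non-leaf pde, level 2:       index = 1               -> map bit 1, never set
	 *   1G pdpte, level 3, PS set:   index = 2 | (1 << 2) = 6 -> map bit 6
	 */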
3458 | |||
3391 | #define PTTYPE 64 | 3459 | #define PTTYPE 64 |
3392 | #include "paging_tmpl.h" | 3460 | #include "paging_tmpl.h" |
3393 | #undef PTTYPE | 3461 | #undef PTTYPE |
@@ -3457,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, | |||
3457 | } | 3525 | } |
3458 | } | 3526 | } |
3459 | 3527 | ||
3528 | static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) | ||
3529 | { | ||
3530 | unsigned bit, byte, pfec; | ||
3531 | u8 map; | ||
3532 | bool fault, x, w, u, wf, uf, ff, smep; | ||
3533 | |||
3534 | smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); | ||
3535 | for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { | ||
3536 | pfec = byte << 1; | ||
3537 | map = 0; | ||
3538 | wf = pfec & PFERR_WRITE_MASK; | ||
3539 | uf = pfec & PFERR_USER_MASK; | ||
3540 | ff = pfec & PFERR_FETCH_MASK; | ||
3541 | for (bit = 0; bit < 8; ++bit) { | ||
3542 | x = bit & ACC_EXEC_MASK; | ||
3543 | w = bit & ACC_WRITE_MASK; | ||
3544 | u = bit & ACC_USER_MASK; | ||
3545 | |||
3546 | /* Not really needed: !nx will cause pte.nx to fault */ | ||
3547 | x |= !mmu->nx; | ||
3548 | /* Allow supervisor writes if !cr0.wp */ | ||
3549 | w |= !is_write_protection(vcpu) && !uf; | ||
3550 | /* Disallow supervisor fetches of user code if cr4.smep */ | ||
3551 | x &= !(smep && u && !uf); | ||
3552 | |||
3553 | fault = (ff && !x) || (uf && !u) || (wf && !w); | ||
3554 | map |= fault << bit; | ||
3555 | } | ||
3556 | mmu->permissions[byte] = map; | ||
3557 | } | ||
3558 | } | ||
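Each of the 16 bytes corresponds to one page-fault error-code combination (pfec >> 1 drops the present bit, leaving write/user/rsvd/fetch), and each bit within a byte to one pte_access value, so the permission check becomes the single table lookup added to mmu.h further down. One worked row, assuming CR0.WP=1, EFER.NX off and CR4.SMEP clear:

	/*
	 * byte 3  <=>  pfec = 6 = PFERR_USER_MASK | PFERR_WRITE_MASK
	 *              (a user-mode write fault)
	 *
	 *   pte_access = ACC_EXEC | ACC_WRITE            (= 3, supervisor page)
	 *       u = 0, so (uf && !u) is true             -> bit 3 of the byte = 1
	 *   pte_access = ACC_EXEC | ACC_WRITE | ACC_USER (= 7)
	 *       u = 1 and w = 1                          -> bit 7 of the byte = 0
	 *
	 * i.e. permission_fault(mmu, 3, 6) == 1 and permission_fault(mmu, 7, 6) == 0.
	 */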
3559 | |||
3560 | static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) | ||
3561 | { | ||
3562 | u8 map; | ||
3563 | unsigned level, root_level = mmu->root_level; | ||
3564 | const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ | ||
3565 | |||
3566 | if (root_level == PT32E_ROOT_LEVEL) | ||
3567 | --root_level; | ||
3568 | /* PT_PAGE_TABLE_LEVEL always terminates */ | ||
3569 | map = 1 | (1 << ps_set_index); | ||
3570 | for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { | ||
3571 | if (level <= PT_PDPE_LEVEL | ||
3572 | && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) | ||
3573 | map |= 1 << (ps_set_index | (level - 1)); | ||
3574 | } | ||
3575 | mmu->last_pte_bitmap = map; | ||
3576 | } | ||
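For a 64-bit guest (root_level = 4) the loop produces map = 0x71, which is exactly the set of terminal indexes worked through after is_last_gpte() above:

	/*
	 *   map  = 1 | (1 << 4)        = 0x11   4K ptes, with or without PAT
	 *   map |= 1 << (4 | (2 - 1))  = 0x20   2M leaf (level 2, PS set)
	 *   map |= 1 << (4 | (3 - 1))  = 0x40   1G leaf (level 3, PS set)
	 *   -> mmu->last_pte_bitmap    = 0x71
	 *
	 * Level-4 entries can never be leaves, so bits 3 and 7 stay clear.
	 */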
3577 | |||
3460 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, | 3578 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, |
3461 | struct kvm_mmu *context, | 3579 | struct kvm_mmu *context, |
3462 | int level) | 3580 | int level) |
@@ -3465,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, | |||
3465 | context->root_level = level; | 3583 | context->root_level = level; |
3466 | 3584 | ||
3467 | reset_rsvds_bits_mask(vcpu, context); | 3585 | reset_rsvds_bits_mask(vcpu, context); |
3586 | update_permission_bitmask(vcpu, context); | ||
3587 | update_last_pte_bitmap(vcpu, context); | ||
3468 | 3588 | ||
3469 | ASSERT(is_pae(vcpu)); | 3589 | ASSERT(is_pae(vcpu)); |
3470 | context->new_cr3 = paging_new_cr3; | 3590 | context->new_cr3 = paging_new_cr3; |
@@ -3493,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, | |||
3493 | context->root_level = PT32_ROOT_LEVEL; | 3613 | context->root_level = PT32_ROOT_LEVEL; |
3494 | 3614 | ||
3495 | reset_rsvds_bits_mask(vcpu, context); | 3615 | reset_rsvds_bits_mask(vcpu, context); |
3616 | update_permission_bitmask(vcpu, context); | ||
3617 | update_last_pte_bitmap(vcpu, context); | ||
3496 | 3618 | ||
3497 | context->new_cr3 = paging_new_cr3; | 3619 | context->new_cr3 = paging_new_cr3; |
3498 | context->page_fault = paging32_page_fault; | 3620 | context->page_fault = paging32_page_fault; |
@@ -3553,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
3553 | context->gva_to_gpa = paging32_gva_to_gpa; | 3675 | context->gva_to_gpa = paging32_gva_to_gpa; |
3554 | } | 3676 | } |
3555 | 3677 | ||
3678 | update_permission_bitmask(vcpu, context); | ||
3679 | update_last_pte_bitmap(vcpu, context); | ||
3680 | |||
3556 | return 0; | 3681 | return 0; |
3557 | } | 3682 | } |
3558 | 3683 | ||
@@ -3628,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | |||
3628 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | 3753 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; |
3629 | } | 3754 | } |
3630 | 3755 | ||
3756 | update_permission_bitmask(vcpu, g_context); | ||
3757 | update_last_pte_bitmap(vcpu, g_context); | ||
3758 | |||
3631 | return 0; | 3759 | return 0; |
3632 | } | 3760 | } |
3633 | 3761 | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e374db9af021..69871080e866 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -18,8 +18,10 @@ | |||
18 | #define PT_PCD_MASK (1ULL << 4) | 18 | #define PT_PCD_MASK (1ULL << 4) |
19 | #define PT_ACCESSED_SHIFT 5 | 19 | #define PT_ACCESSED_SHIFT 5 |
20 | #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) | 20 | #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) |
21 | #define PT_DIRTY_MASK (1ULL << 6) | 21 | #define PT_DIRTY_SHIFT 6 |
22 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | 22 | #define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) |
23 | #define PT_PAGE_SIZE_SHIFT 7 | ||
24 | #define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) | ||
23 | #define PT_PAT_MASK (1ULL << 7) | 25 | #define PT_PAT_MASK (1ULL << 7) |
24 | #define PT_GLOBAL_MASK (1ULL << 8) | 26 | #define PT_GLOBAL_MASK (1ULL << 8) |
25 | #define PT64_NX_SHIFT 63 | 27 | #define PT64_NX_SHIFT 63 |
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) | |||
88 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | 90 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); |
89 | } | 91 | } |
90 | 92 | ||
91 | static inline bool check_write_user_access(struct kvm_vcpu *vcpu, | 93 | /* |
92 | bool write_fault, bool user_fault, | 94 | * Will a fault with a given page-fault error code (pfec) cause a permission |
93 | unsigned long pte) | 95 | * fault with the given access (in ACC_* format)? |
96 | */ | ||
97 | static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, | ||
98 | unsigned pfec) | ||
94 | { | 99 | { |
95 | if (unlikely(write_fault && !is_writable_pte(pte) | 100 | return (mmu->permissions[pfec >> 1] >> pte_access) & 1; |
96 | && (user_fault || is_write_protection(vcpu)))) | ||
97 | return false; | ||
98 | |||
99 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) | ||
100 | return false; | ||
101 | |||
102 | return true; | ||
103 | } | 101 | } |
102 | |||
104 | #endif | 103 | #endif |
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 7d7d0b9e23eb..daff69e21150 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
116 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | 116 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); |
117 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); | 117 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); |
118 | 118 | ||
119 | if (is_error_pfn(pfn)) { | 119 | if (is_error_pfn(pfn)) |
120 | kvm_release_pfn_clean(pfn); | ||
121 | return; | 120 | return; |
122 | } | ||
123 | 121 | ||
124 | hpa = pfn << PAGE_SHIFT; | 122 | hpa = pfn << PAGE_SHIFT; |
125 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) | 123 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) |
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
190 | 188 | ||
191 | static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | 189 | static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) |
192 | { | 190 | { |
193 | struct kvm_memory_slot *slot; | ||
194 | unsigned long *rmapp; | 191 | unsigned long *rmapp; |
195 | u64 *sptep; | 192 | u64 *sptep; |
196 | struct rmap_iterator iter; | 193 | struct rmap_iterator iter; |
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
198 | if (sp->role.direct || sp->unsync || sp->role.invalid) | 195 | if (sp->role.direct || sp->unsync || sp->role.invalid) |
199 | return; | 196 | return; |
200 | 197 | ||
201 | slot = gfn_to_memslot(kvm, sp->gfn); | 198 | rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); |
202 | rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; | ||
203 | 199 | ||
204 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 200 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
205 | sptep = rmap_get_next(&iter)) { | 201 | sptep = rmap_get_next(&iter)) { |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bb7cf01cae76..714e2c01a6fe 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -63,10 +63,12 @@ | |||
63 | */ | 63 | */ |
64 | struct guest_walker { | 64 | struct guest_walker { |
65 | int level; | 65 | int level; |
66 | unsigned max_level; | ||
66 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | 67 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; |
67 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | 68 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; |
68 | pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; | 69 | pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; |
69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | 70 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; |
71 | pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; | ||
70 | unsigned pt_access; | 72 | unsigned pt_access; |
71 | unsigned pte_access; | 73 | unsigned pte_access; |
72 | gfn_t gfn; | 74 | gfn_t gfn; |
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
101 | return (ret != orig_pte); | 103 | return (ret != orig_pte); |
102 | } | 104 | } |
103 | 105 | ||
104 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, | 106 | static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, |
105 | bool last) | 107 | struct kvm_mmu *mmu, |
108 | struct guest_walker *walker, | ||
109 | int write_fault) | ||
106 | { | 110 | { |
107 | unsigned access; | 111 | unsigned level, index; |
108 | 112 | pt_element_t pte, orig_pte; | |
109 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | 113 | pt_element_t __user *ptep_user; |
110 | if (last && !is_dirty_gpte(gpte)) | 114 | gfn_t table_gfn; |
111 | access &= ~ACC_WRITE_MASK; | 115 | int ret; |
112 | 116 | ||
113 | #if PTTYPE == 64 | 117 | for (level = walker->max_level; level >= walker->level; --level) { |
114 | if (vcpu->arch.mmu.nx) | 118 | pte = orig_pte = walker->ptes[level - 1]; |
115 | access &= ~(gpte >> PT64_NX_SHIFT); | 119 | table_gfn = walker->table_gfn[level - 1]; |
116 | #endif | 120 | ptep_user = walker->ptep_user[level - 1]; |
117 | return access; | 121 | index = offset_in_page(ptep_user) / sizeof(pt_element_t); |
118 | } | 122 | if (!(pte & PT_ACCESSED_MASK)) { |
119 | 123 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); | |
120 | static bool FNAME(is_last_gpte)(struct guest_walker *walker, | 124 | pte |= PT_ACCESSED_MASK; |
121 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 125 | } |
122 | pt_element_t gpte) | 126 | if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { |
123 | { | 127 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
124 | if (walker->level == PT_PAGE_TABLE_LEVEL) | 128 | pte |= PT_DIRTY_MASK; |
125 | return true; | 129 | } |
126 | 130 | if (pte == orig_pte) | |
127 | if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && | 131 | continue; |
128 | (PTTYPE == 64 || is_pse(vcpu))) | ||
129 | return true; | ||
130 | 132 | ||
131 | if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && | 133 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); |
132 | (mmu->root_level == PT64_ROOT_LEVEL)) | 134 | if (ret) |
133 | return true; | 135 | return ret; |
134 | 136 | ||
135 | return false; | 137 | mark_page_dirty(vcpu->kvm, table_gfn); |
138 | walker->ptes[level] = pte; | ||
139 | } | ||
140 | return 0; | ||
136 | } | 141 | } |
137 | 142 | ||
138 | /* | 143 | /* |
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
142 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 147 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
143 | gva_t addr, u32 access) | 148 | gva_t addr, u32 access) |
144 | { | 149 | { |
150 | int ret; | ||
145 | pt_element_t pte; | 151 | pt_element_t pte; |
146 | pt_element_t __user *uninitialized_var(ptep_user); | 152 | pt_element_t __user *uninitialized_var(ptep_user); |
147 | gfn_t table_gfn; | 153 | gfn_t table_gfn; |
148 | unsigned index, pt_access, uninitialized_var(pte_access); | 154 | unsigned index, pt_access, pte_access, accessed_dirty, shift; |
149 | gpa_t pte_gpa; | 155 | gpa_t pte_gpa; |
150 | bool eperm, last_gpte; | ||
151 | int offset; | 156 | int offset; |
152 | const int write_fault = access & PFERR_WRITE_MASK; | 157 | const int write_fault = access & PFERR_WRITE_MASK; |
153 | const int user_fault = access & PFERR_USER_MASK; | 158 | const int user_fault = access & PFERR_USER_MASK; |
154 | const int fetch_fault = access & PFERR_FETCH_MASK; | 159 | const int fetch_fault = access & PFERR_FETCH_MASK; |
155 | u16 errcode = 0; | 160 | u16 errcode = 0; |
161 | gpa_t real_gpa; | ||
162 | gfn_t gfn; | ||
156 | 163 | ||
157 | trace_kvm_mmu_pagetable_walk(addr, access); | 164 | trace_kvm_mmu_pagetable_walk(addr, access); |
158 | retry_walk: | 165 | retry_walk: |
159 | eperm = false; | ||
160 | walker->level = mmu->root_level; | 166 | walker->level = mmu->root_level; |
161 | pte = mmu->get_cr3(vcpu); | 167 | pte = mmu->get_cr3(vcpu); |
162 | 168 | ||
@@ -169,15 +175,21 @@ retry_walk: | |||
169 | --walker->level; | 175 | --walker->level; |
170 | } | 176 | } |
171 | #endif | 177 | #endif |
178 | walker->max_level = walker->level; | ||
172 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | 179 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || |
173 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); | 180 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); |
174 | 181 | ||
175 | pt_access = ACC_ALL; | 182 | accessed_dirty = PT_ACCESSED_MASK; |
183 | pt_access = pte_access = ACC_ALL; | ||
184 | ++walker->level; | ||
176 | 185 | ||
177 | for (;;) { | 186 | do { |
178 | gfn_t real_gfn; | 187 | gfn_t real_gfn; |
179 | unsigned long host_addr; | 188 | unsigned long host_addr; |
180 | 189 | ||
190 | pt_access &= pte_access; | ||
191 | --walker->level; | ||
192 | |||
181 | index = PT_INDEX(addr, walker->level); | 193 | index = PT_INDEX(addr, walker->level); |
182 | 194 | ||
183 | table_gfn = gpte_to_gfn(pte); | 195 | table_gfn = gpte_to_gfn(pte); |
@@ -199,6 +211,7 @@ retry_walk: | |||
199 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); | 211 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); |
200 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) | 212 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) |
201 | goto error; | 213 | goto error; |
214 | walker->ptep_user[walker->level - 1] = ptep_user; | ||
202 | 215 | ||
203 | trace_kvm_mmu_paging_element(pte, walker->level); | 216 | trace_kvm_mmu_paging_element(pte, walker->level); |
204 | 217 | ||
@@ -211,92 +224,48 @@ retry_walk: | |||
211 | goto error; | 224 | goto error; |
212 | } | 225 | } |
213 | 226 | ||
214 | if (!check_write_user_access(vcpu, write_fault, user_fault, | 227 | accessed_dirty &= pte; |
215 | pte)) | 228 | pte_access = pt_access & gpte_access(vcpu, pte); |
216 | eperm = true; | ||
217 | |||
218 | #if PTTYPE == 64 | ||
219 | if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) | ||
220 | eperm = true; | ||
221 | #endif | ||
222 | |||
223 | last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); | ||
224 | if (last_gpte) { | ||
225 | pte_access = pt_access & | ||
226 | FNAME(gpte_access)(vcpu, pte, true); | ||
227 | /* check if the kernel is fetching from user page */ | ||
228 | if (unlikely(pte_access & PT_USER_MASK) && | ||
229 | kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
230 | if (fetch_fault && !user_fault) | ||
231 | eperm = true; | ||
232 | } | ||
233 | |||
234 | if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { | ||
235 | int ret; | ||
236 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | ||
237 | sizeof(pte)); | ||
238 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | ||
239 | pte, pte|PT_ACCESSED_MASK); | ||
240 | if (unlikely(ret < 0)) | ||
241 | goto error; | ||
242 | else if (ret) | ||
243 | goto retry_walk; | ||
244 | |||
245 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
246 | pte |= PT_ACCESSED_MASK; | ||
247 | } | ||
248 | 229 | ||
249 | walker->ptes[walker->level - 1] = pte; | 230 | walker->ptes[walker->level - 1] = pte; |
231 | } while (!is_last_gpte(mmu, walker->level, pte)); | ||
250 | 232 | ||
251 | if (last_gpte) { | 233 | if (unlikely(permission_fault(mmu, pte_access, access))) { |
252 | int lvl = walker->level; | 234 | errcode |= PFERR_PRESENT_MASK; |
253 | gpa_t real_gpa; | 235 | goto error; |
254 | gfn_t gfn; | 236 | } |
255 | u32 ac; | ||
256 | |||
257 | gfn = gpte_to_gfn_lvl(pte, lvl); | ||
258 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; | ||
259 | |||
260 | if (PTTYPE == 32 && | ||
261 | walker->level == PT_DIRECTORY_LEVEL && | ||
262 | is_cpuid_PSE36()) | ||
263 | gfn += pse36_gfn_delta(pte); | ||
264 | |||
265 | ac = write_fault | fetch_fault | user_fault; | ||
266 | 237 | ||
267 | real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), | 238 | gfn = gpte_to_gfn_lvl(pte, walker->level); |
268 | ac); | 239 | gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; |
269 | if (real_gpa == UNMAPPED_GVA) | ||
270 | return 0; | ||
271 | 240 | ||
272 | walker->gfn = real_gpa >> PAGE_SHIFT; | 241 | if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) |
242 | gfn += pse36_gfn_delta(pte); | ||
273 | 243 | ||
274 | break; | 244 | real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); |
275 | } | 245 | if (real_gpa == UNMAPPED_GVA) |
246 | return 0; | ||
276 | 247 | ||
277 | pt_access &= FNAME(gpte_access)(vcpu, pte, false); | 248 | walker->gfn = real_gpa >> PAGE_SHIFT; |
278 | --walker->level; | ||
279 | } | ||
280 | 249 | ||
281 | if (unlikely(eperm)) { | 250 | if (!write_fault) |
282 | errcode |= PFERR_PRESENT_MASK; | 251 | protect_clean_gpte(&pte_access, pte); |
283 | goto error; | ||
284 | } | ||
285 | 252 | ||
286 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { | 253 | /* |
287 | int ret; | 254 | * On a write fault, fold the dirty bit into accessed_dirty by shifting it one |
255 | * place right. | ||
256 | * | ||
257 | * On a read fault, do nothing. | ||
258 | */ | ||
259 | shift = write_fault >> ilog2(PFERR_WRITE_MASK); | ||
260 | shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; | ||
261 | accessed_dirty &= pte >> shift; | ||
288 | 262 | ||
289 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 263 | if (unlikely(!accessed_dirty)) { |
290 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | 264 | ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); |
291 | pte, pte|PT_DIRTY_MASK); | ||
292 | if (unlikely(ret < 0)) | 265 | if (unlikely(ret < 0)) |
293 | goto error; | 266 | goto error; |
294 | else if (ret) | 267 | else if (ret) |
295 | goto retry_walk; | 268 | goto retry_walk; |
296 | |||
297 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
298 | pte |= PT_DIRTY_MASK; | ||
299 | walker->ptes[walker->level - 1] = pte; | ||
300 | } | 269 | } |
301 | 270 | ||
302 | walker->pt_access = pt_access; | 271 | walker->pt_access = pt_access; |
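Numerically, the accessed/dirty fold a few lines up works out like this (PFERR_WRITE_MASK is bit 1, accessed is gpte bit 5, dirty is bit 6):

	/*
	 * read fault:   write_fault = 0, shift = 0
	 *               accessed_dirty &= pte      -- only the A bit matters
	 *
	 * write fault:  write_fault = PFERR_WRITE_MASK = 2
	 *               shift = (2 >> 1) * (6 - 5) = 1
	 *               accessed_dirty &= pte >> 1 -- the leaf's D bit now sits
	 *               in the A position, so the result is nonzero only when
	 *               every gpte on the walk had A set and the leaf is dirty
	 *
	 * A zero result falls through to update_accessed_dirty_bits(), which
	 * sets the missing bits via cmpxchg and restarts the walk if a gpte
	 * changed underneath us.
	 */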
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
368 | return; | 337 | return; |
369 | 338 | ||
370 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 339 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
371 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); | 340 | pte_access = sp->role.access & gpte_access(vcpu, gpte); |
341 | protect_clean_gpte(&pte_access, gpte); | ||
372 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); | 342 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); |
373 | if (mmu_invalid_pfn(pfn)) { | 343 | if (mmu_invalid_pfn(pfn)) |
374 | kvm_release_pfn_clean(pfn); | ||
375 | return; | 344 | return; |
376 | } | ||
377 | 345 | ||
378 | /* | 346 | /* |
379 | * we call mmu_set_spte() with host_writable = true because that | 347 | * we call mmu_set_spte() with host_writable = true because that |
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
443 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | 411 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
444 | continue; | 412 | continue; |
445 | 413 | ||
446 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, | 414 | pte_access = sp->role.access & gpte_access(vcpu, gpte); |
447 | true); | 415 | protect_clean_gpte(&pte_access, gpte); |
448 | gfn = gpte_to_gfn(gpte); | 416 | gfn = gpte_to_gfn(gpte); |
449 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | 417 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, |
450 | pte_access & ACC_WRITE_MASK); | 418 | pte_access & ACC_WRITE_MASK); |
451 | if (mmu_invalid_pfn(pfn)) { | 419 | if (mmu_invalid_pfn(pfn)) |
452 | kvm_release_pfn_clean(pfn); | ||
453 | break; | 420 | break; |
454 | } | ||
455 | 421 | ||
456 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 422 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
457 | NULL, PT_PAGE_TABLE_LEVEL, gfn, | 423 | NULL, PT_PAGE_TABLE_LEVEL, gfn, |
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
798 | 764 | ||
799 | gfn = gpte_to_gfn(gpte); | 765 | gfn = gpte_to_gfn(gpte); |
800 | pte_access = sp->role.access; | 766 | pte_access = sp->role.access; |
801 | pte_access &= FNAME(gpte_access)(vcpu, gpte, true); | 767 | pte_access &= gpte_access(vcpu, gpte); |
768 | protect_clean_gpte(&pte_access, gpte); | ||
802 | 769 | ||
803 | if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) | 770 | if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) |
804 | continue; | 771 | continue; |
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 9b7ec1150ab0..cfc258a6bf97 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Kernel-based Virtual Machine -- Performane Monitoring Unit support | 2 | * Kernel-based Virtual Machine -- Performance Monitoring Unit support |
3 | * | 3 | * |
4 | * Copyright 2011 Red Hat, Inc. and/or its affiliates. | 4 | * Copyright 2011 Red Hat, Inc. and/or its affiliates. |
5 | * | 5 | * |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index baead950d6c8..d017df3899ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); | |||
163 | 163 | ||
164 | #define MSR_INVALID 0xffffffffU | 164 | #define MSR_INVALID 0xffffffffU |
165 | 165 | ||
166 | static struct svm_direct_access_msrs { | 166 | static const struct svm_direct_access_msrs { |
167 | u32 index; /* Index of the MSR */ | 167 | u32 index; /* Index of the MSR */ |
168 | bool always; /* True if intercept is always on */ | 168 | bool always; /* True if intercept is always on */ |
169 | } direct_access_msrs[] = { | 169 | } direct_access_msrs[] = { |
@@ -400,7 +400,7 @@ struct svm_init_data { | |||
400 | int r; | 400 | int r; |
401 | }; | 401 | }; |
402 | 402 | ||
403 | static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; | 403 | static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; |
404 | 404 | ||
405 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) | 405 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) |
406 | #define MSRS_RANGE_SIZE 2048 | 406 | #define MSRS_RANGE_SIZE 2048 |
@@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
1146 | 1146 | ||
1147 | svm_set_efer(&svm->vcpu, 0); | 1147 | svm_set_efer(&svm->vcpu, 0); |
1148 | save->dr6 = 0xffff0ff0; | 1148 | save->dr6 = 0xffff0ff0; |
1149 | save->dr7 = 0x400; | ||
1150 | kvm_set_rflags(&svm->vcpu, 2); | 1149 | kvm_set_rflags(&svm->vcpu, 2); |
1151 | save->rip = 0x0000fff0; | 1150 | save->rip = 0x0000fff0; |
1152 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; | 1151 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; |
@@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
1643 | mark_dirty(svm->vmcb, VMCB_SEG); | 1642 | mark_dirty(svm->vmcb, VMCB_SEG); |
1644 | } | 1643 | } |
1645 | 1644 | ||
1646 | static void update_db_intercept(struct kvm_vcpu *vcpu) | 1645 | static void update_db_bp_intercept(struct kvm_vcpu *vcpu) |
1647 | { | 1646 | { |
1648 | struct vcpu_svm *svm = to_svm(vcpu); | 1647 | struct vcpu_svm *svm = to_svm(vcpu); |
1649 | 1648 | ||
@@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) | |||
1663 | vcpu->guest_debug = 0; | 1662 | vcpu->guest_debug = 0; |
1664 | } | 1663 | } |
1665 | 1664 | ||
1666 | static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | ||
1667 | { | ||
1668 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1669 | |||
1670 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | ||
1671 | svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; | ||
1672 | else | ||
1673 | svm->vmcb->save.dr7 = vcpu->arch.dr7; | ||
1674 | |||
1675 | mark_dirty(svm->vmcb, VMCB_DR); | ||
1676 | |||
1677 | update_db_intercept(vcpu); | ||
1678 | } | ||
1679 | |||
1680 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | 1665 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) |
1681 | { | 1666 | { |
1682 | if (sd->next_asid > sd->max_asid) { | 1667 | if (sd->next_asid > sd->max_asid) { |
@@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm) | |||
1748 | if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) | 1733 | if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) |
1749 | svm->vmcb->save.rflags &= | 1734 | svm->vmcb->save.rflags &= |
1750 | ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | 1735 | ~(X86_EFLAGS_TF | X86_EFLAGS_RF); |
1751 | update_db_intercept(&svm->vcpu); | 1736 | update_db_bp_intercept(&svm->vcpu); |
1752 | } | 1737 | } |
1753 | 1738 | ||
1754 | if (svm->vcpu.guest_debug & | 1739 | if (svm->vcpu.guest_debug & |
@@ -2063,7 +2048,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
2063 | if (svm->nested.intercept & 1ULL) { | 2048 | if (svm->nested.intercept & 1ULL) { |
2064 | /* | 2049 | /* |
2065 | * The #vmexit can't be emulated here directly because this | 2050 | * The #vmexit can't be emulated here directly because this |
2066 | * code path runs with irqs and preemtion disabled. A | 2051 | * code path runs with irqs and preemption disabled. A |
2067 | * #vmexit emulation might sleep. Only signal request for | 2052 | * #vmexit emulation might sleep. Only signal request for |
2068 | * the #vmexit here. | 2053 | * the #vmexit here. |
2069 | */ | 2054 | */ |
@@ -2105,7 +2090,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) | |||
2105 | return kmap(page); | 2090 | return kmap(page); |
2106 | 2091 | ||
2107 | error: | 2092 | error: |
2108 | kvm_release_page_clean(page); | ||
2109 | kvm_inject_gp(&svm->vcpu, 0); | 2093 | kvm_inject_gp(&svm->vcpu, 0); |
2110 | 2094 | ||
2111 | return NULL; | 2095 | return NULL; |
@@ -2409,7 +2393,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) | |||
2409 | { | 2393 | { |
2410 | /* | 2394 | /* |
2411 | * This function merges the msr permission bitmaps of kvm and the | 2395 | * This function merges the msr permission bitmaps of kvm and the |
2412 | * nested vmcb. It is omptimized in that it only merges the parts where | 2396 | * nested vmcb. It is optimized in that it only merges the parts where |
2413 | * the kvm msr permission bitmap may contain zero bits | 2397 | * the kvm msr permission bitmap may contain zero bits |
2414 | */ | 2398 | */ |
2415 | int i; | 2399 | int i; |
@@ -3268,7 +3252,7 @@ static int pause_interception(struct vcpu_svm *svm) | |||
3268 | return 1; | 3252 | return 1; |
3269 | } | 3253 | } |
3270 | 3254 | ||
3271 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | 3255 | static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { |
3272 | [SVM_EXIT_READ_CR0] = cr_interception, | 3256 | [SVM_EXIT_READ_CR0] = cr_interception, |
3273 | [SVM_EXIT_READ_CR3] = cr_interception, | 3257 | [SVM_EXIT_READ_CR3] = cr_interception, |
3274 | [SVM_EXIT_READ_CR4] = cr_interception, | 3258 | [SVM_EXIT_READ_CR4] = cr_interception, |
@@ -3660,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) | |||
3660 | */ | 3644 | */ |
3661 | svm->nmi_singlestep = true; | 3645 | svm->nmi_singlestep = true; |
3662 | svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); | 3646 | svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); |
3663 | update_db_intercept(vcpu); | 3647 | update_db_bp_intercept(vcpu); |
3664 | } | 3648 | } |
3665 | 3649 | ||
3666 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | 3650 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) |
@@ -3783,12 +3767,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) | |||
3783 | svm_complete_interrupts(svm); | 3767 | svm_complete_interrupts(svm); |
3784 | } | 3768 | } |
3785 | 3769 | ||
3786 | #ifdef CONFIG_X86_64 | ||
3787 | #define R "r" | ||
3788 | #else | ||
3789 | #define R "e" | ||
3790 | #endif | ||
3791 | |||
3792 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) | 3770 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) |
3793 | { | 3771 | { |
3794 | struct vcpu_svm *svm = to_svm(vcpu); | 3772 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -3815,13 +3793,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3815 | local_irq_enable(); | 3793 | local_irq_enable(); |
3816 | 3794 | ||
3817 | asm volatile ( | 3795 | asm volatile ( |
3818 | "push %%"R"bp; \n\t" | 3796 | "push %%" _ASM_BP "; \n\t" |
3819 | "mov %c[rbx](%[svm]), %%"R"bx \n\t" | 3797 | "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" |
3820 | "mov %c[rcx](%[svm]), %%"R"cx \n\t" | 3798 | "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" |
3821 | "mov %c[rdx](%[svm]), %%"R"dx \n\t" | 3799 | "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" |
3822 | "mov %c[rsi](%[svm]), %%"R"si \n\t" | 3800 | "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" |
3823 | "mov %c[rdi](%[svm]), %%"R"di \n\t" | 3801 | "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" |
3824 | "mov %c[rbp](%[svm]), %%"R"bp \n\t" | 3802 | "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" |
3825 | #ifdef CONFIG_X86_64 | 3803 | #ifdef CONFIG_X86_64 |
3826 | "mov %c[r8](%[svm]), %%r8 \n\t" | 3804 | "mov %c[r8](%[svm]), %%r8 \n\t" |
3827 | "mov %c[r9](%[svm]), %%r9 \n\t" | 3805 | "mov %c[r9](%[svm]), %%r9 \n\t" |
@@ -3834,20 +3812,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3834 | #endif | 3812 | #endif |
3835 | 3813 | ||
3836 | /* Enter guest mode */ | 3814 | /* Enter guest mode */ |
3837 | "push %%"R"ax \n\t" | 3815 | "push %%" _ASM_AX " \n\t" |
3838 | "mov %c[vmcb](%[svm]), %%"R"ax \n\t" | 3816 | "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" |
3839 | __ex(SVM_VMLOAD) "\n\t" | 3817 | __ex(SVM_VMLOAD) "\n\t" |
3840 | __ex(SVM_VMRUN) "\n\t" | 3818 | __ex(SVM_VMRUN) "\n\t" |
3841 | __ex(SVM_VMSAVE) "\n\t" | 3819 | __ex(SVM_VMSAVE) "\n\t" |
3842 | "pop %%"R"ax \n\t" | 3820 | "pop %%" _ASM_AX " \n\t" |
3843 | 3821 | ||
3844 | /* Save guest registers, load host registers */ | 3822 | /* Save guest registers, load host registers */ |
3845 | "mov %%"R"bx, %c[rbx](%[svm]) \n\t" | 3823 | "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" |
3846 | "mov %%"R"cx, %c[rcx](%[svm]) \n\t" | 3824 | "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" |
3847 | "mov %%"R"dx, %c[rdx](%[svm]) \n\t" | 3825 | "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" |
3848 | "mov %%"R"si, %c[rsi](%[svm]) \n\t" | 3826 | "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" |
3849 | "mov %%"R"di, %c[rdi](%[svm]) \n\t" | 3827 | "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" |
3850 | "mov %%"R"bp, %c[rbp](%[svm]) \n\t" | 3828 | "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" |
3851 | #ifdef CONFIG_X86_64 | 3829 | #ifdef CONFIG_X86_64 |
3852 | "mov %%r8, %c[r8](%[svm]) \n\t" | 3830 | "mov %%r8, %c[r8](%[svm]) \n\t" |
3853 | "mov %%r9, %c[r9](%[svm]) \n\t" | 3831 | "mov %%r9, %c[r9](%[svm]) \n\t" |
@@ -3858,7 +3836,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3858 | "mov %%r14, %c[r14](%[svm]) \n\t" | 3836 | "mov %%r14, %c[r14](%[svm]) \n\t" |
3859 | "mov %%r15, %c[r15](%[svm]) \n\t" | 3837 | "mov %%r15, %c[r15](%[svm]) \n\t" |
3860 | #endif | 3838 | #endif |
3861 | "pop %%"R"bp" | 3839 | "pop %%" _ASM_BP |
3862 | : | 3840 | : |
3863 | : [svm]"a"(svm), | 3841 | : [svm]"a"(svm), |
3864 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 3842 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
@@ -3879,9 +3857,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3879 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) | 3857 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
3880 | #endif | 3858 | #endif |
3881 | : "cc", "memory" | 3859 | : "cc", "memory" |
3882 | , R"bx", R"cx", R"dx", R"si", R"di" | ||
3883 | #ifdef CONFIG_X86_64 | 3860 | #ifdef CONFIG_X86_64 |
3861 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
3884 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | 3862 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" |
3863 | #else | ||
3864 | , "ebx", "ecx", "edx", "esi", "edi" | ||
3885 | #endif | 3865 | #endif |
3886 | ); | 3866 | ); |
3887 | 3867 | ||
@@ -3941,8 +3921,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3941 | mark_all_clean(svm->vmcb); | 3921 | mark_all_clean(svm->vmcb); |
3942 | } | 3922 | } |
3943 | 3923 | ||
3944 | #undef R | ||
3945 | |||
3946 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 3924 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
3947 | { | 3925 | { |
3948 | struct vcpu_svm *svm = to_svm(vcpu); | 3926 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -4069,7 +4047,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
4069 | #define POST_MEM(exit) { .exit_code = (exit), \ | 4047 | #define POST_MEM(exit) { .exit_code = (exit), \ |
4070 | .stage = X86_ICPT_POST_MEMACCESS, } | 4048 | .stage = X86_ICPT_POST_MEMACCESS, } |
4071 | 4049 | ||
4072 | static struct __x86_intercept { | 4050 | static const struct __x86_intercept { |
4073 | u32 exit_code; | 4051 | u32 exit_code; |
4074 | enum x86_intercept_stage stage; | 4052 | enum x86_intercept_stage stage; |
4075 | } x86_intercept_map[] = { | 4053 | } x86_intercept_map[] = { |
@@ -4260,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
4260 | .vcpu_load = svm_vcpu_load, | 4238 | .vcpu_load = svm_vcpu_load, |
4261 | .vcpu_put = svm_vcpu_put, | 4239 | .vcpu_put = svm_vcpu_put, |
4262 | 4240 | ||
4263 | .set_guest_debug = svm_guest_debug, | 4241 | .update_db_bp_intercept = update_db_bp_intercept, |
4264 | .get_msr = svm_get_msr, | 4242 | .get_msr = svm_get_msr, |
4265 | .set_msr = svm_set_msr, | 4243 | .set_msr = svm_set_msr, |
4266 | .get_segment_base = svm_get_segment_base, | 4244 | .get_segment_base = svm_get_segment_base, |
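The svm_vcpu_run() hunks above (and the matching vmx_vcpu_run() hunks further down) drop the file-local R/Q register-prefix macros in favour of the shared _ASM_AX/_ASM_BP/__ASM_SIZE/_ASM_PTR string helpers from <asm/asm.h>, so the 32-bit vs 64-bit register spelling is selected in one place instead of per file. A rough userspace sketch of the technique follows; the MY_ASM_* definitions only mirror the idea, they are not the kernel header verbatim.

#include <stdio.h>

/* Simplified stand-ins for the <asm/asm.h> helpers (sketch, not the header). */
#define MY_ASM_FORM(x)    " " #x " "
#ifdef __x86_64__
# define MY_ASM_SEL(a, b) MY_ASM_FORM(b)
#else
# define MY_ASM_SEL(a, b) MY_ASM_FORM(a)
#endif
#define MY_ASM_REG(reg)   MY_ASM_SEL(e##reg, r##reg)
#define MY_ASM_BP         MY_ASM_REG(bp)
#define MY_ASM_PTR        MY_ASM_SEL(.long, .quad)

int main(void)
{
	/* Same string pasting the patch uses inside asm() templates:
	 * "push %%" MY_ASM_BP becomes "push %% rbp " on 64-bit and
	 * "push %% ebp " on 32-bit; gas tolerates the extra spaces. */
	printf("template fragment: \"%s\"\n", "push %%" MY_ASM_BP);
	printf("pointer directive: \"%s\"\n", MY_ASM_PTR);
	return 0;
}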
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c deleted file mode 100644 index 6b85cc647f34..000000000000 --- a/arch/x86/kvm/timer.c +++ /dev/null | |||
@@ -1,47 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * timer support | ||
8 | * | ||
9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
12 | * the COPYING file in the top-level directory. | ||
13 | */ | ||
14 | |||
15 | #include <linux/kvm_host.h> | ||
16 | #include <linux/kvm.h> | ||
17 | #include <linux/hrtimer.h> | ||
18 | #include <linux/atomic.h> | ||
19 | #include "kvm_timer.h" | ||
20 | |||
21 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) | ||
22 | { | ||
23 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | ||
24 | struct kvm_vcpu *vcpu = ktimer->vcpu; | ||
25 | wait_queue_head_t *q = &vcpu->wq; | ||
26 | |||
27 | /* | ||
28 | * There is a race window between reading and incrementing, but we do | ||
29 | * not care about potentially losing timer events in the !reinject | ||
30 | * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked | ||
31 | * in vcpu_enter_guest. | ||
32 | */ | ||
33 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | ||
34 | atomic_inc(&ktimer->pending); | ||
35 | /* FIXME: this code should not know anything about vcpus */ | ||
36 | kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); | ||
37 | } | ||
38 | |||
39 | if (waitqueue_active(q)) | ||
40 | wake_up_interruptible(q); | ||
41 | |||
42 | if (ktimer->t_ops->is_periodic(ktimer)) { | ||
43 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | ||
44 | return HRTIMER_RESTART; | ||
45 | } else | ||
46 | return HRTIMER_NORESTART; | ||
47 | } | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 851aa7c3b890..ad6b1dd06f8b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO); | |||
127 | static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | 127 | static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; |
128 | module_param(ple_window, int, S_IRUGO); | 128 | module_param(ple_window, int, S_IRUGO); |
129 | 129 | ||
130 | extern const ulong vmx_return; | ||
131 | |||
130 | #define NR_AUTOLOAD_MSRS 8 | 132 | #define NR_AUTOLOAD_MSRS 8 |
131 | #define VMCS02_POOL_SIZE 1 | 133 | #define VMCS02_POOL_SIZE 1 |
132 | 134 | ||
@@ -405,16 +407,16 @@ struct vcpu_vmx { | |||
405 | struct { | 407 | struct { |
406 | int vm86_active; | 408 | int vm86_active; |
407 | ulong save_rflags; | 409 | ulong save_rflags; |
410 | struct kvm_segment segs[8]; | ||
411 | } rmode; | ||
412 | struct { | ||
413 | u32 bitmask; /* 4 bits per segment (1 bit per field) */ | ||
408 | struct kvm_save_segment { | 414 | struct kvm_save_segment { |
409 | u16 selector; | 415 | u16 selector; |
410 | unsigned long base; | 416 | unsigned long base; |
411 | u32 limit; | 417 | u32 limit; |
412 | u32 ar; | 418 | u32 ar; |
413 | } tr, es, ds, fs, gs; | 419 | } seg[8]; |
414 | } rmode; | ||
415 | struct { | ||
416 | u32 bitmask; /* 4 bits per segment (1 bit per field) */ | ||
417 | struct kvm_save_segment seg[8]; | ||
418 | } segment_cache; | 420 | } segment_cache; |
419 | int vpid; | 421 | int vpid; |
420 | bool emulation_required; | 422 | bool emulation_required; |
@@ -450,7 +452,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | |||
450 | #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ | 452 | #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ |
451 | [number##_HIGH] = VMCS12_OFFSET(name)+4 | 453 | [number##_HIGH] = VMCS12_OFFSET(name)+4 |
452 | 454 | ||
453 | static unsigned short vmcs_field_to_offset_table[] = { | 455 | static const unsigned short vmcs_field_to_offset_table[] = { |
454 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | 456 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), |
455 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | 457 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), |
456 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | 458 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), |
@@ -596,10 +598,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | |||
596 | static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) | 598 | static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) |
597 | { | 599 | { |
598 | struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); | 600 | struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); |
599 | if (is_error_page(page)) { | 601 | if (is_error_page(page)) |
600 | kvm_release_page_clean(page); | ||
601 | return NULL; | 602 | return NULL; |
602 | } | 603 | |
603 | return page; | 604 | return page; |
604 | } | 605 | } |
605 | 606 | ||
@@ -667,7 +668,7 @@ static struct vmx_capability { | |||
667 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | 668 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ |
668 | } | 669 | } |
669 | 670 | ||
670 | static struct kvm_vmx_segment_field { | 671 | static const struct kvm_vmx_segment_field { |
671 | unsigned selector; | 672 | unsigned selector; |
672 | unsigned base; | 673 | unsigned base; |
673 | unsigned limit; | 674 | unsigned limit; |
@@ -1343,7 +1344,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | |||
1343 | guest_efer = vmx->vcpu.arch.efer; | 1344 | guest_efer = vmx->vcpu.arch.efer; |
1344 | 1345 | ||
1345 | /* | 1346 | /* |
1346 | * NX is emulated; LMA and LME handled by hardware; SCE meaninless | 1347 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless |
1347 | * outside long mode | 1348 | * outside long mode |
1348 | */ | 1349 | */ |
1349 | ignore_bits = EFER_NX | EFER_SCE; | 1350 | ignore_bits = EFER_NX | EFER_SCE; |
@@ -1995,7 +1996,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
1995 | #endif | 1996 | #endif |
1996 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | 1997 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | |
1997 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | | 1998 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | |
1998 | CPU_BASED_RDPMC_EXITING | | 1999 | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | |
1999 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 2000 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
2000 | /* | 2001 | /* |
2001 | * We can allow some features even when not supported by the | 2002 | * We can allow some features even when not supported by the |
@@ -2291,16 +2292,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
2291 | } | 2292 | } |
2292 | } | 2293 | } |
2293 | 2294 | ||
2294 | static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | ||
2295 | { | ||
2296 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | ||
2297 | vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); | ||
2298 | else | ||
2299 | vmcs_writel(GUEST_DR7, vcpu->arch.dr7); | ||
2300 | |||
2301 | update_exception_bitmap(vcpu); | ||
2302 | } | ||
2303 | |||
2304 | static __init int cpu_has_kvm_support(void) | 2295 | static __init int cpu_has_kvm_support(void) |
2305 | { | 2296 | { |
2306 | return cpu_has_vmx(); | 2297 | return cpu_has_vmx(); |
@@ -2698,20 +2689,17 @@ static __exit void hardware_unsetup(void) | |||
2698 | free_kvm_area(); | 2689 | free_kvm_area(); |
2699 | } | 2690 | } |
2700 | 2691 | ||
2701 | static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | 2692 | static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) |
2702 | { | 2693 | { |
2703 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2694 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
2695 | struct kvm_segment tmp = *save; | ||
2704 | 2696 | ||
2705 | if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { | 2697 | if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { |
2706 | vmcs_write16(sf->selector, save->selector); | 2698 | tmp.base = vmcs_readl(sf->base); |
2707 | vmcs_writel(sf->base, save->base); | 2699 | tmp.selector = vmcs_read16(sf->selector); |
2708 | vmcs_write32(sf->limit, save->limit); | 2700 | tmp.s = 1; |
2709 | vmcs_write32(sf->ar_bytes, save->ar); | ||
2710 | } else { | ||
2711 | u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) | ||
2712 | << AR_DPL_SHIFT; | ||
2713 | vmcs_write32(sf->ar_bytes, 0x93 | dpl); | ||
2714 | } | 2701 | } |
2702 | vmx_set_segment(vcpu, &tmp, seg); | ||
2715 | } | 2703 | } |
2716 | 2704 | ||
2717 | static void enter_pmode(struct kvm_vcpu *vcpu) | 2705 | static void enter_pmode(struct kvm_vcpu *vcpu) |
@@ -2724,10 +2712,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
2724 | 2712 | ||
2725 | vmx_segment_cache_clear(vmx); | 2713 | vmx_segment_cache_clear(vmx); |
2726 | 2714 | ||
2727 | vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); | 2715 | vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); |
2728 | vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); | ||
2729 | vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); | ||
2730 | vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); | ||
2731 | 2716 | ||
2732 | flags = vmcs_readl(GUEST_RFLAGS); | 2717 | flags = vmcs_readl(GUEST_RFLAGS); |
2733 | flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | 2718 | flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; |
@@ -2742,10 +2727,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
2742 | if (emulate_invalid_guest_state) | 2727 | if (emulate_invalid_guest_state) |
2743 | return; | 2728 | return; |
2744 | 2729 | ||
2745 | fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); | 2730 | fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); |
2746 | fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); | 2731 | fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); |
2747 | fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); | 2732 | fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); |
2748 | fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); | 2733 | fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); |
2749 | 2734 | ||
2750 | vmx_segment_cache_clear(vmx); | 2735 | vmx_segment_cache_clear(vmx); |
2751 | 2736 | ||
@@ -2773,14 +2758,10 @@ static gva_t rmode_tss_base(struct kvm *kvm) | |||
2773 | return kvm->arch.tss_addr; | 2758 | return kvm->arch.tss_addr; |
2774 | } | 2759 | } |
2775 | 2760 | ||
2776 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | 2761 | static void fix_rmode_seg(int seg, struct kvm_segment *save) |
2777 | { | 2762 | { |
2778 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2763 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
2779 | 2764 | ||
2780 | save->selector = vmcs_read16(sf->selector); | ||
2781 | save->base = vmcs_readl(sf->base); | ||
2782 | save->limit = vmcs_read32(sf->limit); | ||
2783 | save->ar = vmcs_read32(sf->ar_bytes); | ||
2784 | vmcs_write16(sf->selector, save->base >> 4); | 2765 | vmcs_write16(sf->selector, save->base >> 4); |
2785 | vmcs_write32(sf->base, save->base & 0xffff0); | 2766 | vmcs_write32(sf->base, save->base & 0xffff0); |
2786 | vmcs_write32(sf->limit, 0xffff); | 2767 | vmcs_write32(sf->limit, 0xffff); |
@@ -2800,9 +2781,16 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
2800 | if (enable_unrestricted_guest) | 2781 | if (enable_unrestricted_guest) |
2801 | return; | 2782 | return; |
2802 | 2783 | ||
2784 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | ||
2785 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
2786 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
2787 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
2788 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
2789 | |||
2803 | vmx->emulation_required = 1; | 2790 | vmx->emulation_required = 1; |
2804 | vmx->rmode.vm86_active = 1; | 2791 | vmx->rmode.vm86_active = 1; |
2805 | 2792 | ||
2793 | |||
2806 | /* | 2794 | /* |
2807 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering | 2795 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering |
2808 | * vcpu. Call it here with phys address pointing 16M below 4G. | 2796 | * vcpu. Call it here with phys address pointing 16M below 4G. |
@@ -2817,14 +2805,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
2817 | 2805 | ||
2818 | vmx_segment_cache_clear(vmx); | 2806 | vmx_segment_cache_clear(vmx); |
2819 | 2807 | ||
2820 | vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
2821 | vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | ||
2822 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 2808 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
2823 | |||
2824 | vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | ||
2825 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | 2809 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
2826 | |||
2827 | vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
2828 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | 2810 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
2829 | 2811 | ||
2830 | flags = vmcs_readl(GUEST_RFLAGS); | 2812 | flags = vmcs_readl(GUEST_RFLAGS); |
@@ -3117,35 +3099,24 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
3117 | struct kvm_segment *var, int seg) | 3099 | struct kvm_segment *var, int seg) |
3118 | { | 3100 | { |
3119 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3101 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3120 | struct kvm_save_segment *save; | ||
3121 | u32 ar; | 3102 | u32 ar; |
3122 | 3103 | ||
3123 | if (vmx->rmode.vm86_active | 3104 | if (vmx->rmode.vm86_active |
3124 | && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES | 3105 | && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES |
3125 | || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS | 3106 | || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS |
3126 | || seg == VCPU_SREG_GS) | 3107 | || seg == VCPU_SREG_GS)) { |
3127 | && !emulate_invalid_guest_state) { | 3108 | *var = vmx->rmode.segs[seg]; |
3128 | switch (seg) { | ||
3129 | case VCPU_SREG_TR: save = &vmx->rmode.tr; break; | ||
3130 | case VCPU_SREG_ES: save = &vmx->rmode.es; break; | ||
3131 | case VCPU_SREG_DS: save = &vmx->rmode.ds; break; | ||
3132 | case VCPU_SREG_FS: save = &vmx->rmode.fs; break; | ||
3133 | case VCPU_SREG_GS: save = &vmx->rmode.gs; break; | ||
3134 | default: BUG(); | ||
3135 | } | ||
3136 | var->selector = save->selector; | ||
3137 | var->base = save->base; | ||
3138 | var->limit = save->limit; | ||
3139 | ar = save->ar; | ||
3140 | if (seg == VCPU_SREG_TR | 3109 | if (seg == VCPU_SREG_TR |
3141 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) | 3110 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) |
3142 | goto use_saved_rmode_seg; | 3111 | return; |
3112 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
3113 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
3114 | return; | ||
3143 | } | 3115 | } |
3144 | var->base = vmx_read_guest_seg_base(vmx, seg); | 3116 | var->base = vmx_read_guest_seg_base(vmx, seg); |
3145 | var->limit = vmx_read_guest_seg_limit(vmx, seg); | 3117 | var->limit = vmx_read_guest_seg_limit(vmx, seg); |
3146 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | 3118 | var->selector = vmx_read_guest_seg_selector(vmx, seg); |
3147 | ar = vmx_read_guest_seg_ar(vmx, seg); | 3119 | ar = vmx_read_guest_seg_ar(vmx, seg); |
3148 | use_saved_rmode_seg: | ||
3149 | if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) | 3120 | if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) |
3150 | ar = 0; | 3121 | ar = 0; |
3151 | var->type = ar & 15; | 3122 | var->type = ar & 15; |
@@ -3227,23 +3198,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3227 | struct kvm_segment *var, int seg) | 3198 | struct kvm_segment *var, int seg) |
3228 | { | 3199 | { |
3229 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3200 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3230 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 3201 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
3231 | u32 ar; | 3202 | u32 ar; |
3232 | 3203 | ||
3233 | vmx_segment_cache_clear(vmx); | 3204 | vmx_segment_cache_clear(vmx); |
3234 | 3205 | ||
3235 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { | 3206 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { |
3236 | vmcs_write16(sf->selector, var->selector); | 3207 | vmcs_write16(sf->selector, var->selector); |
3237 | vmx->rmode.tr.selector = var->selector; | 3208 | vmx->rmode.segs[VCPU_SREG_TR] = *var; |
3238 | vmx->rmode.tr.base = var->base; | ||
3239 | vmx->rmode.tr.limit = var->limit; | ||
3240 | vmx->rmode.tr.ar = vmx_segment_access_rights(var); | ||
3241 | return; | 3209 | return; |
3242 | } | 3210 | } |
3243 | vmcs_writel(sf->base, var->base); | 3211 | vmcs_writel(sf->base, var->base); |
3244 | vmcs_write32(sf->limit, var->limit); | 3212 | vmcs_write32(sf->limit, var->limit); |
3245 | vmcs_write16(sf->selector, var->selector); | 3213 | vmcs_write16(sf->selector, var->selector); |
3246 | if (vmx->rmode.vm86_active && var->s) { | 3214 | if (vmx->rmode.vm86_active && var->s) { |
3215 | vmx->rmode.segs[seg] = *var; | ||
3247 | /* | 3216 | /* |
3248 | * Hack real-mode segments into vm86 compatibility. | 3217 | * Hack real-mode segments into vm86 compatibility. |
3249 | */ | 3218 | */ |
@@ -3258,7 +3227,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3258 | * qemu binaries. | 3227 | * qemu binaries. |
3259 | * IA32 arch specifies that at the time of processor reset the | 3228 | * IA32 arch specifies that at the time of processor reset the |
3260 | * "Accessed" bit in the AR field of segment registers is 1. And qemu | 3229 | * "Accessed" bit in the AR field of segment registers is 1. And qemu |
3261 | * is setting it to 0 in the usedland code. This causes invalid guest | 3230 | * is setting it to 0 in the userland code. This causes invalid guest |
3262 | * state vmexit when "unrestricted guest" mode is turned on. | 3231 | * state vmexit when "unrestricted guest" mode is turned on. |
3263 | * Fix for this setup issue in cpu_reset is being pushed in the qemu | 3232 | * Fix for this setup issue in cpu_reset is being pushed in the qemu |
3264 | * tree. Newer qemu binaries with that qemu fix would not need this | 3233 | * tree. Newer qemu binaries with that qemu fix would not need this |
@@ -3288,16 +3257,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3288 | vmcs_readl(GUEST_CS_BASE) >> 4); | 3257 | vmcs_readl(GUEST_CS_BASE) >> 4); |
3289 | break; | 3258 | break; |
3290 | case VCPU_SREG_ES: | 3259 | case VCPU_SREG_ES: |
3291 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); | ||
3292 | break; | ||
3293 | case VCPU_SREG_DS: | 3260 | case VCPU_SREG_DS: |
3294 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); | ||
3295 | break; | ||
3296 | case VCPU_SREG_GS: | 3261 | case VCPU_SREG_GS: |
3297 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); | ||
3298 | break; | ||
3299 | case VCPU_SREG_FS: | 3262 | case VCPU_SREG_FS: |
3300 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); | 3263 | fix_rmode_seg(seg, &vmx->rmode.segs[seg]); |
3301 | break; | 3264 | break; |
3302 | case VCPU_SREG_SS: | 3265 | case VCPU_SREG_SS: |
3303 | vmcs_write16(GUEST_SS_SELECTOR, | 3266 | vmcs_write16(GUEST_SS_SELECTOR, |
@@ -3351,9 +3314,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | |||
3351 | 3314 | ||
3352 | if (var.base != (var.selector << 4)) | 3315 | if (var.base != (var.selector << 4)) |
3353 | return false; | 3316 | return false; |
3354 | if (var.limit != 0xffff) | 3317 | if (var.limit < 0xffff) |
3355 | return false; | 3318 | return false; |
3356 | if (ar != 0xf3) | 3319 | if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) |
3357 | return false; | 3320 | return false; |
3358 | 3321 | ||
3359 | return true; | 3322 | return true; |
@@ -3605,7 +3568,7 @@ out: | |||
3605 | 3568 | ||
3606 | static void seg_setup(int seg) | 3569 | static void seg_setup(int seg) |
3607 | { | 3570 | { |
3608 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 3571 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
3609 | unsigned int ar; | 3572 | unsigned int ar; |
3610 | 3573 | ||
3611 | vmcs_write16(sf->selector, 0); | 3574 | vmcs_write16(sf->selector, 0); |
@@ -3770,8 +3733,7 @@ static void vmx_set_constant_host_state(void) | |||
3770 | native_store_idt(&dt); | 3733 | native_store_idt(&dt); |
3771 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | 3734 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ |
3772 | 3735 | ||
3773 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); | 3736 | vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ |
3774 | vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ | ||
3775 | 3737 | ||
3776 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | 3738 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); |
3777 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | 3739 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); |
@@ -4005,8 +3967,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
4005 | kvm_rip_write(vcpu, 0); | 3967 | kvm_rip_write(vcpu, 0); |
4006 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); | 3968 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); |
4007 | 3969 | ||
4008 | vmcs_writel(GUEST_DR7, 0x400); | ||
4009 | |||
4010 | vmcs_writel(GUEST_GDTR_BASE, 0); | 3970 | vmcs_writel(GUEST_GDTR_BASE, 0); |
4011 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | 3971 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); |
4012 | 3972 | ||
@@ -4456,7 +4416,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
4456 | hypercall[2] = 0xc1; | 4416 | hypercall[2] = 0xc1; |
4457 | } | 4417 | } |
4458 | 4418 | ||
4459 | /* called to set cr0 as approriate for a mov-to-cr0 exit. */ | 4419 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ |
4460 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | 4420 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) |
4461 | { | 4421 | { |
4462 | if (to_vmx(vcpu)->nested.vmxon && | 4422 | if (to_vmx(vcpu)->nested.vmxon && |
@@ -5701,7 +5661,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) | |||
5701 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 5661 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
5702 | * to be done to userspace and return 0. | 5662 | * to be done to userspace and return 0. |
5703 | */ | 5663 | */ |
5704 | static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | 5664 | static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { |
5705 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | 5665 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, |
5706 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | 5666 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, |
5707 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, | 5667 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, |
@@ -6229,17 +6189,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) | |||
6229 | msrs[i].host); | 6189 | msrs[i].host); |
6230 | } | 6190 | } |
6231 | 6191 | ||
6232 | #ifdef CONFIG_X86_64 | ||
6233 | #define R "r" | ||
6234 | #define Q "q" | ||
6235 | #else | ||
6236 | #define R "e" | ||
6237 | #define Q "l" | ||
6238 | #endif | ||
6239 | |||
6240 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | 6192 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) |
6241 | { | 6193 | { |
6242 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6194 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
6195 | unsigned long debugctlmsr; | ||
6243 | 6196 | ||
6244 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { | 6197 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { |
6245 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | 6198 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
@@ -6279,34 +6232,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6279 | vmx_set_interrupt_shadow(vcpu, 0); | 6232 | vmx_set_interrupt_shadow(vcpu, 0); |
6280 | 6233 | ||
6281 | atomic_switch_perf_msrs(vmx); | 6234 | atomic_switch_perf_msrs(vmx); |
6235 | debugctlmsr = get_debugctlmsr(); | ||
6282 | 6236 | ||
6283 | vmx->__launched = vmx->loaded_vmcs->launched; | 6237 | vmx->__launched = vmx->loaded_vmcs->launched; |
6284 | asm( | 6238 | asm( |
6285 | /* Store host registers */ | 6239 | /* Store host registers */ |
6286 | "push %%"R"dx; push %%"R"bp;" | 6240 | "push %%" _ASM_DX "; push %%" _ASM_BP ";" |
6287 | "push %%"R"cx \n\t" /* placeholder for guest rcx */ | 6241 | "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ |
6288 | "push %%"R"cx \n\t" | 6242 | "push %%" _ASM_CX " \n\t" |
6289 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" | 6243 | "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" |
6290 | "je 1f \n\t" | 6244 | "je 1f \n\t" |
6291 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" | 6245 | "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" |
6292 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 6246 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
6293 | "1: \n\t" | 6247 | "1: \n\t" |
6294 | /* Reload cr2 if changed */ | 6248 | /* Reload cr2 if changed */ |
6295 | "mov %c[cr2](%0), %%"R"ax \n\t" | 6249 | "mov %c[cr2](%0), %%" _ASM_AX " \n\t" |
6296 | "mov %%cr2, %%"R"dx \n\t" | 6250 | "mov %%cr2, %%" _ASM_DX " \n\t" |
6297 | "cmp %%"R"ax, %%"R"dx \n\t" | 6251 | "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" |
6298 | "je 2f \n\t" | 6252 | "je 2f \n\t" |
6299 | "mov %%"R"ax, %%cr2 \n\t" | 6253 | "mov %%" _ASM_AX", %%cr2 \n\t" |
6300 | "2: \n\t" | 6254 | "2: \n\t" |
6301 | /* Check if vmlaunch of vmresume is needed */ | 6255 | /* Check if vmlaunch of vmresume is needed */ |
6302 | "cmpl $0, %c[launched](%0) \n\t" | 6256 | "cmpl $0, %c[launched](%0) \n\t" |
6303 | /* Load guest registers. Don't clobber flags. */ | 6257 | /* Load guest registers. Don't clobber flags. */ |
6304 | "mov %c[rax](%0), %%"R"ax \n\t" | 6258 | "mov %c[rax](%0), %%" _ASM_AX " \n\t" |
6305 | "mov %c[rbx](%0), %%"R"bx \n\t" | 6259 | "mov %c[rbx](%0), %%" _ASM_BX " \n\t" |
6306 | "mov %c[rdx](%0), %%"R"dx \n\t" | 6260 | "mov %c[rdx](%0), %%" _ASM_DX " \n\t" |
6307 | "mov %c[rsi](%0), %%"R"si \n\t" | 6261 | "mov %c[rsi](%0), %%" _ASM_SI " \n\t" |
6308 | "mov %c[rdi](%0), %%"R"di \n\t" | 6262 | "mov %c[rdi](%0), %%" _ASM_DI " \n\t" |
6309 | "mov %c[rbp](%0), %%"R"bp \n\t" | 6263 | "mov %c[rbp](%0), %%" _ASM_BP " \n\t" |
6310 | #ifdef CONFIG_X86_64 | 6264 | #ifdef CONFIG_X86_64 |
6311 | "mov %c[r8](%0), %%r8 \n\t" | 6265 | "mov %c[r8](%0), %%r8 \n\t" |
6312 | "mov %c[r9](%0), %%r9 \n\t" | 6266 | "mov %c[r9](%0), %%r9 \n\t" |
@@ -6317,24 +6271,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6317 | "mov %c[r14](%0), %%r14 \n\t" | 6271 | "mov %c[r14](%0), %%r14 \n\t" |
6318 | "mov %c[r15](%0), %%r15 \n\t" | 6272 | "mov %c[r15](%0), %%r15 \n\t" |
6319 | #endif | 6273 | #endif |
6320 | "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ | 6274 | "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ |
6321 | 6275 | ||
6322 | /* Enter guest mode */ | 6276 | /* Enter guest mode */ |
6323 | "jne .Llaunched \n\t" | 6277 | "jne 1f \n\t" |
6324 | __ex(ASM_VMX_VMLAUNCH) "\n\t" | 6278 | __ex(ASM_VMX_VMLAUNCH) "\n\t" |
6325 | "jmp .Lkvm_vmx_return \n\t" | 6279 | "jmp 2f \n\t" |
6326 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" | 6280 | "1: " __ex(ASM_VMX_VMRESUME) "\n\t" |
6327 | ".Lkvm_vmx_return: " | 6281 | "2: " |
6328 | /* Save guest registers, load host registers, keep flags */ | 6282 | /* Save guest registers, load host registers, keep flags */ |
6329 | "mov %0, %c[wordsize](%%"R"sp) \n\t" | 6283 | "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" |
6330 | "pop %0 \n\t" | 6284 | "pop %0 \n\t" |
6331 | "mov %%"R"ax, %c[rax](%0) \n\t" | 6285 | "mov %%" _ASM_AX ", %c[rax](%0) \n\t" |
6332 | "mov %%"R"bx, %c[rbx](%0) \n\t" | 6286 | "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" |
6333 | "pop"Q" %c[rcx](%0) \n\t" | 6287 | __ASM_SIZE(pop) " %c[rcx](%0) \n\t" |
6334 | "mov %%"R"dx, %c[rdx](%0) \n\t" | 6288 | "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" |
6335 | "mov %%"R"si, %c[rsi](%0) \n\t" | 6289 | "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" |
6336 | "mov %%"R"di, %c[rdi](%0) \n\t" | 6290 | "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" |
6337 | "mov %%"R"bp, %c[rbp](%0) \n\t" | 6291 | "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" |
6338 | #ifdef CONFIG_X86_64 | 6292 | #ifdef CONFIG_X86_64 |
6339 | "mov %%r8, %c[r8](%0) \n\t" | 6293 | "mov %%r8, %c[r8](%0) \n\t" |
6340 | "mov %%r9, %c[r9](%0) \n\t" | 6294 | "mov %%r9, %c[r9](%0) \n\t" |
@@ -6345,11 +6299,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6345 | "mov %%r14, %c[r14](%0) \n\t" | 6299 | "mov %%r14, %c[r14](%0) \n\t" |
6346 | "mov %%r15, %c[r15](%0) \n\t" | 6300 | "mov %%r15, %c[r15](%0) \n\t" |
6347 | #endif | 6301 | #endif |
6348 | "mov %%cr2, %%"R"ax \n\t" | 6302 | "mov %%cr2, %%" _ASM_AX " \n\t" |
6349 | "mov %%"R"ax, %c[cr2](%0) \n\t" | 6303 | "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" |
6350 | 6304 | ||
6351 | "pop %%"R"bp; pop %%"R"dx \n\t" | 6305 | "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" |
6352 | "setbe %c[fail](%0) \n\t" | 6306 | "setbe %c[fail](%0) \n\t" |
6307 | ".pushsection .rodata \n\t" | ||
6308 | ".global vmx_return \n\t" | ||
6309 | "vmx_return: " _ASM_PTR " 2b \n\t" | ||
6310 | ".popsection" | ||
6353 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 6311 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
6354 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), | 6312 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), |
6355 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 6313 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
@@ -6374,12 +6332,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6374 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), | 6332 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), |
6375 | [wordsize]"i"(sizeof(ulong)) | 6333 | [wordsize]"i"(sizeof(ulong)) |
6376 | : "cc", "memory" | 6334 | : "cc", "memory" |
6377 | , R"ax", R"bx", R"di", R"si" | ||
6378 | #ifdef CONFIG_X86_64 | 6335 | #ifdef CONFIG_X86_64 |
6336 | , "rax", "rbx", "rdi", "rsi" | ||
6379 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 6337 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
6338 | #else | ||
6339 | , "eax", "ebx", "edi", "esi" | ||
6380 | #endif | 6340 | #endif |
6381 | ); | 6341 | ); |
6382 | 6342 | ||
6343 | /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ | ||
6344 | if (debugctlmsr) | ||
6345 | update_debugctlmsr(debugctlmsr); | ||
6346 | |||
6383 | #ifndef CONFIG_X86_64 | 6347 | #ifndef CONFIG_X86_64 |
6384 | /* | 6348 | /* |
6385 | * The sysexit path does not restore ds/es, so we must set them to | 6349 | * The sysexit path does not restore ds/es, so we must set them to |
@@ -6424,9 +6388,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6424 | vmx_complete_interrupts(vmx); | 6388 | vmx_complete_interrupts(vmx); |
6425 | } | 6389 | } |
6426 | 6390 | ||
6427 | #undef R | ||
6428 | #undef Q | ||
6429 | |||
6430 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | 6391 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) |
6431 | { | 6392 | { |
6432 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6393 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
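The vmx_vcpu_run() hunks above also retire the .Lkvm_vmx_return local label: the vmexit return address is now emitted from inside the asm body into .rodata under the global symbol vmx_return, which the new extern declaration near the top of vmx.c and the HOST_RIP write in vmx_set_constant_host_state() consume. This avoids the old "mov $.Lkvm_vmx_return, %0" at run time, which is fragile when the compiler duplicates or renames the containing function (for example under LTO). A standalone sketch of the same label-export pattern follows; the symbol and function names are made up for illustration, and it should be built with gcc -no-pie since the kernel itself is not a PIE.

#include <stdio.h>

/* Sketch of the label-export pattern (made-up names, not the patch's code). */
extern const unsigned long marker_rip;	/* defined by the asm below */

static void emit_marker(void)
{
	asm volatile(
		"1:\n\t"			/* the address we want to publish */
		".pushsection .rodata\n\t"
		".global marker_rip\n\t"
#ifdef __x86_64__
		"marker_rip: .quad 1b\n\t"
#else
		"marker_rip: .long 1b\n\t"
#endif
		".popsection");
}

int main(void)
{
	printf("exported label: %#lx, containing function: %p\n",
	       marker_rip, (void *)emit_marker);
	return 0;
}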
@@ -7281,7 +7242,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
7281 | .vcpu_load = vmx_vcpu_load, | 7242 | .vcpu_load = vmx_vcpu_load, |
7282 | .vcpu_put = vmx_vcpu_put, | 7243 | .vcpu_put = vmx_vcpu_put, |
7283 | 7244 | ||
7284 | .set_guest_debug = set_guest_debug, | 7245 | .update_db_bp_intercept = update_exception_bitmap, |
7285 | .get_msr = vmx_get_msr, | 7246 | .get_msr = vmx_get_msr, |
7286 | .set_msr = vmx_set_msr, | 7247 | .set_msr = vmx_set_msr, |
7287 | .get_segment_base = vmx_get_segment_base, | 7248 | .get_segment_base = vmx_get_segment_base, |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f09552572fa..1eefebe5d727 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore) | |||
246 | 246 | ||
247 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | 247 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) |
248 | { | 248 | { |
249 | if (irqchip_in_kernel(vcpu->kvm)) | 249 | return vcpu->arch.apic_base; |
250 | return vcpu->arch.apic_base; | ||
251 | else | ||
252 | return vcpu->arch.apic_base; | ||
253 | } | 250 | } |
254 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | 251 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); |
255 | 252 | ||
256 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | 253 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) |
257 | { | 254 | { |
258 | /* TODO: reserve bits check */ | 255 | /* TODO: reserve bits check */ |
259 | if (irqchip_in_kernel(vcpu->kvm)) | 256 | kvm_lapic_set_base(vcpu, data); |
260 | kvm_lapic_set_base(vcpu, data); | ||
261 | else | ||
262 | vcpu->arch.apic_base = data; | ||
263 | } | 257 | } |
264 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | 258 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); |
265 | 259 | ||
@@ -698,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | |||
698 | } | 692 | } |
699 | EXPORT_SYMBOL_GPL(kvm_get_cr8); | 693 | EXPORT_SYMBOL_GPL(kvm_get_cr8); |
700 | 694 | ||
695 | static void kvm_update_dr7(struct kvm_vcpu *vcpu) | ||
696 | { | ||
697 | unsigned long dr7; | ||
698 | |||
699 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | ||
700 | dr7 = vcpu->arch.guest_debug_dr7; | ||
701 | else | ||
702 | dr7 = vcpu->arch.dr7; | ||
703 | kvm_x86_ops->set_dr7(vcpu, dr7); | ||
704 | vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); | ||
705 | } | ||
706 | |||
701 | static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | 707 | static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) |
702 | { | 708 | { |
703 | switch (dr) { | 709 | switch (dr) { |
@@ -723,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | |||
723 | if (val & 0xffffffff00000000ULL) | 729 | if (val & 0xffffffff00000000ULL) |
724 | return -1; /* #GP */ | 730 | return -1; /* #GP */ |
725 | vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; | 731 | vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; |
726 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { | 732 | kvm_update_dr7(vcpu); |
727 | kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); | ||
728 | vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); | ||
729 | } | ||
730 | break; | 733 | break; |
731 | } | 734 | } |
732 | 735 | ||
@@ -823,7 +826,7 @@ static u32 msrs_to_save[] = { | |||
823 | 826 | ||
824 | static unsigned num_msrs_to_save; | 827 | static unsigned num_msrs_to_save; |
825 | 828 | ||
826 | static u32 emulated_msrs[] = { | 829 | static const u32 emulated_msrs[] = { |
827 | MSR_IA32_TSCDEADLINE, | 830 | MSR_IA32_TSCDEADLINE, |
828 | MSR_IA32_MISC_ENABLE, | 831 | MSR_IA32_MISC_ENABLE, |
829 | MSR_IA32_MCG_STATUS, | 832 | MSR_IA32_MCG_STATUS, |
@@ -1097,7 +1100,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1097 | * For each generation, we track the original measured | 1100 | * For each generation, we track the original measured |
1098 | * nanosecond time, offset, and write, so if TSCs are in | 1101 | * nanosecond time, offset, and write, so if TSCs are in |
1099 | * sync, we can match exact offset, and if not, we can match | 1102 | * sync, we can match exact offset, and if not, we can match |
1100 | * exact software computaion in compute_guest_tsc() | 1103 | * exact software computation in compute_guest_tsc() |
1101 | * | 1104 | * |
1102 | * These values are tracked in kvm->arch.cur_xxx variables. | 1105 | * These values are tracked in kvm->arch.cur_xxx variables. |
1103 | */ | 1106 | */ |
@@ -1140,6 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1140 | unsigned long this_tsc_khz; | 1143 | unsigned long this_tsc_khz; |
1141 | s64 kernel_ns, max_kernel_ns; | 1144 | s64 kernel_ns, max_kernel_ns; |
1142 | u64 tsc_timestamp; | 1145 | u64 tsc_timestamp; |
1146 | u8 pvclock_flags; | ||
1143 | 1147 | ||
1144 | /* Keep irq disabled to prevent changes to the clock */ | 1148 | /* Keep irq disabled to prevent changes to the clock */ |
1145 | local_irq_save(flags); | 1149 | local_irq_save(flags); |
@@ -1221,7 +1225,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1221 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; | 1225 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; |
1222 | vcpu->last_kernel_ns = kernel_ns; | 1226 | vcpu->last_kernel_ns = kernel_ns; |
1223 | vcpu->last_guest_tsc = tsc_timestamp; | 1227 | vcpu->last_guest_tsc = tsc_timestamp; |
1224 | vcpu->hv_clock.flags = 0; | 1228 | |
1229 | pvclock_flags = 0; | ||
1230 | if (vcpu->pvclock_set_guest_stopped_request) { | ||
1231 | pvclock_flags |= PVCLOCK_GUEST_STOPPED; | ||
1232 | vcpu->pvclock_set_guest_stopped_request = false; | ||
1233 | } | ||
1234 | |||
1235 | vcpu->hv_clock.flags = pvclock_flags; | ||
1225 | 1236 | ||
1226 | /* | 1237 | /* |
1227 | * The interface expects us to write an even number signaling that the | 1238 | * The interface expects us to write an even number signaling that the |
@@ -1504,7 +1515,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) | |||
1504 | { | 1515 | { |
1505 | gpa_t gpa = data & ~0x3f; | 1516 | gpa_t gpa = data & ~0x3f; |
1506 | 1517 | ||
1507 | /* Bits 2:5 are resrved, Should be zero */ | 1518 | /* Bits 2:5 are reserved, Should be zero */ |
1508 | if (data & 0x3c) | 1519 | if (data & 0x3c) |
1509 | return 1; | 1520 | return 1; |
1510 | 1521 | ||
@@ -1639,10 +1650,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1639 | vcpu->arch.time_page = | 1650 | vcpu->arch.time_page = |
1640 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | 1651 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); |
1641 | 1652 | ||
1642 | if (is_error_page(vcpu->arch.time_page)) { | 1653 | if (is_error_page(vcpu->arch.time_page)) |
1643 | kvm_release_page_clean(vcpu->arch.time_page); | ||
1644 | vcpu->arch.time_page = NULL; | 1654 | vcpu->arch.time_page = NULL; |
1645 | } | 1655 | |
1646 | break; | 1656 | break; |
1647 | } | 1657 | } |
1648 | case MSR_KVM_ASYNC_PF_EN: | 1658 | case MSR_KVM_ASYNC_PF_EN: |
@@ -1727,7 +1737,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1727 | * Ignore all writes to this no longer documented MSR. | 1737 | * Ignore all writes to this no longer documented MSR. |
1728 | * Writes are only relevant for old K7 processors, | 1738 | * Writes are only relevant for old K7 processors, |
1729 | * all pre-dating SVM, but a recommended workaround from | 1739 | * all pre-dating SVM, but a recommended workaround from |
1730 | * AMD for these chips. It is possible to speicify the | 1740 | * AMD for these chips. It is possible to specify the |
1731 | * affected processor models on the command line, hence | 1741 | * affected processor models on the command line, hence |
1732 | * the need to ignore the workaround. | 1742 | * the need to ignore the workaround. |
1733 | */ | 1743 | */ |
@@ -2177,6 +2187,8 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2177 | case KVM_CAP_GET_TSC_KHZ: | 2187 | case KVM_CAP_GET_TSC_KHZ: |
2178 | case KVM_CAP_PCI_2_3: | 2188 | case KVM_CAP_PCI_2_3: |
2179 | case KVM_CAP_KVMCLOCK_CTRL: | 2189 | case KVM_CAP_KVMCLOCK_CTRL: |
2190 | case KVM_CAP_READONLY_MEM: | ||
2191 | case KVM_CAP_IRQFD_RESAMPLE: | ||
2180 | r = 1; | 2192 | r = 1; |
2181 | break; | 2193 | break; |
2182 | case KVM_CAP_COALESCED_MMIO: | 2194 | case KVM_CAP_COALESCED_MMIO: |
@@ -2358,8 +2370,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | |||
2358 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | 2370 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, |
2359 | struct kvm_lapic_state *s) | 2371 | struct kvm_lapic_state *s) |
2360 | { | 2372 | { |
2361 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | 2373 | kvm_apic_post_state_restore(vcpu, s); |
2362 | kvm_apic_post_state_restore(vcpu); | ||
2363 | update_cr8_intercept(vcpu); | 2374 | update_cr8_intercept(vcpu); |
2364 | 2375 | ||
2365 | return 0; | 2376 | return 0; |
@@ -2368,7 +2379,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | |||
2368 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | 2379 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, |
2369 | struct kvm_interrupt *irq) | 2380 | struct kvm_interrupt *irq) |
2370 | { | 2381 | { |
2371 | if (irq->irq < 0 || irq->irq >= 256) | 2382 | if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) |
2372 | return -EINVAL; | 2383 | return -EINVAL; |
2373 | if (irqchip_in_kernel(vcpu->kvm)) | 2384 | if (irqchip_in_kernel(vcpu->kvm)) |
2374 | return -ENXIO; | 2385 | return -ENXIO; |
@@ -2635,11 +2646,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, | |||
2635 | */ | 2646 | */ |
2636 | static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) | 2647 | static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) |
2637 | { | 2648 | { |
2638 | struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; | ||
2639 | if (!vcpu->arch.time_page) | 2649 | if (!vcpu->arch.time_page) |
2640 | return -EINVAL; | 2650 | return -EINVAL; |
2641 | src->flags |= PVCLOCK_GUEST_STOPPED; | 2651 | vcpu->arch.pvclock_set_guest_stopped_request = true; |
2642 | mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); | ||
2643 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2652 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
2644 | return 0; | 2653 | return 0; |
2645 | } | 2654 | } |
@@ -3090,7 +3099,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
3090 | if (!kvm->arch.vpit) | 3099 | if (!kvm->arch.vpit) |
3091 | return -ENXIO; | 3100 | return -ENXIO; |
3092 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 3101 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
3093 | kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; | 3102 | kvm->arch.vpit->pit_state.reinject = control->pit_reinject; |
3094 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | 3103 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); |
3095 | return 0; | 3104 | return 0; |
3096 | } | 3105 | } |
@@ -3173,6 +3182,16 @@ out: | |||
3173 | return r; | 3182 | return r; |
3174 | } | 3183 | } |
3175 | 3184 | ||
3185 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) | ||
3186 | { | ||
3187 | if (!irqchip_in_kernel(kvm)) | ||
3188 | return -ENXIO; | ||
3189 | |||
3190 | irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | ||
3191 | irq_event->irq, irq_event->level); | ||
3192 | return 0; | ||
3193 | } | ||
3194 | |||
3176 | long kvm_arch_vm_ioctl(struct file *filp, | 3195 | long kvm_arch_vm_ioctl(struct file *filp, |
3177 | unsigned int ioctl, unsigned long arg) | 3196 | unsigned int ioctl, unsigned long arg) |
3178 | { | 3197 | { |
@@ -3279,29 +3298,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3279 | create_pit_unlock: | 3298 | create_pit_unlock: |
3280 | mutex_unlock(&kvm->slots_lock); | 3299 | mutex_unlock(&kvm->slots_lock); |
3281 | break; | 3300 | break; |
3282 | case KVM_IRQ_LINE_STATUS: | ||
3283 | case KVM_IRQ_LINE: { | ||
3284 | struct kvm_irq_level irq_event; | ||
3285 | |||
3286 | r = -EFAULT; | ||
3287 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
3288 | goto out; | ||
3289 | r = -ENXIO; | ||
3290 | if (irqchip_in_kernel(kvm)) { | ||
3291 | __s32 status; | ||
3292 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | ||
3293 | irq_event.irq, irq_event.level); | ||
3294 | if (ioctl == KVM_IRQ_LINE_STATUS) { | ||
3295 | r = -EFAULT; | ||
3296 | irq_event.status = status; | ||
3297 | if (copy_to_user(argp, &irq_event, | ||
3298 | sizeof irq_event)) | ||
3299 | goto out; | ||
3300 | } | ||
3301 | r = 0; | ||
3302 | } | ||
3303 | break; | ||
3304 | } | ||
3305 | case KVM_GET_IRQCHIP: { | 3301 | case KVM_GET_IRQCHIP: { |
3306 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 3302 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
3307 | struct kvm_irqchip *chip; | 3303 | struct kvm_irqchip *chip; |
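The two x86.c hunks above move the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS handling out of kvm_arch_vm_ioctl(): the user-copy plumbing goes to generic KVM code elsewhere in this series, which calls the new arch hook kvm_vm_ioctl_irq_line(). The userspace side of the ioctl is unchanged; a minimal caller sketch follows (the helper name and the omitted error handling are illustrative, not from this patch).

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Raise or lower a guest interrupt line from userspace (sketch). */
static int set_gsi_level(int vm_fd, unsigned int gsi, int level)
{
	struct kvm_irq_level irq = {
		.irq   = gsi,
		.level = level,
	};

	/* KVM_IRQ_LINE_STATUS is the same request, but delivery status is
	 * reported back through the union field overlapping 'irq'. */
	return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}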
@@ -3689,20 +3685,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | |||
3689 | gpa_t *gpa, struct x86_exception *exception, | 3685 | gpa_t *gpa, struct x86_exception *exception, |
3690 | bool write) | 3686 | bool write) |
3691 | { | 3687 | { |
3692 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3688 | u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) |
3689 | | (write ? PFERR_WRITE_MASK : 0); | ||
3693 | 3690 | ||
3694 | if (vcpu_match_mmio_gva(vcpu, gva) && | 3691 | if (vcpu_match_mmio_gva(vcpu, gva) |
3695 | check_write_user_access(vcpu, write, access, | 3692 | && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { |
3696 | vcpu->arch.access)) { | ||
3697 | *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | | 3693 | *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | |
3698 | (gva & (PAGE_SIZE - 1)); | 3694 | (gva & (PAGE_SIZE - 1)); |
3699 | trace_vcpu_match_mmio(gva, *gpa, write, false); | 3695 | trace_vcpu_match_mmio(gva, *gpa, write, false); |
3700 | return 1; | 3696 | return 1; |
3701 | } | 3697 | } |
3702 | 3698 | ||
3703 | if (write) | ||
3704 | access |= PFERR_WRITE_MASK; | ||
3705 | |||
3706 | *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); | 3699 | *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3707 | 3700 | ||
3708 | if (*gpa == UNMAPPED_GVA) | 3701 | if (*gpa == UNMAPPED_GVA) |
@@ -3790,14 +3783,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3790 | return X86EMUL_CONTINUE; | 3783 | return X86EMUL_CONTINUE; |
3791 | } | 3784 | } |
3792 | 3785 | ||
3793 | static struct read_write_emulator_ops read_emultor = { | 3786 | static const struct read_write_emulator_ops read_emultor = { |
3794 | .read_write_prepare = read_prepare, | 3787 | .read_write_prepare = read_prepare, |
3795 | .read_write_emulate = read_emulate, | 3788 | .read_write_emulate = read_emulate, |
3796 | .read_write_mmio = vcpu_mmio_read, | 3789 | .read_write_mmio = vcpu_mmio_read, |
3797 | .read_write_exit_mmio = read_exit_mmio, | 3790 | .read_write_exit_mmio = read_exit_mmio, |
3798 | }; | 3791 | }; |
3799 | 3792 | ||
3800 | static struct read_write_emulator_ops write_emultor = { | 3793 | static const struct read_write_emulator_ops write_emultor = { |
3801 | .read_write_emulate = write_emulate, | 3794 | .read_write_emulate = write_emulate, |
3802 | .read_write_mmio = write_mmio, | 3795 | .read_write_mmio = write_mmio, |
3803 | .read_write_exit_mmio = write_exit_mmio, | 3796 | .read_write_exit_mmio = write_exit_mmio, |
@@ -3808,7 +3801,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, | |||
3808 | unsigned int bytes, | 3801 | unsigned int bytes, |
3809 | struct x86_exception *exception, | 3802 | struct x86_exception *exception, |
3810 | struct kvm_vcpu *vcpu, | 3803 | struct kvm_vcpu *vcpu, |
3811 | struct read_write_emulator_ops *ops) | 3804 | const struct read_write_emulator_ops *ops) |
3812 | { | 3805 | { |
3813 | gpa_t gpa; | 3806 | gpa_t gpa; |
3814 | int handled, ret; | 3807 | int handled, ret; |
@@ -3857,7 +3850,7 @@ mmio: | |||
3857 | int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | 3850 | int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, |
3858 | void *val, unsigned int bytes, | 3851 | void *val, unsigned int bytes, |
3859 | struct x86_exception *exception, | 3852 | struct x86_exception *exception, |
3860 | struct read_write_emulator_ops *ops) | 3853 | const struct read_write_emulator_ops *ops) |
3861 | { | 3854 | { |
3862 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 3855 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3863 | gpa_t gpa; | 3856 | gpa_t gpa; |
@@ -3962,10 +3955,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, | |||
3962 | goto emul_write; | 3955 | goto emul_write; |
3963 | 3956 | ||
3964 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 3957 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
3965 | if (is_error_page(page)) { | 3958 | if (is_error_page(page)) |
3966 | kvm_release_page_clean(page); | ||
3967 | goto emul_write; | 3959 | goto emul_write; |
3968 | } | ||
3969 | 3960 | ||
3970 | kaddr = kmap_atomic(page); | 3961 | kaddr = kmap_atomic(page); |
3971 | kaddr += offset_in_page(gpa); | 3962 | kaddr += offset_in_page(gpa); |
@@ -4332,7 +4323,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, | |||
4332 | kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); | 4323 | kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); |
4333 | } | 4324 | } |
4334 | 4325 | ||
4335 | static struct x86_emulate_ops emulate_ops = { | 4326 | static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) |
4327 | { | ||
4328 | return kvm_register_read(emul_to_vcpu(ctxt), reg); | ||
4329 | } | ||
4330 | |||
4331 | static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) | ||
4332 | { | ||
4333 | kvm_register_write(emul_to_vcpu(ctxt), reg, val); | ||
4334 | } | ||
4335 | |||
4336 | static const struct x86_emulate_ops emulate_ops = { | ||
4337 | .read_gpr = emulator_read_gpr, | ||
4338 | .write_gpr = emulator_write_gpr, | ||
4336 | .read_std = kvm_read_guest_virt_system, | 4339 | .read_std = kvm_read_guest_virt_system, |
4337 | .write_std = kvm_write_guest_virt_system, | 4340 | .write_std = kvm_write_guest_virt_system, |
4338 | .fetch = kvm_fetch_guest_virt, | 4341 | .fetch = kvm_fetch_guest_virt, |
@@ -4367,14 +4370,6 @@ static struct x86_emulate_ops emulate_ops = { | |||
4367 | .get_cpuid = emulator_get_cpuid, | 4370 | .get_cpuid = emulator_get_cpuid, |
4368 | }; | 4371 | }; |
4369 | 4372 | ||
4370 | static void cache_all_regs(struct kvm_vcpu *vcpu) | ||
4371 | { | ||
4372 | kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
4373 | kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
4374 | kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
4375 | vcpu->arch.regs_dirty = ~0; | ||
4376 | } | ||
4377 | |||
4378 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | 4373 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) |
4379 | { | 4374 | { |
4380 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); | 4375 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); |
@@ -4401,12 +4396,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) | |||
4401 | kvm_queue_exception(vcpu, ctxt->exception.vector); | 4396 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
4402 | } | 4397 | } |
4403 | 4398 | ||
4404 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt, | 4399 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt) |
4405 | const unsigned long *regs) | ||
4406 | { | 4400 | { |
4407 | memset(&ctxt->twobyte, 0, | 4401 | memset(&ctxt->twobyte, 0, |
4408 | (void *)&ctxt->regs - (void *)&ctxt->twobyte); | 4402 | (void *)&ctxt->_regs - (void *)&ctxt->twobyte); |
4409 | memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); | ||
4410 | 4403 | ||
4411 | ctxt->fetch.start = 0; | 4404 | ctxt->fetch.start = 0; |
4412 | ctxt->fetch.end = 0; | 4405 | ctxt->fetch.end = 0; |
@@ -4421,14 +4414,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | |||
4421 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4414 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4422 | int cs_db, cs_l; | 4415 | int cs_db, cs_l; |
4423 | 4416 | ||
4424 | /* | ||
4425 | * TODO: fix emulate.c to use guest_read/write_register | ||
4426 | * instead of direct ->regs accesses, can save hundred cycles | ||
4427 | * on Intel for instructions that don't read/change RSP, for | ||
4428 | * for example. | ||
4429 | */ | ||
4430 | cache_all_regs(vcpu); | ||
4431 | |||
4432 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 4417 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
4433 | 4418 | ||
4434 | ctxt->eflags = kvm_get_rflags(vcpu); | 4419 | ctxt->eflags = kvm_get_rflags(vcpu); |
@@ -4440,7 +4425,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | |||
4440 | X86EMUL_MODE_PROT16; | 4425 | X86EMUL_MODE_PROT16; |
4441 | ctxt->guest_mode = is_guest_mode(vcpu); | 4426 | ctxt->guest_mode = is_guest_mode(vcpu); |
4442 | 4427 | ||
4443 | init_decode_cache(ctxt, vcpu->arch.regs); | 4428 | init_decode_cache(ctxt); |
4444 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4429 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4445 | } | 4430 | } |
4446 | 4431 | ||
@@ -4460,7 +4445,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) | |||
4460 | return EMULATE_FAIL; | 4445 | return EMULATE_FAIL; |
4461 | 4446 | ||
4462 | ctxt->eip = ctxt->_eip; | 4447 | ctxt->eip = ctxt->_eip; |
4463 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); | ||
4464 | kvm_rip_write(vcpu, ctxt->eip); | 4448 | kvm_rip_write(vcpu, ctxt->eip); |
4465 | kvm_set_rflags(vcpu, ctxt->eflags); | 4449 | kvm_set_rflags(vcpu, ctxt->eflags); |
4466 | 4450 | ||
@@ -4493,13 +4477,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) | |||
4493 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | 4477 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) |
4494 | { | 4478 | { |
4495 | gpa_t gpa; | 4479 | gpa_t gpa; |
4480 | pfn_t pfn; | ||
4496 | 4481 | ||
4497 | if (tdp_enabled) | 4482 | if (tdp_enabled) |
4498 | return false; | 4483 | return false; |
4499 | 4484 | ||
4500 | /* | 4485 | /* |
4501 | * if emulation was due to access to shadowed page table | 4486 | * if emulation was due to access to shadowed page table |
4502 | * and it failed try to unshadow page and re-entetr the | 4487 | * and it failed try to unshadow page and re-enter the |
4503 | * guest to let CPU execute the instruction. | 4488 | * guest to let CPU execute the instruction. |
4504 | */ | 4489 | */ |
4505 | if (kvm_mmu_unprotect_page_virt(vcpu, gva)) | 4490 | if (kvm_mmu_unprotect_page_virt(vcpu, gva)) |
@@ -4510,8 +4495,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4510 | if (gpa == UNMAPPED_GVA) | 4495 | if (gpa == UNMAPPED_GVA) |
4511 | return true; /* let cpu generate fault */ | 4496 | return true; /* let cpu generate fault */ |
4512 | 4497 | ||
4513 | if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) | 4498 | /* |
4499 | * Do not retry the unhandleable instruction if it faults on the | ||
4500 | * readonly host memory, otherwise it will go into an infinite loop: | ||
4501 | * retry instruction -> write #PF -> emulation fail -> retry | ||
4502 | * instruction -> ... | ||
4503 | */ | ||
4504 | pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); | ||
4505 | if (!is_error_pfn(pfn)) { | ||
4506 | kvm_release_pfn_clean(pfn); | ||
4514 | return true; | 4507 | return true; |
4508 | } | ||
4515 | 4509 | ||
4516 | return false; | 4510 | return false; |
4517 | } | 4511 | } |
@@ -4560,6 +4554,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, | |||
4560 | return true; | 4554 | return true; |
4561 | } | 4555 | } |
4562 | 4556 | ||
4557 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu); | ||
4558 | static int complete_emulated_pio(struct kvm_vcpu *vcpu); | ||
4559 | |||
4563 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, | 4560 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
4564 | unsigned long cr2, | 4561 | unsigned long cr2, |
4565 | int emulation_type, | 4562 | int emulation_type, |
@@ -4608,7 +4605,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4608 | changes register values during IO operation */ | 4605 | changes register values during IO operation */ |
4609 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { | 4606 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
4610 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4607 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4611 | memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); | 4608 | emulator_invalidate_register_cache(ctxt); |
4612 | } | 4609 | } |
4613 | 4610 | ||
4614 | restart: | 4611 | restart: |
@@ -4630,13 +4627,16 @@ restart: | |||
4630 | } else if (vcpu->arch.pio.count) { | 4627 | } else if (vcpu->arch.pio.count) { |
4631 | if (!vcpu->arch.pio.in) | 4628 | if (!vcpu->arch.pio.in) |
4632 | vcpu->arch.pio.count = 0; | 4629 | vcpu->arch.pio.count = 0; |
4633 | else | 4630 | else { |
4634 | writeback = false; | 4631 | writeback = false; |
4632 | vcpu->arch.complete_userspace_io = complete_emulated_pio; | ||
4633 | } | ||
4635 | r = EMULATE_DO_MMIO; | 4634 | r = EMULATE_DO_MMIO; |
4636 | } else if (vcpu->mmio_needed) { | 4635 | } else if (vcpu->mmio_needed) { |
4637 | if (!vcpu->mmio_is_write) | 4636 | if (!vcpu->mmio_is_write) |
4638 | writeback = false; | 4637 | writeback = false; |
4639 | r = EMULATE_DO_MMIO; | 4638 | r = EMULATE_DO_MMIO; |
4639 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; | ||
4640 | } else if (r == EMULATION_RESTART) | 4640 | } else if (r == EMULATION_RESTART) |
4641 | goto restart; | 4641 | goto restart; |
4642 | else | 4642 | else |
@@ -4646,7 +4646,6 @@ restart: | |||
4646 | toggle_interruptibility(vcpu, ctxt->interruptibility); | 4646 | toggle_interruptibility(vcpu, ctxt->interruptibility); |
4647 | kvm_set_rflags(vcpu, ctxt->eflags); | 4647 | kvm_set_rflags(vcpu, ctxt->eflags); |
4648 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 4648 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
4649 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); | ||
4650 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 4649 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
4651 | kvm_rip_write(vcpu, ctxt->eip); | 4650 | kvm_rip_write(vcpu, ctxt->eip); |
4652 | } else | 4651 | } else |
@@ -4929,6 +4928,7 @@ int kvm_arch_init(void *opaque) | |||
4929 | if (cpu_has_xsave) | 4928 | if (cpu_has_xsave) |
4930 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | 4929 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); |
4931 | 4930 | ||
4931 | kvm_lapic_init(); | ||
4932 | return 0; | 4932 | return 0; |
4933 | 4933 | ||
4934 | out: | 4934 | out: |
@@ -5499,6 +5499,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5499 | return r; | 5499 | return r; |
5500 | } | 5500 | } |
5501 | 5501 | ||
5502 | static inline int complete_emulated_io(struct kvm_vcpu *vcpu) | ||
5503 | { | ||
5504 | int r; | ||
5505 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
5506 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); | ||
5507 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
5508 | if (r != EMULATE_DONE) | ||
5509 | return 0; | ||
5510 | return 1; | ||
5511 | } | ||
5512 | |||
5513 | static int complete_emulated_pio(struct kvm_vcpu *vcpu) | ||
5514 | { | ||
5515 | BUG_ON(!vcpu->arch.pio.count); | ||
5516 | |||
5517 | return complete_emulated_io(vcpu); | ||
5518 | } | ||
5519 | |||
5502 | /* | 5520 | /* |
5503 | * Implements the following, as a state machine: | 5521 | * Implements the following, as a state machine: |
5504 | * | 5522 | * |
@@ -5515,47 +5533,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5515 | * copy data | 5533 | * copy data |
5516 | * exit | 5534 | * exit |
5517 | */ | 5535 | */ |
5518 | static int complete_mmio(struct kvm_vcpu *vcpu) | 5536 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu) |
5519 | { | 5537 | { |
5520 | struct kvm_run *run = vcpu->run; | 5538 | struct kvm_run *run = vcpu->run; |
5521 | struct kvm_mmio_fragment *frag; | 5539 | struct kvm_mmio_fragment *frag; |
5522 | int r; | ||
5523 | 5540 | ||
5524 | if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) | 5541 | BUG_ON(!vcpu->mmio_needed); |
5525 | return 1; | ||
5526 | 5542 | ||
5527 | if (vcpu->mmio_needed) { | 5543 | /* Complete previous fragment */ |
5528 | /* Complete previous fragment */ | 5544 | frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; |
5529 | frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; | 5545 | if (!vcpu->mmio_is_write) |
5530 | if (!vcpu->mmio_is_write) | 5546 | memcpy(frag->data, run->mmio.data, frag->len); |
5531 | memcpy(frag->data, run->mmio.data, frag->len); | 5547 | if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { |
5532 | if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { | 5548 | vcpu->mmio_needed = 0; |
5533 | vcpu->mmio_needed = 0; | ||
5534 | if (vcpu->mmio_is_write) | ||
5535 | return 1; | ||
5536 | vcpu->mmio_read_completed = 1; | ||
5537 | goto done; | ||
5538 | } | ||
5539 | /* Initiate next fragment */ | ||
5540 | ++frag; | ||
5541 | run->exit_reason = KVM_EXIT_MMIO; | ||
5542 | run->mmio.phys_addr = frag->gpa; | ||
5543 | if (vcpu->mmio_is_write) | 5549 | if (vcpu->mmio_is_write) |
5544 | memcpy(run->mmio.data, frag->data, frag->len); | 5550 | return 1; |
5545 | run->mmio.len = frag->len; | 5551 | vcpu->mmio_read_completed = 1; |
5546 | run->mmio.is_write = vcpu->mmio_is_write; | 5552 | return complete_emulated_io(vcpu); |
5547 | return 0; | 5553 | } |
5548 | 5554 | /* Initiate next fragment */ | |
5549 | } | 5555 | ++frag; |
5550 | done: | 5556 | run->exit_reason = KVM_EXIT_MMIO; |
5551 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 5557 | run->mmio.phys_addr = frag->gpa; |
5552 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); | 5558 | if (vcpu->mmio_is_write) |
5553 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5559 | memcpy(run->mmio.data, frag->data, frag->len); |
5554 | if (r != EMULATE_DONE) | 5560 | run->mmio.len = frag->len; |
5555 | return 0; | 5561 | run->mmio.is_write = vcpu->mmio_is_write; |
5556 | return 1; | 5562 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; |
5563 | return 0; | ||
5557 | } | 5564 | } |
5558 | 5565 | ||
5566 | |||
5559 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 5567 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
5560 | { | 5568 | { |
5561 | int r; | 5569 | int r; |
@@ -5582,9 +5590,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5582 | } | 5590 | } |
5583 | } | 5591 | } |
5584 | 5592 | ||
5585 | r = complete_mmio(vcpu); | 5593 | if (unlikely(vcpu->arch.complete_userspace_io)) { |
5586 | if (r <= 0) | 5594 | int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; |
5587 | goto out; | 5595 | vcpu->arch.complete_userspace_io = NULL; |
5596 | r = cui(vcpu); | ||
5597 | if (r <= 0) | ||
5598 | goto out; | ||
5599 | } else | ||
5600 | WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); | ||
5588 | 5601 | ||
5589 | r = __vcpu_run(vcpu); | 5602 | r = __vcpu_run(vcpu); |
5590 | 5603 | ||
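
The complete_userspace_io hook fires on the next KVM_RUN after userspace has serviced the in-flight PIO or MMIO exit, which is what feeds the result back into complete_emulated_pio()/complete_emulated_mmio(). A minimal sketch of the userspace half of that handshake, assuming a vcpu fd obtained via KVM_CREATE_VCPU; the device-model stubs are hypothetical stand-ins, not part of the KVM API:

/*
 * Hedged sketch: service KVM_EXIT_MMIO and re-enter KVM_RUN, the point at
 * which the kernel invokes the stored complete_userspace_io callback.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static uint64_t handle_mmio_read(uint64_t gpa, uint32_t len)
{
	(void)gpa; (void)len;		/* a real VMM decodes the address here */
	return 0;
}

static void handle_mmio_write(uint64_t gpa, const void *data, uint32_t len)
{
	(void)gpa; (void)data; (void)len;
}

static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		if (run->exit_reason != KVM_EXIT_MMIO)
			return 0;	/* hand other exit reasons to the caller */

		if (run->mmio.is_write) {
			handle_mmio_write(run->mmio.phys_addr,
					  run->mmio.data, run->mmio.len);
		} else {
			uint64_t val = handle_mmio_read(run->mmio.phys_addr,
							run->mmio.len);
			memcpy(run->mmio.data, &val, run->mmio.len);
		}
		/* the next KVM_RUN resumes the interrupted instruction */
	}
}
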
@@ -5602,12 +5615,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
5602 | /* | 5615 | /* |
5603 | * We are here if userspace calls get_regs() in the middle of | 5616 | * We are here if userspace calls get_regs() in the middle of |
5604 | * instruction emulation. Registers state needs to be copied | 5617 | * instruction emulation. Registers state needs to be copied |
5605 | * back from emulation context to vcpu. Usrapace shouldn't do | 5618 | * back from emulation context to vcpu. Userspace shouldn't do |
5606 | * that usually, but some badly designed PV devices (vmware | 5619 | * that usually, but some badly designed PV devices (vmware |
5607 | * backdoor interface) need this to work | 5620 | * backdoor interface) need this to work |
5608 | */ | 5621 | */ |
5609 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 5622 | emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); |
5610 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); | ||
5611 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 5623 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
5612 | } | 5624 | } |
5613 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | 5625 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
@@ -5747,7 +5759,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, | |||
5747 | if (ret) | 5759 | if (ret) |
5748 | return EMULATE_FAIL; | 5760 | return EMULATE_FAIL; |
5749 | 5761 | ||
5750 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); | ||
5751 | kvm_rip_write(vcpu, ctxt->eip); | 5762 | kvm_rip_write(vcpu, ctxt->eip); |
5752 | kvm_set_rflags(vcpu, ctxt->eflags); | 5763 | kvm_set_rflags(vcpu, ctxt->eflags); |
5753 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5764 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
@@ -5799,7 +5810,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5799 | if (mmu_reset_needed) | 5810 | if (mmu_reset_needed) |
5800 | kvm_mmu_reset_context(vcpu); | 5811 | kvm_mmu_reset_context(vcpu); |
5801 | 5812 | ||
5802 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | 5813 | max_bits = KVM_NR_INTERRUPTS; |
5803 | pending_vec = find_first_bit( | 5814 | pending_vec = find_first_bit( |
5804 | (const unsigned long *)sregs->interrupt_bitmap, max_bits); | 5815 | (const unsigned long *)sregs->interrupt_bitmap, max_bits); |
5805 | if (pending_vec < max_bits) { | 5816 | if (pending_vec < max_bits) { |
@@ -5859,13 +5870,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | |||
5859 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { | 5870 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { |
5860 | for (i = 0; i < KVM_NR_DB_REGS; ++i) | 5871 | for (i = 0; i < KVM_NR_DB_REGS; ++i) |
5861 | vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; | 5872 | vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; |
5862 | vcpu->arch.switch_db_regs = | 5873 | vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; |
5863 | (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); | ||
5864 | } else { | 5874 | } else { |
5865 | for (i = 0; i < KVM_NR_DB_REGS; i++) | 5875 | for (i = 0; i < KVM_NR_DB_REGS; i++) |
5866 | vcpu->arch.eff_db[i] = vcpu->arch.db[i]; | 5876 | vcpu->arch.eff_db[i] = vcpu->arch.db[i]; |
5867 | vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); | ||
5868 | } | 5877 | } |
5878 | kvm_update_dr7(vcpu); | ||
5869 | 5879 | ||
5870 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 5880 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
5871 | vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + | 5881 | vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + |
@@ -5877,7 +5887,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | |||
5877 | */ | 5887 | */ |
5878 | kvm_set_rflags(vcpu, rflags); | 5888 | kvm_set_rflags(vcpu, rflags); |
5879 | 5889 | ||
5880 | kvm_x86_ops->set_guest_debug(vcpu, dbg); | 5890 | kvm_x86_ops->update_db_bp_intercept(vcpu); |
5881 | 5891 | ||
5882 | r = 0; | 5892 | r = 0; |
5883 | 5893 | ||
@@ -6023,7 +6033,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6023 | int r; | 6033 | int r; |
6024 | 6034 | ||
6025 | vcpu->arch.mtrr_state.have_fixed = 1; | 6035 | vcpu->arch.mtrr_state.have_fixed = 1; |
6026 | vcpu_load(vcpu); | 6036 | r = vcpu_load(vcpu); |
6037 | if (r) | ||
6038 | return r; | ||
6027 | r = kvm_arch_vcpu_reset(vcpu); | 6039 | r = kvm_arch_vcpu_reset(vcpu); |
6028 | if (r == 0) | 6040 | if (r == 0) |
6029 | r = kvm_mmu_setup(vcpu); | 6041 | r = kvm_mmu_setup(vcpu); |
@@ -6034,9 +6046,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6034 | 6046 | ||
6035 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 6047 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
6036 | { | 6048 | { |
6049 | int r; | ||
6037 | vcpu->arch.apf.msr_val = 0; | 6050 | vcpu->arch.apf.msr_val = 0; |
6038 | 6051 | ||
6039 | vcpu_load(vcpu); | 6052 | r = vcpu_load(vcpu); |
6053 | BUG_ON(r); | ||
6040 | kvm_mmu_unload(vcpu); | 6054 | kvm_mmu_unload(vcpu); |
6041 | vcpu_put(vcpu); | 6055 | vcpu_put(vcpu); |
6042 | 6056 | ||
@@ -6050,10 +6064,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
6050 | vcpu->arch.nmi_pending = 0; | 6064 | vcpu->arch.nmi_pending = 0; |
6051 | vcpu->arch.nmi_injected = false; | 6065 | vcpu->arch.nmi_injected = false; |
6052 | 6066 | ||
6053 | vcpu->arch.switch_db_regs = 0; | ||
6054 | memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); | 6067 | memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); |
6055 | vcpu->arch.dr6 = DR6_FIXED_1; | 6068 | vcpu->arch.dr6 = DR6_FIXED_1; |
6056 | vcpu->arch.dr7 = DR7_FIXED_1; | 6069 | vcpu->arch.dr7 = DR7_FIXED_1; |
6070 | kvm_update_dr7(vcpu); | ||
6057 | 6071 | ||
6058 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 6072 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
6059 | vcpu->arch.apf.msr_val = 0; | 6073 | vcpu->arch.apf.msr_val = 0; |
@@ -6132,7 +6146,7 @@ int kvm_arch_hardware_enable(void *garbage) | |||
6132 | * as we reset last_host_tsc on all VCPUs to stop this from being | 6146 | * as we reset last_host_tsc on all VCPUs to stop this from being |
6133 | * called multiple times (one for each physical CPU bringup). | 6147 | * called multiple times (one for each physical CPU bringup). |
6134 | * | 6148 | * |
6135 | * Platforms with unnreliable TSCs don't have to deal with this, they | 6149 | * Platforms with unreliable TSCs don't have to deal with this, they |
6136 | * will be compensated by the logic in vcpu_load, which sets the TSC to | 6150 | * will be compensated by the logic in vcpu_load, which sets the TSC to |
6137 | * catchup mode. This will catchup all VCPUs to real time, but cannot | 6151 | * catchup mode. This will catchup all VCPUs to real time, but cannot |
6138 | * guarantee that they stay in perfect synchronization. | 6152 | * guarantee that they stay in perfect synchronization. |
@@ -6185,6 +6199,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) | |||
6185 | return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); | 6199 | return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); |
6186 | } | 6200 | } |
6187 | 6201 | ||
6202 | struct static_key kvm_no_apic_vcpu __read_mostly; | ||
6203 | |||
6188 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | 6204 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) |
6189 | { | 6205 | { |
6190 | struct page *page; | 6206 | struct page *page; |
@@ -6217,7 +6233,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6217 | r = kvm_create_lapic(vcpu); | 6233 | r = kvm_create_lapic(vcpu); |
6218 | if (r < 0) | 6234 | if (r < 0) |
6219 | goto fail_mmu_destroy; | 6235 | goto fail_mmu_destroy; |
6220 | } | 6236 | } else |
6237 | static_key_slow_inc(&kvm_no_apic_vcpu); | ||
6221 | 6238 | ||
6222 | vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, | 6239 | vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, |
6223 | GFP_KERNEL); | 6240 | GFP_KERNEL); |
@@ -6257,6 +6274,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
6257 | kvm_mmu_destroy(vcpu); | 6274 | kvm_mmu_destroy(vcpu); |
6258 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | 6275 | srcu_read_unlock(&vcpu->kvm->srcu, idx); |
6259 | free_page((unsigned long)vcpu->arch.pio_data); | 6276 | free_page((unsigned long)vcpu->arch.pio_data); |
6277 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
6278 | static_key_slow_dec(&kvm_no_apic_vcpu); | ||
6260 | } | 6279 | } |
6261 | 6280 | ||
6262 | int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | 6281 | int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) |
@@ -6269,15 +6288,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | |||
6269 | 6288 | ||
6270 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 6289 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
6271 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); | 6290 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); |
6291 | /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ | ||
6292 | set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
6293 | &kvm->arch.irq_sources_bitmap); | ||
6272 | 6294 | ||
6273 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); | 6295 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); |
6296 | mutex_init(&kvm->arch.apic_map_lock); | ||
6274 | 6297 | ||
6275 | return 0; | 6298 | return 0; |
6276 | } | 6299 | } |
6277 | 6300 | ||
6278 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 6301 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
6279 | { | 6302 | { |
6280 | vcpu_load(vcpu); | 6303 | int r; |
6304 | r = vcpu_load(vcpu); | ||
6305 | BUG_ON(r); | ||
6281 | kvm_mmu_unload(vcpu); | 6306 | kvm_mmu_unload(vcpu); |
6282 | vcpu_put(vcpu); | 6307 | vcpu_put(vcpu); |
6283 | } | 6308 | } |
@@ -6321,6 +6346,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
6321 | put_page(kvm->arch.apic_access_page); | 6346 | put_page(kvm->arch.apic_access_page); |
6322 | if (kvm->arch.ept_identity_pagetable) | 6347 | if (kvm->arch.ept_identity_pagetable) |
6323 | put_page(kvm->arch.ept_identity_pagetable); | 6348 | put_page(kvm->arch.ept_identity_pagetable); |
6349 | kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); | ||
6324 | } | 6350 | } |
6325 | 6351 | ||
6326 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | 6352 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, |
@@ -6328,10 +6354,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, | |||
6328 | { | 6354 | { |
6329 | int i; | 6355 | int i; |
6330 | 6356 | ||
6331 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6357 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
6332 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { | 6358 | if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { |
6333 | kvm_kvfree(free->arch.lpage_info[i]); | 6359 | kvm_kvfree(free->arch.rmap[i]); |
6334 | free->arch.lpage_info[i] = NULL; | 6360 | free->arch.rmap[i] = NULL; |
6361 | } | ||
6362 | if (i == 0) | ||
6363 | continue; | ||
6364 | |||
6365 | if (!dont || free->arch.lpage_info[i - 1] != | ||
6366 | dont->arch.lpage_info[i - 1]) { | ||
6367 | kvm_kvfree(free->arch.lpage_info[i - 1]); | ||
6368 | free->arch.lpage_info[i - 1] = NULL; | ||
6335 | } | 6369 | } |
6336 | } | 6370 | } |
6337 | } | 6371 | } |
@@ -6340,23 +6374,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
6340 | { | 6374 | { |
6341 | int i; | 6375 | int i; |
6342 | 6376 | ||
6343 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6377 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
6344 | unsigned long ugfn; | 6378 | unsigned long ugfn; |
6345 | int lpages; | 6379 | int lpages; |
6346 | int level = i + 2; | 6380 | int level = i + 1; |
6347 | 6381 | ||
6348 | lpages = gfn_to_index(slot->base_gfn + npages - 1, | 6382 | lpages = gfn_to_index(slot->base_gfn + npages - 1, |
6349 | slot->base_gfn, level) + 1; | 6383 | slot->base_gfn, level) + 1; |
6350 | 6384 | ||
6351 | slot->arch.lpage_info[i] = | 6385 | slot->arch.rmap[i] = |
6352 | kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); | 6386 | kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); |
6353 | if (!slot->arch.lpage_info[i]) | 6387 | if (!slot->arch.rmap[i]) |
6388 | goto out_free; | ||
6389 | if (i == 0) | ||
6390 | continue; | ||
6391 | |||
6392 | slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * | ||
6393 | sizeof(*slot->arch.lpage_info[i - 1])); | ||
6394 | if (!slot->arch.lpage_info[i - 1]) | ||
6354 | goto out_free; | 6395 | goto out_free; |
6355 | 6396 | ||
6356 | if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) | 6397 | if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) |
6357 | slot->arch.lpage_info[i][0].write_count = 1; | 6398 | slot->arch.lpage_info[i - 1][0].write_count = 1; |
6358 | if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) | 6399 | if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) |
6359 | slot->arch.lpage_info[i][lpages - 1].write_count = 1; | 6400 | slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; |
6360 | ugfn = slot->userspace_addr >> PAGE_SHIFT; | 6401 | ugfn = slot->userspace_addr >> PAGE_SHIFT; |
6361 | /* | 6402 | /* |
6362 | * If the gfn and userspace address are not aligned wrt each | 6403 | * If the gfn and userspace address are not aligned wrt each |
@@ -6368,16 +6409,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
6368 | unsigned long j; | 6409 | unsigned long j; |
6369 | 6410 | ||
6370 | for (j = 0; j < lpages; ++j) | 6411 | for (j = 0; j < lpages; ++j) |
6371 | slot->arch.lpage_info[i][j].write_count = 1; | 6412 | slot->arch.lpage_info[i - 1][j].write_count = 1; |
6372 | } | 6413 | } |
6373 | } | 6414 | } |
6374 | 6415 | ||
6375 | return 0; | 6416 | return 0; |
6376 | 6417 | ||
6377 | out_free: | 6418 | out_free: |
6378 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6419 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
6379 | kvm_kvfree(slot->arch.lpage_info[i]); | 6420 | kvm_kvfree(slot->arch.rmap[i]); |
6380 | slot->arch.lpage_info[i] = NULL; | 6421 | slot->arch.rmap[i] = NULL; |
6422 | if (i == 0) | ||
6423 | continue; | ||
6424 | |||
6425 | kvm_kvfree(slot->arch.lpage_info[i - 1]); | ||
6426 | slot->arch.lpage_info[i - 1] = NULL; | ||
6381 | } | 6427 | } |
6382 | return -ENOMEM; | 6428 | return -ENOMEM; |
6383 | } | 6429 | } |
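
The allocation loop above sizes one rmap array per supported page size and, for i > 0, one lpage_info array at index i - 1. A standalone sketch of the sizing arithmetic, assuming the x86 convention of 9 gfn bits per paging level (KVM_HPAGE_GFN_SHIFT(level) == (level - 1) * 9); the slot geometry is an arbitrary example:

/*
 * Hedged sketch: how many rmap/lpage_info entries each page-size level
 * needs for a slot, mirroring gfn_to_index() as used above. The x86 layout
 * of 9 gfn bits per level is an assumption; values are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

#define KVM_NR_PAGE_SIZES	3			/* 4K, 2M, 1G on x86 */
#define KVM_HPAGE_GFN_SHIFT(l)	(((l) - 1) * 9)

static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	uint64_t base_gfn = 0x400;	/* example: 4 MiB slot at GPA 4 MiB */
	uint64_t npages = 1024;
	int i;

	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
		int level = i + 1;	/* matches the loop above */
		uint64_t lpages = gfn_to_index(base_gfn + npages - 1,
					       base_gfn, level) + 1;
		/* i == 0 sizes only rmap; i > 0 also sizes lpage_info[i - 1] */
		if (level == 1)
			assert(lpages == 1024);	/* one rmap entry per 4K page */
		else if (level == 2)
			assert(lpages == 2);	/* spans two 2M regions */
		else
			assert(lpages == 1);	/* fits in one 1G region */
	}
	return 0;
}
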
@@ -6396,10 +6442,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
6396 | map_flags = MAP_SHARED | MAP_ANONYMOUS; | 6442 | map_flags = MAP_SHARED | MAP_ANONYMOUS; |
6397 | 6443 | ||
6398 | /*To keep backward compatibility with older userspace, | 6444 | /*To keep backward compatibility with older userspace, |
6399 | *x86 needs to hanlde !user_alloc case. | 6445 | *x86 needs to handle !user_alloc case. |
6400 | */ | 6446 | */ |
6401 | if (!user_alloc) { | 6447 | if (!user_alloc) { |
6402 | if (npages && !old.rmap) { | 6448 | if (npages && !old.npages) { |
6403 | unsigned long userspace_addr; | 6449 | unsigned long userspace_addr; |
6404 | 6450 | ||
6405 | userspace_addr = vm_mmap(NULL, 0, | 6451 | userspace_addr = vm_mmap(NULL, 0, |
@@ -6427,7 +6473,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
6427 | 6473 | ||
6428 | int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; | 6474 | int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; |
6429 | 6475 | ||
6430 | if (!user_alloc && !old.user_alloc && old.rmap && !npages) { | 6476 | if (!user_alloc && !old.user_alloc && old.npages && !npages) { |
6431 | int ret; | 6477 | int ret; |
6432 | 6478 | ||
6433 | ret = vm_munmap(old.userspace_addr, | 6479 | ret = vm_munmap(old.userspace_addr, |
@@ -6446,14 +6492,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
6446 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 6492 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
6447 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 6493 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
6448 | spin_unlock(&kvm->mmu_lock); | 6494 | spin_unlock(&kvm->mmu_lock); |
6495 | /* | ||
6496 | * If a memory slot is created or moved, we need to clear all | ||
6497 | * mmio sptes. | ||
6498 | */ | ||
6499 | if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { | ||
6500 | kvm_mmu_zap_all(kvm); | ||
6501 | kvm_reload_remote_mmus(kvm); | ||
6502 | } | ||
6449 | } | 6503 | } |
6450 | 6504 | ||
6451 | void kvm_arch_flush_shadow(struct kvm *kvm) | 6505 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
6452 | { | 6506 | { |
6453 | kvm_mmu_zap_all(kvm); | 6507 | kvm_mmu_zap_all(kvm); |
6454 | kvm_reload_remote_mmus(kvm); | 6508 | kvm_reload_remote_mmus(kvm); |
6455 | } | 6509 | } |
6456 | 6510 | ||
6511 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | ||
6512 | struct kvm_memory_slot *slot) | ||
6513 | { | ||
6514 | kvm_arch_flush_shadow_all(kvm); | ||
6515 | } | ||
6516 | |||
6457 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 6517 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
6458 | { | 6518 | { |
6459 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && | 6519 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3d1134ddb885..2b5219c12ac8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
124 | 124 | ||
125 | extern u64 host_xcr0; | 125 | extern u64 host_xcr0; |
126 | 126 | ||
127 | extern struct static_key kvm_no_apic_vcpu; | ||
127 | #endif | 128 | #endif |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa7d3b3..0a6d6ba44c85 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
@@ -101,9 +101,13 @@ struct kvm_userspace_memory_region { | |||
101 | __u64 userspace_addr; /* start of the userspace allocated memory */ | 101 | __u64 userspace_addr; /* start of the userspace allocated memory */ |
102 | }; | 102 | }; |
103 | 103 | ||
104 | /* for kvm_memory_region::flags */ | 104 | /* |
105 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | 105 | * Bits 0 ~ 15 of kvm_memory_region::flags are visible to userspace; |
106 | #define KVM_MEMSLOT_INVALID (1UL << 1) | 106 | * the other bits are reserved for internal kvm use and are defined in |
107 | * include/linux/kvm_host.h. | ||
108 | */ | ||
109 | #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) | ||
110 | #define KVM_MEM_READONLY (1UL << 1) | ||
107 | 111 | ||
108 | /* for KVM_IRQ_LINE */ | 112 | /* for KVM_IRQ_LINE */ |
109 | struct kvm_irq_level { | 113 | struct kvm_irq_level { |
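
The new KVM_MEM_READONLY bit is passed through the same KVM_SET_USER_MEMORY_REGION ioctl as the dirty-logging flag, and guest writes to such a slot exit to userspace rather than being committed, which lets a VMM emulate ROM. A minimal sketch of registering a read-only slot, assuming a vm fd from KVM_CREATE_VM, a kvm fd from opening /dev/kvm, and that the header exposes KVM_CAP_READONLY_MEM (it is guarded by __KVM_HAVE_READONLY_MEM); the slot number is an arbitrary example:

/*
 * Hedged sketch: map anonymous host memory into the guest as a read-only
 * memory slot.
 */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static int add_readonly_slot(int kvm_fd, int vm_fd, __u64 gpa, __u64 size)
{
	struct kvm_userspace_memory_region region;
	void *mem;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM) <= 0)
		return -1;		/* kernel does not offer read-only slots */

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED)
		return -1;

	region.slot = 1;			/* arbitrary example slot */
	region.flags = KVM_MEM_READONLY;	/* guest writes exit to userspace */
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
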
@@ -618,6 +622,10 @@ struct kvm_ppc_smmu_info { | |||
618 | #define KVM_CAP_PPC_GET_SMMU_INFO 78 | 622 | #define KVM_CAP_PPC_GET_SMMU_INFO 78 |
619 | #define KVM_CAP_S390_COW 79 | 623 | #define KVM_CAP_S390_COW 79 |
620 | #define KVM_CAP_PPC_ALLOC_HTAB 80 | 624 | #define KVM_CAP_PPC_ALLOC_HTAB 80 |
625 | #ifdef __KVM_HAVE_READONLY_MEM | ||
626 | #define KVM_CAP_READONLY_MEM 81 | ||
627 | #endif | ||
628 | #define KVM_CAP_IRQFD_RESAMPLE 82 | ||
621 | 629 | ||
622 | #ifdef KVM_CAP_IRQ_ROUTING | 630 | #ifdef KVM_CAP_IRQ_ROUTING |
623 | 631 | ||
@@ -683,12 +691,21 @@ struct kvm_xen_hvm_config { | |||
683 | #endif | 691 | #endif |
684 | 692 | ||
685 | #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) | 693 | #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) |
694 | /* | ||
695 | * Available with KVM_CAP_IRQFD_RESAMPLE | ||
696 | * | ||
697 | * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies | ||
698 | * the irqfd to operate in resampling mode for level triggered interrupt | ||
699 | * emulation. See Documentation/virtual/kvm/api.txt. | ||
700 | */ | ||
701 | #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) | ||
686 | 702 | ||
687 | struct kvm_irqfd { | 703 | struct kvm_irqfd { |
688 | __u32 fd; | 704 | __u32 fd; |
689 | __u32 gsi; | 705 | __u32 gsi; |
690 | __u32 flags; | 706 | __u32 flags; |
691 | __u8 pad[20]; | 707 | __u32 resamplefd; |
708 | __u8 pad[16]; | ||
692 | }; | 709 | }; |
693 | 710 | ||
694 | struct kvm_clock_data { | 711 | struct kvm_clock_data { |
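
With the pad shrunk to make room for resamplefd, userspace opts into resampling by supplying a second eventfd alongside KVM_IRQFD_FLAG_RESAMPLE: the trigger eventfd asserts the GSI, and the resample eventfd is signalled when the guest acknowledges it. A minimal sketch of wiring one up, assuming a vm fd from KVM_CREATE_VM, a kernel advertising KVM_CAP_IRQFD_RESAMPLE, and an arbitrary GSI:

/*
 * Hedged sketch: register a resampling irqfd for a level-triggered
 * interrupt.
 */
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int add_resample_irqfd(int vm_fd, __u32 gsi,
			      int *trigger_fd, int *resample_fd)
{
	struct kvm_irqfd irqfd;

	*trigger_fd = eventfd(0, EFD_CLOEXEC);	/* VMM writes here to assert */
	*resample_fd = eventfd(0, EFD_CLOEXEC);	/* KVM signals here on EOI */
	if (*trigger_fd < 0 || *resample_fd < 0)
		return -1;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = *trigger_fd;
	irqfd.gsi = gsi;
	irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
	irqfd.resamplefd = *resample_fd;

	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}
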
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8a59e0abe5fa..93bfc9f9815c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | #include <linux/rcupdate.h> | 22 | #include <linux/rcupdate.h> |
23 | #include <linux/ratelimit.h> | 23 | #include <linux/ratelimit.h> |
24 | #include <linux/err.h> | ||
24 | #include <asm/signal.h> | 25 | #include <asm/signal.h> |
25 | 26 | ||
26 | #include <linux/kvm.h> | 27 | #include <linux/kvm.h> |
@@ -35,6 +36,13 @@ | |||
35 | #endif | 36 | #endif |
36 | 37 | ||
37 | /* | 38 | /* |
39 | * Bits 16 ~ 31 of kvm_memory_region::flags are used internally by kvm; | ||
40 | * the other bits are visible to userspace and are defined in | ||
41 | * include/linux/kvm.h. | ||
42 | */ | ||
43 | #define KVM_MEMSLOT_INVALID (1UL << 16) | ||
44 | |||
45 | /* | ||
38 | * If we support unaligned MMIO, at most one fragment will be split into two: | 46 | * If we support unaligned MMIO, at most one fragment will be split into two: |
39 | */ | 47 | */ |
40 | #ifdef KVM_UNALIGNED_MMIO | 48 | #ifdef KVM_UNALIGNED_MMIO |
@@ -49,6 +57,47 @@ | |||
49 | (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) | 57 | (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) |
50 | 58 | ||
51 | /* | 59 | /* |
60 | * For a normal pfn, the highest 12 bits should be zero, | ||
61 | * so we can mask these bits to indicate the error. | ||
62 | */ | ||
63 | #define KVM_PFN_ERR_MASK (0xfffULL << 52) | ||
64 | |||
65 | #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) | ||
66 | #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) | ||
67 | #define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) | ||
68 | #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3) | ||
69 | |||
70 | static inline bool is_error_pfn(pfn_t pfn) | ||
71 | { | ||
72 | return !!(pfn & KVM_PFN_ERR_MASK); | ||
73 | } | ||
74 | |||
75 | static inline bool is_noslot_pfn(pfn_t pfn) | ||
76 | { | ||
77 | return pfn == KVM_PFN_ERR_BAD; | ||
78 | } | ||
79 | |||
80 | static inline bool is_invalid_pfn(pfn_t pfn) | ||
81 | { | ||
82 | return !is_noslot_pfn(pfn) && is_error_pfn(pfn); | ||
83 | } | ||
84 | |||
85 | #define KVM_HVA_ERR_BAD (PAGE_OFFSET) | ||
86 | #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) | ||
87 | |||
88 | static inline bool kvm_is_error_hva(unsigned long addr) | ||
89 | { | ||
90 | return addr >= PAGE_OFFSET; | ||
91 | } | ||
92 | |||
93 | #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) | ||
94 | |||
95 | static inline bool is_error_page(struct page *page) | ||
96 | { | ||
97 | return IS_ERR(page); | ||
98 | } | ||
99 | |||
100 | /* | ||
52 | * vcpu->requests bit members | 101 | * vcpu->requests bit members |
53 | */ | 102 | */ |
54 | #define KVM_REQ_TLB_FLUSH 0 | 103 | #define KVM_REQ_TLB_FLUSH 0 |
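
The new error encoding relies on the top 12 bits of any valid pfn being zero, so a single mask test separates real frames from error cookies, with one dedicated value marking the no-slot case. A standalone sketch of the same arithmetic, with the kernel's pfn_t stood in by a plain 64-bit integer:

/*
 * Hedged sketch: the KVM_PFN_ERR_* encoding reproduced outside the kernel
 * so the mask test can be checked in isolation.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t pfn_t;			/* stand-in for the kernel's pfn_t */

#define KVM_PFN_ERR_MASK	(0xfffULL << 52)
#define KVM_PFN_ERR_FAULT	(KVM_PFN_ERR_MASK)
#define KVM_PFN_ERR_BAD		(KVM_PFN_ERR_MASK + 2)

static bool is_error_pfn(pfn_t pfn)  { return pfn & KVM_PFN_ERR_MASK; }
static bool is_noslot_pfn(pfn_t pfn) { return pfn == KVM_PFN_ERR_BAD; }

int main(void)
{
	pfn_t normal = 0x12345;		/* any frame number below 2^52 */

	assert(!is_error_pfn(normal));
	assert(is_error_pfn(KVM_PFN_ERR_FAULT));
	assert(is_noslot_pfn(KVM_PFN_ERR_BAD) && is_error_pfn(KVM_PFN_ERR_BAD));
	return 0;
}
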
@@ -70,7 +119,8 @@ | |||
70 | #define KVM_REQ_PMU 16 | 119 | #define KVM_REQ_PMU 16 |
71 | #define KVM_REQ_PMI 17 | 120 | #define KVM_REQ_PMI 17 |
72 | 121 | ||
73 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 | 122 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 |
123 | #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 | ||
74 | 124 | ||
75 | struct kvm; | 125 | struct kvm; |
76 | struct kvm_vcpu; | 126 | struct kvm_vcpu; |
@@ -183,6 +233,18 @@ struct kvm_vcpu { | |||
183 | } async_pf; | 233 | } async_pf; |
184 | #endif | 234 | #endif |
185 | 235 | ||
236 | #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT | ||
237 | /* | ||
238 | * Cpu relax intercept or pause loop exit optimization | ||
239 | * in_spin_loop: set when a vcpu does a pause loop exit | ||
240 | * or has its cpu relax intercepted. | ||
241 | * dy_eligible: indicates whether vcpu is eligible for directed yield. | ||
242 | */ | ||
243 | struct { | ||
244 | bool in_spin_loop; | ||
245 | bool dy_eligible; | ||
246 | } spin_loop; | ||
247 | #endif | ||
186 | struct kvm_vcpu_arch arch; | 248 | struct kvm_vcpu_arch arch; |
187 | }; | 249 | }; |
188 | 250 | ||
@@ -201,7 +263,6 @@ struct kvm_memory_slot { | |||
201 | gfn_t base_gfn; | 263 | gfn_t base_gfn; |
202 | unsigned long npages; | 264 | unsigned long npages; |
203 | unsigned long flags; | 265 | unsigned long flags; |
204 | unsigned long *rmap; | ||
205 | unsigned long *dirty_bitmap; | 266 | unsigned long *dirty_bitmap; |
206 | struct kvm_arch_memory_slot arch; | 267 | struct kvm_arch_memory_slot arch; |
207 | unsigned long userspace_addr; | 268 | unsigned long userspace_addr; |
@@ -283,6 +344,8 @@ struct kvm { | |||
283 | struct { | 344 | struct { |
284 | spinlock_t lock; | 345 | spinlock_t lock; |
285 | struct list_head items; | 346 | struct list_head items; |
347 | struct list_head resampler_list; | ||
348 | struct mutex resampler_lock; | ||
286 | } irqfds; | 349 | } irqfds; |
287 | struct list_head ioeventfds; | 350 | struct list_head ioeventfds; |
288 | #endif | 351 | #endif |
@@ -348,7 +411,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) | |||
348 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | 411 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); |
349 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | 412 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); |
350 | 413 | ||
351 | void vcpu_load(struct kvm_vcpu *vcpu); | 414 | int __must_check vcpu_load(struct kvm_vcpu *vcpu); |
352 | void vcpu_put(struct kvm_vcpu *vcpu); | 415 | void vcpu_put(struct kvm_vcpu *vcpu); |
353 | 416 | ||
354 | int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | 417 | int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, |
@@ -378,23 +441,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) | |||
378 | return slot; | 441 | return slot; |
379 | } | 442 | } |
380 | 443 | ||
381 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
382 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
383 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
384 | |||
385 | extern struct page *bad_page; | ||
386 | extern struct page *fault_page; | ||
387 | |||
388 | extern pfn_t bad_pfn; | ||
389 | extern pfn_t fault_pfn; | ||
390 | |||
391 | int is_error_page(struct page *page); | ||
392 | int is_error_pfn(pfn_t pfn); | ||
393 | int is_hwpoison_pfn(pfn_t pfn); | ||
394 | int is_fault_pfn(pfn_t pfn); | ||
395 | int is_noslot_pfn(pfn_t pfn); | ||
396 | int is_invalid_pfn(pfn_t pfn); | ||
397 | int kvm_is_error_hva(unsigned long addr); | ||
398 | int kvm_set_memory_region(struct kvm *kvm, | 444 | int kvm_set_memory_region(struct kvm *kvm, |
399 | struct kvm_userspace_memory_region *mem, | 445 | struct kvm_userspace_memory_region *mem, |
400 | int user_alloc); | 446 | int user_alloc); |
@@ -415,28 +461,33 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
415 | int user_alloc); | 461 | int user_alloc); |
416 | bool kvm_largepages_enabled(void); | 462 | bool kvm_largepages_enabled(void); |
417 | void kvm_disable_largepages(void); | 463 | void kvm_disable_largepages(void); |
418 | void kvm_arch_flush_shadow(struct kvm *kvm); | 464 | /* flush all memory translations */ |
465 | void kvm_arch_flush_shadow_all(struct kvm *kvm); | ||
466 | /* flush memory translations pointing to 'slot' */ | ||
467 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | ||
468 | struct kvm_memory_slot *slot); | ||
419 | 469 | ||
420 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | 470 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, |
421 | int nr_pages); | 471 | int nr_pages); |
422 | 472 | ||
423 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | 473 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); |
424 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); | 474 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); |
475 | unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); | ||
425 | void kvm_release_page_clean(struct page *page); | 476 | void kvm_release_page_clean(struct page *page); |
426 | void kvm_release_page_dirty(struct page *page); | 477 | void kvm_release_page_dirty(struct page *page); |
427 | void kvm_set_page_dirty(struct page *page); | 478 | void kvm_set_page_dirty(struct page *page); |
428 | void kvm_set_page_accessed(struct page *page); | 479 | void kvm_set_page_accessed(struct page *page); |
429 | 480 | ||
430 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); | ||
431 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); | 481 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); |
432 | pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, | 482 | pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, |
433 | bool write_fault, bool *writable); | 483 | bool write_fault, bool *writable); |
434 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); | 484 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); |
435 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | 485 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, |
436 | bool *writable); | 486 | bool *writable); |
437 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 487 | pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); |
438 | struct kvm_memory_slot *slot, gfn_t gfn); | 488 | pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); |
439 | void kvm_release_pfn_dirty(pfn_t); | 489 | |
490 | void kvm_release_pfn_dirty(pfn_t pfn); | ||
440 | void kvm_release_pfn_clean(pfn_t pfn); | 491 | void kvm_release_pfn_clean(pfn_t pfn); |
441 | void kvm_set_pfn_dirty(pfn_t pfn); | 492 | void kvm_set_pfn_dirty(pfn_t pfn); |
442 | void kvm_set_pfn_accessed(pfn_t pfn); | 493 | void kvm_set_pfn_accessed(pfn_t pfn); |
@@ -494,6 +545,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | |||
494 | struct | 545 | struct |
495 | kvm_userspace_memory_region *mem, | 546 | kvm_userspace_memory_region *mem, |
496 | int user_alloc); | 547 | int user_alloc); |
548 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); | ||
497 | long kvm_arch_vm_ioctl(struct file *filp, | 549 | long kvm_arch_vm_ioctl(struct file *filp, |
498 | unsigned int ioctl, unsigned long arg); | 550 | unsigned int ioctl, unsigned long arg); |
499 | 551 | ||
@@ -573,7 +625,7 @@ void kvm_arch_sync_events(struct kvm *kvm); | |||
573 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); | 625 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); |
574 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | 626 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); |
575 | 627 | ||
576 | int kvm_is_mmio_pfn(pfn_t pfn); | 628 | bool kvm_is_mmio_pfn(pfn_t pfn); |
577 | 629 | ||
578 | struct kvm_irq_ack_notifier { | 630 | struct kvm_irq_ack_notifier { |
579 | struct hlist_node link; | 631 | struct hlist_node link; |
@@ -728,6 +780,12 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) | |||
728 | return search_memslots(slots, gfn); | 780 | return search_memslots(slots, gfn); |
729 | } | 781 | } |
730 | 782 | ||
783 | static inline unsigned long | ||
784 | __gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) | ||
785 | { | ||
786 | return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; | ||
787 | } | ||
788 | |||
731 | static inline int memslot_id(struct kvm *kvm, gfn_t gfn) | 789 | static inline int memslot_id(struct kvm *kvm, gfn_t gfn) |
732 | { | 790 | { |
733 | return gfn_to_memslot(kvm, gfn)->id; | 791 | return gfn_to_memslot(kvm, gfn)->id; |
@@ -740,10 +798,12 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) | |||
740 | (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 798 | (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
741 | } | 799 | } |
742 | 800 | ||
743 | static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, | 801 | static inline gfn_t |
744 | gfn_t gfn) | 802 | hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) |
745 | { | 803 | { |
746 | return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; | 804 | gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; |
805 | |||
806 | return slot->base_gfn + gfn_offset; | ||
747 | } | 807 | } |
748 | 808 | ||
749 | static inline gpa_t gfn_to_gpa(gfn_t gfn) | 809 | static inline gpa_t gfn_to_gpa(gfn_t gfn) |
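
__gfn_to_hva_memslot() and the new hva_to_gfn_memslot() are inverses over a slot's range: one scales a gfn offset up to a host virtual address, the other scales an hva offset back down to a gfn. A standalone sketch of that arithmetic with made-up slot values, assuming a 64-bit host:

/*
 * Hedged sketch: gfn <-> hva conversion within one memslot, using the same
 * arithmetic as the helpers above. The slot values are arbitrary examples.
 */
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct memslot {
	uint64_t base_gfn;		/* first guest frame in the slot */
	unsigned long userspace_addr;	/* host VA backing base_gfn */
};

static unsigned long gfn_to_hva(const struct memslot *s, uint64_t gfn)
{
	return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
}

static uint64_t hva_to_gfn(const struct memslot *s, unsigned long hva)
{
	return s->base_gfn + ((hva - s->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot s = { .base_gfn = 0x100,
			     .userspace_addr = 0x7f0000000000UL };

	assert(gfn_to_hva(&s, 0x105) == 0x7f0000000000UL + 5 * PAGE_SIZE);
	assert(hva_to_gfn(&s, gfn_to_hva(&s, 0x105)) == 0x105);
	return 0;
}
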
@@ -899,5 +959,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) | |||
899 | } | 959 | } |
900 | } | 960 | } |
901 | 961 | ||
962 | #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT | ||
963 | |||
964 | static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) | ||
965 | { | ||
966 | vcpu->spin_loop.in_spin_loop = val; | ||
967 | } | ||
968 | static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) | ||
969 | { | ||
970 | vcpu->spin_loop.dy_eligible = val; | ||
971 | } | ||
972 | |||
973 | #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ | ||
974 | |||
975 | static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) | ||
976 | { | ||
977 | } | ||
978 | |||
979 | static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) | ||
980 | { | ||
981 | } | ||
982 | |||
983 | static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) | ||
984 | { | ||
985 | return true; | ||
986 | } | ||
987 | |||
988 | #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ | ||
902 | #endif | 989 | #endif |
903 | 990 | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, | |||
118 | key->timeout = rl; | 118 | key->timeout = rl; |
119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | 119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); |
120 | } | 120 | } |
121 | EXPORT_SYMBOL_GPL(jump_label_rate_limit); | ||
121 | 122 | ||
122 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 123 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
123 | { | 124 | { |
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 28694f4a9139..d01b24b72c61 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig | |||
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF | |||
21 | 21 | ||
22 | config HAVE_KVM_MSI | 22 | config HAVE_KVM_MSI |
23 | bool | 23 | bool |
24 | |||
25 | config HAVE_KVM_CPU_RELAX_INTERCEPT | ||
26 | bool | ||
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 74268b4c2ee1..ea475cd03511 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) | |||
111 | list_entry(vcpu->async_pf.done.next, | 111 | list_entry(vcpu->async_pf.done.next, |
112 | typeof(*work), link); | 112 | typeof(*work), link); |
113 | list_del(&work->link); | 113 | list_del(&work->link); |
114 | if (work->page) | 114 | if (!is_error_page(work->page)) |
115 | put_page(work->page); | 115 | kvm_release_page_clean(work->page); |
116 | kmem_cache_free(async_pf_cache, work); | 116 | kmem_cache_free(async_pf_cache, work); |
117 | } | 117 | } |
118 | spin_unlock(&vcpu->async_pf.lock); | 118 | spin_unlock(&vcpu->async_pf.lock); |
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) | |||
138 | 138 | ||
139 | list_del(&work->queue); | 139 | list_del(&work->queue); |
140 | vcpu->async_pf.queued--; | 140 | vcpu->async_pf.queued--; |
141 | if (work->page) | 141 | if (!is_error_page(work->page)) |
142 | put_page(work->page); | 142 | kvm_release_page_clean(work->page); |
143 | kmem_cache_free(async_pf_cache, work); | 143 | kmem_cache_free(async_pf_cache, work); |
144 | } | 144 | } |
145 | } | 145 | } |
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) | |||
203 | if (!work) | 203 | if (!work) |
204 | return -ENOMEM; | 204 | return -ENOMEM; |
205 | 205 | ||
206 | work->page = bad_page; | 206 | work->page = KVM_ERR_PTR_BAD_PAGE; |
207 | get_page(bad_page); | ||
208 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ | 207 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ |
209 | 208 | ||
210 | spin_lock(&vcpu->async_pf.lock); | 209 | spin_lock(&vcpu->async_pf.lock); |
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 67a35e90384c..9718e98d6d2a 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c | |||
@@ -43,6 +43,31 @@ | |||
43 | * -------------------------------------------------------------------- | 43 | * -------------------------------------------------------------------- |
44 | */ | 44 | */ |
45 | 45 | ||
46 | /* | ||
47 | * Resampling irqfds are a special variety of irqfds used to emulate | ||
48 | * level triggered interrupts. The interrupt is asserted on eventfd | ||
49 | * trigger. On acknowledgement through the irq ack notifier, the | ||
50 | * interrupt is de-asserted and userspace is notified through the | ||
51 | * resamplefd. All resamplers on the same gsi are de-asserted | ||
52 | * together, so we don't need to track the state of each individual | ||
53 | * user. We can also therefore share the same irq source ID. | ||
54 | */ | ||
55 | struct _irqfd_resampler { | ||
56 | struct kvm *kvm; | ||
57 | /* | ||
58 | * List of resampling struct _irqfd objects sharing this gsi. | ||
59 | * RCU list modified under kvm->irqfds.resampler_lock | ||
60 | */ | ||
61 | struct list_head list; | ||
62 | struct kvm_irq_ack_notifier notifier; | ||
63 | /* | ||
64 | * Entry in kvm->irqfds.resampler_list. Used for sharing | ||
65 | * resamplers among irqfds on the same gsi. | ||
66 | * Accessed and modified under kvm->irqfds.resampler_lock | ||
67 | */ | ||
68 | struct list_head link; | ||
69 | }; | ||
70 | |||
46 | struct _irqfd { | 71 | struct _irqfd { |
47 | /* Used for MSI fast-path */ | 72 | /* Used for MSI fast-path */ |
48 | struct kvm *kvm; | 73 | struct kvm *kvm; |
@@ -52,6 +77,12 @@ struct _irqfd { | |||
52 | /* Used for level IRQ fast-path */ | 77 | /* Used for level IRQ fast-path */ |
53 | int gsi; | 78 | int gsi; |
54 | struct work_struct inject; | 79 | struct work_struct inject; |
80 | /* The resampler used by this irqfd (resampler-only) */ | ||
81 | struct _irqfd_resampler *resampler; | ||
82 | /* Eventfd notified on resample (resampler-only) */ | ||
83 | struct eventfd_ctx *resamplefd; | ||
84 | /* Entry in list of irqfds for a resampler (resampler-only) */ | ||
85 | struct list_head resampler_link; | ||
55 | /* Used for setup/shutdown */ | 86 | /* Used for setup/shutdown */ |
56 | struct eventfd_ctx *eventfd; | 87 | struct eventfd_ctx *eventfd; |
57 | struct list_head list; | 88 | struct list_head list; |
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work) | |||
67 | struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); | 98 | struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); |
68 | struct kvm *kvm = irqfd->kvm; | 99 | struct kvm *kvm = irqfd->kvm; |
69 | 100 | ||
70 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); | 101 | if (!irqfd->resampler) { |
71 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); | 102 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); |
103 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); | ||
104 | } else | ||
105 | kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
106 | irqfd->gsi, 1); | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * Since resampler irqfds share an IRQ source ID, we de-assert once | ||
111 | * then notify all of the resampler irqfds using this GSI. We can't | ||
112 | * do multiple de-asserts or we risk racing with incoming re-asserts. | ||
113 | */ | ||
114 | static void | ||
115 | irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) | ||
116 | { | ||
117 | struct _irqfd_resampler *resampler; | ||
118 | struct _irqfd *irqfd; | ||
119 | |||
120 | resampler = container_of(kian, struct _irqfd_resampler, notifier); | ||
121 | |||
122 | kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
123 | resampler->notifier.gsi, 0); | ||
124 | |||
125 | rcu_read_lock(); | ||
126 | |||
127 | list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) | ||
128 | eventfd_signal(irqfd->resamplefd, 1); | ||
129 | |||
130 | rcu_read_unlock(); | ||
131 | } | ||
132 | |||
133 | static void | ||
134 | irqfd_resampler_shutdown(struct _irqfd *irqfd) | ||
135 | { | ||
136 | struct _irqfd_resampler *resampler = irqfd->resampler; | ||
137 | struct kvm *kvm = resampler->kvm; | ||
138 | |||
139 | mutex_lock(&kvm->irqfds.resampler_lock); | ||
140 | |||
141 | list_del_rcu(&irqfd->resampler_link); | ||
142 | synchronize_rcu(); | ||
143 | |||
144 | if (list_empty(&resampler->list)) { | ||
145 | list_del(&resampler->link); | ||
146 | kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); | ||
147 | kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
148 | resampler->notifier.gsi, 0); | ||
149 | kfree(resampler); | ||
150 | } | ||
151 | |||
152 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
72 | } | 153 | } |
73 | 154 | ||
74 | /* | 155 | /* |
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work) | |||
92 | */ | 173 | */ |
93 | flush_work(&irqfd->inject); | 174 | flush_work(&irqfd->inject); |
94 | 175 | ||
176 | if (irqfd->resampler) { | ||
177 | irqfd_resampler_shutdown(irqfd); | ||
178 | eventfd_ctx_put(irqfd->resamplefd); | ||
179 | } | ||
180 | |||
95 | /* | 181 | /* |
96 | * It is now safe to release the object's resources | 182 | * It is now safe to release the object's resources |
97 | */ | 183 | */ |
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
203 | struct kvm_irq_routing_table *irq_rt; | 289 | struct kvm_irq_routing_table *irq_rt; |
204 | struct _irqfd *irqfd, *tmp; | 290 | struct _irqfd *irqfd, *tmp; |
205 | struct file *file = NULL; | 291 | struct file *file = NULL; |
206 | struct eventfd_ctx *eventfd = NULL; | 292 | struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; |
207 | int ret; | 293 | int ret; |
208 | unsigned int events; | 294 | unsigned int events; |
209 | 295 | ||
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
231 | 317 | ||
232 | irqfd->eventfd = eventfd; | 318 | irqfd->eventfd = eventfd; |
233 | 319 | ||
320 | if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { | ||
321 | struct _irqfd_resampler *resampler; | ||
322 | |||
323 | resamplefd = eventfd_ctx_fdget(args->resamplefd); | ||
324 | if (IS_ERR(resamplefd)) { | ||
325 | ret = PTR_ERR(resamplefd); | ||
326 | goto fail; | ||
327 | } | ||
328 | |||
329 | irqfd->resamplefd = resamplefd; | ||
330 | INIT_LIST_HEAD(&irqfd->resampler_link); | ||
331 | |||
332 | mutex_lock(&kvm->irqfds.resampler_lock); | ||
333 | |||
334 | list_for_each_entry(resampler, | ||
335 | &kvm->irqfds.resampler_list, list) { | ||
336 | if (resampler->notifier.gsi == irqfd->gsi) { | ||
337 | irqfd->resampler = resampler; | ||
338 | break; | ||
339 | } | ||
340 | } | ||
341 | |||
342 | if (!irqfd->resampler) { | ||
343 | resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); | ||
344 | if (!resampler) { | ||
345 | ret = -ENOMEM; | ||
346 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
347 | goto fail; | ||
348 | } | ||
349 | |||
350 | resampler->kvm = kvm; | ||
351 | INIT_LIST_HEAD(&resampler->list); | ||
352 | resampler->notifier.gsi = irqfd->gsi; | ||
353 | resampler->notifier.irq_acked = irqfd_resampler_ack; | ||
354 | INIT_LIST_HEAD(&resampler->link); | ||
355 | |||
356 | list_add(&resampler->link, &kvm->irqfds.resampler_list); | ||
357 | kvm_register_irq_ack_notifier(kvm, | ||
358 | &resampler->notifier); | ||
359 | irqfd->resampler = resampler; | ||
360 | } | ||
361 | |||
362 | list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); | ||
363 | synchronize_rcu(); | ||
364 | |||
365 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
366 | } | ||
367 | |||
234 | /* | 368 | /* |
235 | * Install our own custom wake-up handling so we are notified via | 369 | * Install our own custom wake-up handling so we are notified via |
236 | * a callback whenever someone signals the underlying eventfd | 370 | * a callback whenever someone signals the underlying eventfd |
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
276 | return 0; | 410 | return 0; |
277 | 411 | ||
278 | fail: | 412 | fail: |
413 | if (irqfd->resampler) | ||
414 | irqfd_resampler_shutdown(irqfd); | ||
415 | |||
416 | if (resamplefd && !IS_ERR(resamplefd)) | ||
417 | eventfd_ctx_put(resamplefd); | ||
418 | |||
279 | if (eventfd && !IS_ERR(eventfd)) | 419 | if (eventfd && !IS_ERR(eventfd)) |
280 | eventfd_ctx_put(eventfd); | 420 | eventfd_ctx_put(eventfd); |
281 | 421 | ||
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm) | |||
291 | { | 431 | { |
292 | spin_lock_init(&kvm->irqfds.lock); | 432 | spin_lock_init(&kvm->irqfds.lock); |
293 | INIT_LIST_HEAD(&kvm->irqfds.items); | 433 | INIT_LIST_HEAD(&kvm->irqfds.items); |
434 | INIT_LIST_HEAD(&kvm->irqfds.resampler_list); | ||
435 | mutex_init(&kvm->irqfds.resampler_lock); | ||
294 | INIT_LIST_HEAD(&kvm->ioeventfds); | 436 | INIT_LIST_HEAD(&kvm->ioeventfds); |
295 | } | 437 | } |
296 | 438 | ||
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) | |||
340 | int | 482 | int |
341 | kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) | 483 | kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) |
342 | { | 484 | { |
343 | if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN) | 485 | if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) |
344 | return -EINVAL; | 486 | return -EINVAL; |
345 | 487 | ||
346 | if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) | 488 | if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) |
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ef61d529a6c4..cfb7e4d52dc2 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, | |||
197 | u32 old_irr; | 197 | u32 old_irr; |
198 | u32 mask = 1 << irq; | 198 | u32 mask = 1 << irq; |
199 | union kvm_ioapic_redirect_entry entry; | 199 | union kvm_ioapic_redirect_entry entry; |
200 | int ret = 1; | 200 | int ret, irq_level; |
201 | |||
202 | BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); | ||
201 | 203 | ||
202 | spin_lock(&ioapic->lock); | 204 | spin_lock(&ioapic->lock); |
203 | old_irr = ioapic->irr; | 205 | old_irr = ioapic->irr; |
204 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | 206 | irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], |
205 | int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], | 207 | irq_source_id, level); |
206 | irq_source_id, level); | 208 | entry = ioapic->redirtbl[irq]; |
207 | entry = ioapic->redirtbl[irq]; | 209 | irq_level ^= entry.fields.polarity; |
208 | irq_level ^= entry.fields.polarity; | 210 | if (!irq_level) { |
209 | if (!irq_level) | 211 | ioapic->irr &= ~mask; |
210 | ioapic->irr &= ~mask; | 212 | ret = 1; |
211 | else { | 213 | } else { |
212 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); | 214 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); |
213 | ioapic->irr |= mask; | 215 | ioapic->irr |= mask; |
214 | if ((edge && old_irr != ioapic->irr) || | 216 | if ((edge && old_irr != ioapic->irr) || |
215 | (!edge && !entry.fields.remote_irr)) | 217 | (!edge && !entry.fields.remote_irr)) |
216 | ret = ioapic_service(ioapic, irq); | 218 | ret = ioapic_service(ioapic, irq); |
217 | else | 219 | else |
218 | ret = 0; /* report coalesced interrupt */ | 220 | ret = 0; /* report coalesced interrupt */ |
219 | } | ||
220 | trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); | ||
221 | } | 221 | } |
222 | trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); | ||
222 | spin_unlock(&ioapic->lock); | 223 | spin_unlock(&ioapic->lock); |
223 | 224 | ||
224 | return ret; | 225 | return ret; |
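The rewritten kvm_ioapic_set_irq() above keeps the same delivery decision but applies it unconditionally now that the pin range is validated up front with BUG_ON. As a plain restatement of that decision, the following stand-alone sketch (a hypothetical model, not kernel code) returns 0 when the interrupt would be reported as coalesced and 1 when the IOAPIC would either clear IRR or attempt delivery via ioapic_service():

#include <stdbool.h>

/* Model of the delivery decision in kvm_ioapic_set_irq(); the outcome of
 * ioapic_service() itself (it may still coalesce at the LAPIC) is not
 * modeled here. */
int ioapic_set_irq_model(unsigned int *irr, unsigned int mask,
			 bool edge, bool remote_irr, int irq_level)
{
	unsigned int old_irr = *irr;

	if (!irq_level) {			/* line de-asserted: clear IRR */
		*irr &= ~mask;
		return 1;
	}

	*irr |= mask;
	if ((edge && old_irr != *irr) ||	/* edge: only on a 0 -> 1 change  */
	    (!edge && !remote_irr))		/* level: only if not yet pending */
		return 1;			/* would call ioapic_service()    */

	return 0;				/* report coalesced interrupt     */
}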
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e9fff9830bf0..037cb6730e68 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c | |||
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); | |||
42 | static void kvm_iommu_put_pages(struct kvm *kvm, | 42 | static void kvm_iommu_put_pages(struct kvm *kvm, |
43 | gfn_t base_gfn, unsigned long npages); | 43 | gfn_t base_gfn, unsigned long npages); |
44 | 44 | ||
45 | static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, | 45 | static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, |
46 | gfn_t gfn, unsigned long size) | 46 | unsigned long size) |
47 | { | 47 | { |
48 | gfn_t end_gfn; | 48 | gfn_t end_gfn; |
49 | pfn_t pfn; | 49 | pfn_t pfn; |
50 | 50 | ||
51 | pfn = gfn_to_pfn_memslot(kvm, slot, gfn); | 51 | pfn = gfn_to_pfn_memslot(slot, gfn); |
52 | end_gfn = gfn + (size >> PAGE_SHIFT); | 52 | end_gfn = gfn + (size >> PAGE_SHIFT); |
53 | gfn += 1; | 53 | gfn += 1; |
54 | 54 | ||
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, | |||
56 | return pfn; | 56 | return pfn; |
57 | 57 | ||
58 | while (gfn < end_gfn) | 58 | while (gfn < end_gfn) |
59 | gfn_to_pfn_memslot(kvm, slot, gfn++); | 59 | gfn_to_pfn_memslot(slot, gfn++); |
60 | 60 | ||
61 | return pfn; | 61 | return pfn; |
62 | } | 62 | } |
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) | |||
105 | * Pin all pages we are about to map in memory. This is | 105 | * Pin all pages we are about to map in memory. This is |
106 | * important because we unmap and unpin in 4kb steps later. | 106 | * important because we unmap and unpin in 4kb steps later. |
107 | */ | 107 | */ |
108 | pfn = kvm_pin_pages(kvm, slot, gfn, page_size); | 108 | pfn = kvm_pin_pages(slot, gfn, page_size); |
109 | if (is_error_pfn(pfn)) { | 109 | if (is_error_pfn(pfn)) { |
110 | gfn += 1; | 110 | gfn += 1; |
111 | continue; | 111 | continue; |
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm, | |||
300 | 300 | ||
301 | /* Get physical address */ | 301 | /* Get physical address */ |
302 | phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); | 302 | phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); |
303 | |||
304 | if (!phys) { | ||
305 | gfn++; | ||
306 | continue; | ||
307 | } | ||
308 | |||
303 | pfn = phys >> PAGE_SHIFT; | 309 | pfn = phys >> PAGE_SHIFT; |
304 | 310 | ||
305 | /* Unmap address from IO address space */ | 311 | /* Unmap address from IO address space */ |
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 83402d74a767..2eb58af7ee99 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c | |||
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, | |||
68 | struct kvm_vcpu *vcpu, *lowest = NULL; | 68 | struct kvm_vcpu *vcpu, *lowest = NULL; |
69 | 69 | ||
70 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && | 70 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && |
71 | kvm_is_dm_lowest_prio(irq)) | 71 | kvm_is_dm_lowest_prio(irq)) { |
72 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); | 72 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); |
73 | irq->delivery_mode = APIC_DM_FIXED; | ||
74 | } | ||
75 | |||
76 | if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r)) | ||
77 | return r; | ||
73 | 78 | ||
74 | kvm_for_each_vcpu(i, vcpu, kvm) { | 79 | kvm_for_each_vcpu(i, vcpu, kvm) { |
75 | if (!kvm_apic_present(vcpu)) | 80 | if (!kvm_apic_present(vcpu)) |
@@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm) | |||
223 | } | 228 | } |
224 | 229 | ||
225 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 230 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
231 | #ifdef CONFIG_X86 | ||
232 | ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); | ||
233 | #endif | ||
226 | set_bit(irq_source_id, bitmap); | 234 | set_bit(irq_source_id, bitmap); |
227 | unlock: | 235 | unlock: |
228 | mutex_unlock(&kvm->irq_lock); | 236 | mutex_unlock(&kvm->irq_lock); |
@@ -233,6 +241,9 @@ unlock: | |||
233 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | 241 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) |
234 | { | 242 | { |
235 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 243 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
244 | #ifdef CONFIG_X86 | ||
245 | ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); | ||
246 | #endif | ||
236 | 247 | ||
237 | mutex_lock(&kvm->irq_lock); | 248 | mutex_lock(&kvm->irq_lock); |
238 | if (irq_source_id < 0 || | 249 | if (irq_source_id < 0 || |
@@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, | |||
321 | switch (ue->u.irqchip.irqchip) { | 332 | switch (ue->u.irqchip.irqchip) { |
322 | case KVM_IRQCHIP_PIC_MASTER: | 333 | case KVM_IRQCHIP_PIC_MASTER: |
323 | e->set = kvm_set_pic_irq; | 334 | e->set = kvm_set_pic_irq; |
324 | max_pin = 16; | 335 | max_pin = PIC_NUM_PINS; |
325 | break; | 336 | break; |
326 | case KVM_IRQCHIP_PIC_SLAVE: | 337 | case KVM_IRQCHIP_PIC_SLAVE: |
327 | e->set = kvm_set_pic_irq; | 338 | e->set = kvm_set_pic_irq; |
328 | max_pin = 16; | 339 | max_pin = PIC_NUM_PINS; |
329 | delta = 8; | 340 | delta = 8; |
330 | break; | 341 | break; |
331 | case KVM_IRQCHIP_IOAPIC: | 342 | case KVM_IRQCHIP_IOAPIC: |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d617f69131d7..c353b4599cec 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); | |||
100 | 100 | ||
101 | static bool largepages_enabled = true; | 101 | static bool largepages_enabled = true; |
102 | 102 | ||
103 | static struct page *hwpoison_page; | 103 | bool kvm_is_mmio_pfn(pfn_t pfn) |
104 | static pfn_t hwpoison_pfn; | ||
105 | |||
106 | struct page *fault_page; | ||
107 | pfn_t fault_pfn; | ||
108 | |||
109 | inline int kvm_is_mmio_pfn(pfn_t pfn) | ||
110 | { | 104 | { |
111 | if (pfn_valid(pfn)) { | 105 | if (pfn_valid(pfn)) { |
112 | int reserved; | 106 | int reserved; |
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn) | |||
137 | /* | 131 | /* |
138 | * Switches to specified vcpu, until a matching vcpu_put() | 132 | * Switches to specified vcpu, until a matching vcpu_put() |
139 | */ | 133 | */ |
140 | void vcpu_load(struct kvm_vcpu *vcpu) | 134 | int vcpu_load(struct kvm_vcpu *vcpu) |
141 | { | 135 | { |
142 | int cpu; | 136 | int cpu; |
143 | 137 | ||
144 | mutex_lock(&vcpu->mutex); | 138 | if (mutex_lock_killable(&vcpu->mutex)) |
139 | return -EINTR; | ||
145 | if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { | 140 | if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { |
146 | /* The thread running this VCPU changed. */ | 141 | /* The thread running this VCPU changed. */ |
147 | struct pid *oldpid = vcpu->pid; | 142 | struct pid *oldpid = vcpu->pid; |
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) | |||
154 | preempt_notifier_register(&vcpu->preempt_notifier); | 149 | preempt_notifier_register(&vcpu->preempt_notifier); |
155 | kvm_arch_vcpu_load(vcpu, cpu); | 150 | kvm_arch_vcpu_load(vcpu, cpu); |
156 | put_cpu(); | 151 | put_cpu(); |
152 | return 0; | ||
157 | } | 153 | } |
158 | 154 | ||
159 | void vcpu_put(struct kvm_vcpu *vcpu) | 155 | void vcpu_put(struct kvm_vcpu *vcpu) |
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
236 | } | 232 | } |
237 | vcpu->run = page_address(page); | 233 | vcpu->run = page_address(page); |
238 | 234 | ||
235 | kvm_vcpu_set_in_spin_loop(vcpu, false); | ||
236 | kvm_vcpu_set_dy_eligible(vcpu, false); | ||
237 | |||
239 | r = kvm_arch_vcpu_init(vcpu); | 238 | r = kvm_arch_vcpu_init(vcpu); |
240 | if (r < 0) | 239 | if (r < 0) |
241 | goto fail_free_run; | 240 | goto fail_free_run; |
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
332 | * count is also read inside the mmu_lock critical section. | 331 | * count is also read inside the mmu_lock critical section. |
333 | */ | 332 | */ |
334 | kvm->mmu_notifier_count++; | 333 | kvm->mmu_notifier_count++; |
335 | for (; start < end; start += PAGE_SIZE) | 334 | need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); |
336 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | ||
337 | need_tlb_flush |= kvm->tlbs_dirty; | 335 | need_tlb_flush |= kvm->tlbs_dirty; |
338 | /* we've to flush the tlb before the pages can be freed */ | 336 | /* we've to flush the tlb before the pages can be freed */ |
339 | if (need_tlb_flush) | 337 | if (need_tlb_flush) |
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, | |||
412 | int idx; | 410 | int idx; |
413 | 411 | ||
414 | idx = srcu_read_lock(&kvm->srcu); | 412 | idx = srcu_read_lock(&kvm->srcu); |
415 | kvm_arch_flush_shadow(kvm); | 413 | kvm_arch_flush_shadow_all(kvm); |
416 | srcu_read_unlock(&kvm->srcu, idx); | 414 | srcu_read_unlock(&kvm->srcu, idx); |
417 | } | 415 | } |
418 | 416 | ||
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) | |||
551 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | 549 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, |
552 | struct kvm_memory_slot *dont) | 550 | struct kvm_memory_slot *dont) |
553 | { | 551 | { |
554 | if (!dont || free->rmap != dont->rmap) | ||
555 | vfree(free->rmap); | ||
556 | |||
557 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | 552 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
558 | kvm_destroy_dirty_bitmap(free); | 553 | kvm_destroy_dirty_bitmap(free); |
559 | 554 | ||
560 | kvm_arch_free_memslot(free, dont); | 555 | kvm_arch_free_memslot(free, dont); |
561 | 556 | ||
562 | free->npages = 0; | 557 | free->npages = 0; |
563 | free->rmap = NULL; | ||
564 | } | 558 | } |
565 | 559 | ||
566 | void kvm_free_physmem(struct kvm *kvm) | 560 | void kvm_free_physmem(struct kvm *kvm) |
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
590 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | 584 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) |
591 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); | 585 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); |
592 | #else | 586 | #else |
593 | kvm_arch_flush_shadow(kvm); | 587 | kvm_arch_flush_shadow_all(kvm); |
594 | #endif | 588 | #endif |
595 | kvm_arch_destroy_vm(kvm); | 589 | kvm_arch_destroy_vm(kvm); |
596 | kvm_free_physmem(kvm); | 590 | kvm_free_physmem(kvm); |
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) | |||
686 | slots->generation++; | 680 | slots->generation++; |
687 | } | 681 | } |
688 | 682 | ||
683 | static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) | ||
684 | { | ||
685 | u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; | ||
686 | |||
687 | #ifdef KVM_CAP_READONLY_MEM | ||
688 | valid_flags |= KVM_MEM_READONLY; | ||
689 | #endif | ||
690 | |||
691 | if (mem->flags & ~valid_flags) | ||
692 | return -EINVAL; | ||
693 | |||
694 | return 0; | ||
695 | } | ||
696 | |||
689 | /* | 697 | /* |
690 | * Allocate some memory and give it an address in the guest physical address | 698 | * Allocate some memory and give it an address in the guest physical address |
691 | * space. | 699 | * space. |
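check_memory_region_flags() above is what makes KVM_MEM_READONLY usable from KVM_SET_USER_MEMORY_REGION when the kernel advertises KVM_CAP_READONLY_MEM (see the api.txt hunk earlier in this series). A hedged userspace sketch follows; the slot number, guest physical address, and the page-aligned backing mapping are assumptions of the example, and kvm_fd/vm_fd are the /dev/kvm and KVM_CREATE_VM file descriptors respectively.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Register "mem" (size bytes, page-aligned) as a read-only guest slot.
 * Guest writes to the range are not applied to the backing memory but
 * exit to userspace instead.  Returns 0 on success, -1 on error. */
static int set_readonly_slot(int kvm_fd, int vm_fd, uint32_t slot,
			     uint64_t gpa, void *mem, uint64_t size)
{
	struct kvm_userspace_memory_region region;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM) <= 0)
		return -1;	/* kernel does not support read-only slots */

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.flags = KVM_MEM_READONLY;
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (uint64_t)(uintptr_t)mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}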
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
706 | struct kvm_memory_slot old, new; | 714 | struct kvm_memory_slot old, new; |
707 | struct kvm_memslots *slots, *old_memslots; | 715 | struct kvm_memslots *slots, *old_memslots; |
708 | 716 | ||
717 | r = check_memory_region_flags(mem); | ||
718 | if (r) | ||
719 | goto out; | ||
720 | |||
709 | r = -EINVAL; | 721 | r = -EINVAL; |
710 | /* General sanity checks */ | 722 | /* General sanity checks */ |
711 | if (mem->memory_size & (PAGE_SIZE - 1)) | 723 | if (mem->memory_size & (PAGE_SIZE - 1)) |
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
769 | if (npages && !old.npages) { | 781 | if (npages && !old.npages) { |
770 | new.user_alloc = user_alloc; | 782 | new.user_alloc = user_alloc; |
771 | new.userspace_addr = mem->userspace_addr; | 783 | new.userspace_addr = mem->userspace_addr; |
772 | #ifndef CONFIG_S390 | 784 | |
773 | new.rmap = vzalloc(npages * sizeof(*new.rmap)); | ||
774 | if (!new.rmap) | ||
775 | goto out_free; | ||
776 | #endif /* not defined CONFIG_S390 */ | ||
777 | if (kvm_arch_create_memslot(&new, npages)) | 785 | if (kvm_arch_create_memslot(&new, npages)) |
778 | goto out_free; | 786 | goto out_free; |
779 | } | 787 | } |
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
785 | /* destroy any largepage mappings for dirty tracking */ | 793 | /* destroy any largepage mappings for dirty tracking */ |
786 | } | 794 | } |
787 | 795 | ||
788 | if (!npages) { | 796 | if (!npages || base_gfn != old.base_gfn) { |
789 | struct kvm_memory_slot *slot; | 797 | struct kvm_memory_slot *slot; |
790 | 798 | ||
791 | r = -ENOMEM; | 799 | r = -ENOMEM; |
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
801 | old_memslots = kvm->memslots; | 809 | old_memslots = kvm->memslots; |
802 | rcu_assign_pointer(kvm->memslots, slots); | 810 | rcu_assign_pointer(kvm->memslots, slots); |
803 | synchronize_srcu_expedited(&kvm->srcu); | 811 | synchronize_srcu_expedited(&kvm->srcu); |
804 | /* From this point no new shadow pages pointing to a deleted | 812 | /* From this point no new shadow pages pointing to a deleted, |
805 | * memslot will be created. | 813 | * or moved, memslot will be created. |
806 | * | 814 | * |
807 | * validation of sp->gfn happens in: | 815 | * validation of sp->gfn happens in: |
808 | * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) | 816 | * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) |
809 | * - kvm_is_visible_gfn (mmu_check_roots) | 817 | * - kvm_is_visible_gfn (mmu_check_roots) |
810 | */ | 818 | */ |
811 | kvm_arch_flush_shadow(kvm); | 819 | kvm_arch_flush_shadow_memslot(kvm, slot); |
812 | kfree(old_memslots); | 820 | kfree(old_memslots); |
813 | } | 821 | } |
814 | 822 | ||
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
832 | 840 | ||
833 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | 841 | /* actual memory is freed via old in kvm_free_physmem_slot below */ |
834 | if (!npages) { | 842 | if (!npages) { |
835 | new.rmap = NULL; | ||
836 | new.dirty_bitmap = NULL; | 843 | new.dirty_bitmap = NULL; |
837 | memset(&new.arch, 0, sizeof(new.arch)); | 844 | memset(&new.arch, 0, sizeof(new.arch)); |
838 | } | 845 | } |
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
844 | 851 | ||
845 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); | 852 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); |
846 | 853 | ||
847 | /* | ||
848 | * If the new memory slot is created, we need to clear all | ||
849 | * mmio sptes. | ||
850 | */ | ||
851 | if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) | ||
852 | kvm_arch_flush_shadow(kvm); | ||
853 | |||
854 | kvm_free_physmem_slot(&old, &new); | 854 | kvm_free_physmem_slot(&old, &new); |
855 | kfree(old_memslots); | 855 | kfree(old_memslots); |
856 | 856 | ||
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void) | |||
932 | } | 932 | } |
933 | EXPORT_SYMBOL_GPL(kvm_disable_largepages); | 933 | EXPORT_SYMBOL_GPL(kvm_disable_largepages); |
934 | 934 | ||
935 | int is_error_page(struct page *page) | ||
936 | { | ||
937 | return page == bad_page || page == hwpoison_page || page == fault_page; | ||
938 | } | ||
939 | EXPORT_SYMBOL_GPL(is_error_page); | ||
940 | |||
941 | int is_error_pfn(pfn_t pfn) | ||
942 | { | ||
943 | return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; | ||
944 | } | ||
945 | EXPORT_SYMBOL_GPL(is_error_pfn); | ||
946 | |||
947 | int is_hwpoison_pfn(pfn_t pfn) | ||
948 | { | ||
949 | return pfn == hwpoison_pfn; | ||
950 | } | ||
951 | EXPORT_SYMBOL_GPL(is_hwpoison_pfn); | ||
952 | |||
953 | int is_fault_pfn(pfn_t pfn) | ||
954 | { | ||
955 | return pfn == fault_pfn; | ||
956 | } | ||
957 | EXPORT_SYMBOL_GPL(is_fault_pfn); | ||
958 | |||
959 | int is_noslot_pfn(pfn_t pfn) | ||
960 | { | ||
961 | return pfn == bad_pfn; | ||
962 | } | ||
963 | EXPORT_SYMBOL_GPL(is_noslot_pfn); | ||
964 | |||
965 | int is_invalid_pfn(pfn_t pfn) | ||
966 | { | ||
967 | return pfn == hwpoison_pfn || pfn == fault_pfn; | ||
968 | } | ||
969 | EXPORT_SYMBOL_GPL(is_invalid_pfn); | ||
970 | |||
971 | static inline unsigned long bad_hva(void) | ||
972 | { | ||
973 | return PAGE_OFFSET; | ||
974 | } | ||
975 | |||
976 | int kvm_is_error_hva(unsigned long addr) | ||
977 | { | ||
978 | return addr == bad_hva(); | ||
979 | } | ||
980 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | ||
981 | |||
982 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 935 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) |
983 | { | 936 | { |
984 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); | 937 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); |
@@ -1021,28 +974,62 @@ out: | |||
1021 | return size; | 974 | return size; |
1022 | } | 975 | } |
1023 | 976 | ||
1024 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | 977 | static bool memslot_is_readonly(struct kvm_memory_slot *slot) |
1025 | gfn_t *nr_pages) | 978 | { |
979 | return slot->flags & KVM_MEM_READONLY; | ||
980 | } | ||
981 | |||
982 | static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | ||
983 | gfn_t *nr_pages, bool write) | ||
1026 | { | 984 | { |
1027 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) | 985 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) |
1028 | return bad_hva(); | 986 | return KVM_HVA_ERR_BAD; |
987 | |||
988 | if (memslot_is_readonly(slot) && write) | ||
989 | return KVM_HVA_ERR_RO_BAD; | ||
1029 | 990 | ||
1030 | if (nr_pages) | 991 | if (nr_pages) |
1031 | *nr_pages = slot->npages - (gfn - slot->base_gfn); | 992 | *nr_pages = slot->npages - (gfn - slot->base_gfn); |
1032 | 993 | ||
1033 | return gfn_to_hva_memslot(slot, gfn); | 994 | return __gfn_to_hva_memslot(slot, gfn); |
1034 | } | 995 | } |
1035 | 996 | ||
997 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | ||
998 | gfn_t *nr_pages) | ||
999 | { | ||
1000 | return __gfn_to_hva_many(slot, gfn, nr_pages, true); | ||
1001 | } | ||
1002 | |||
1003 | unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, | ||
1004 | gfn_t gfn) | ||
1005 | { | ||
1006 | return gfn_to_hva_many(slot, gfn, NULL); | ||
1007 | } | ||
1008 | EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); | ||
1009 | |||
1036 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | 1010 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) |
1037 | { | 1011 | { |
1038 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); | 1012 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); |
1039 | } | 1013 | } |
1040 | EXPORT_SYMBOL_GPL(gfn_to_hva); | 1014 | EXPORT_SYMBOL_GPL(gfn_to_hva); |
1041 | 1015 | ||
1042 | static pfn_t get_fault_pfn(void) | 1016 | /* |
1017 | * The hva returned by this function is only allowed to be read. | ||
1018 | * It should pair with kvm_read_hva() or kvm_read_hva_atomic(). | ||
1019 | */ | ||
1020 | static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) | ||
1021 | { | ||
1022 | return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); | ||
1023 | } | ||
1024 | |||
1025 | static int kvm_read_hva(void *data, void __user *hva, int len) | ||
1043 | { | 1026 | { |
1044 | get_page(fault_page); | 1027 | return __copy_from_user(data, hva, len); |
1045 | return fault_pfn; | 1028 | } |
1029 | |||
1030 | static int kvm_read_hva_atomic(void *data, void __user *hva, int len) | ||
1031 | { | ||
1032 | return __copy_from_user_inatomic(data, hva, len); | ||
1046 | } | 1033 | } |
1047 | 1034 | ||
1048 | int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, | 1035 | int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, |
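__gfn_to_hva_many() now takes a write argument so that a read-only slot still translates for reads while a write translation fails with KVM_HVA_ERR_RO_BAD; gfn_to_hva() keeps requesting write access, and the new gfn_to_hva_read() requests read access and pairs with kvm_read_hva()/kvm_read_hva_atomic(). A compressed stand-alone model of that decision follows; the slot struct, the error constants, and the fixed 4KB page shift are simplifications of this sketch, not kernel definitions.

#include <stdbool.h>
#include <stdint.h>

#define HVA_ERR_BAD	((unsigned long)-1)	/* stand-in for KVM_HVA_ERR_BAD    */
#define HVA_ERR_RO_BAD	((unsigned long)-2)	/* stand-in for KVM_HVA_ERR_RO_BAD */

struct slot_model {
	uint64_t base_gfn;
	uint64_t npages;
	unsigned long userspace_addr;
	bool readonly;
	bool invalid;
};

/* Model of __gfn_to_hva_many(): a read-only slot resolves for reads but
 * refuses write translations. */
static unsigned long gfn_to_hva_model(const struct slot_model *slot,
				      uint64_t gfn, bool write)
{
	if (!slot || slot->invalid ||
	    gfn < slot->base_gfn || gfn >= slot->base_gfn + slot->npages)
		return HVA_ERR_BAD;

	if (write && slot->readonly)
		return HVA_ERR_RO_BAD;

	return slot->userspace_addr + ((gfn - slot->base_gfn) << 12);
}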
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr) | |||
1065 | return rc == -EHWPOISON; | 1052 | return rc == -EHWPOISON; |
1066 | } | 1053 | } |
1067 | 1054 | ||
1068 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, | 1055 | /* |
1069 | bool *async, bool write_fault, bool *writable) | 1056 | * The atomic path to get the writable pfn which will be stored in @pfn, |
1057 | * true indicates success, otherwise false is returned. | ||
1058 | */ | ||
1059 | static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, | ||
1060 | bool write_fault, bool *writable, pfn_t *pfn) | ||
1070 | { | 1061 | { |
1071 | struct page *page[1]; | 1062 | struct page *page[1]; |
1072 | int npages = 0; | 1063 | int npages; |
1073 | pfn_t pfn; | ||
1074 | 1064 | ||
1075 | /* we can do it either atomically or asynchronously, not both */ | 1065 | if (!(async || atomic)) |
1076 | BUG_ON(atomic && async); | 1066 | return false; |
1077 | 1067 | ||
1078 | BUG_ON(!write_fault && !writable); | 1068 | /* |
1069 | * Fast pin a writable pfn only if it is a write fault request | ||
1070 | * or the caller allows to map a writable pfn for a read fault | ||
1071 | * request. | ||
1072 | */ | ||
1073 | if (!(write_fault || writable)) | ||
1074 | return false; | ||
1079 | 1075 | ||
1080 | if (writable) | 1076 | npages = __get_user_pages_fast(addr, 1, 1, page); |
1081 | *writable = true; | 1077 | if (npages == 1) { |
1078 | *pfn = page_to_pfn(page[0]); | ||
1082 | 1079 | ||
1083 | if (atomic || async) | 1080 | if (writable) |
1084 | npages = __get_user_pages_fast(addr, 1, 1, page); | 1081 | *writable = true; |
1082 | return true; | ||
1083 | } | ||
1085 | 1084 | ||
1086 | if (unlikely(npages != 1) && !atomic) { | 1085 | return false; |
1087 | might_sleep(); | 1086 | } |
1088 | 1087 | ||
1089 | if (writable) | 1088 | /* |
1090 | *writable = write_fault; | 1089 | * The slow path to get the pfn of the specified host virtual address, |
1090 | * 1 indicates success, -errno is returned if error is detected. | ||
1091 | */ | ||
1092 | static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | ||
1093 | bool *writable, pfn_t *pfn) | ||
1094 | { | ||
1095 | struct page *page[1]; | ||
1096 | int npages = 0; | ||
1091 | 1097 | ||
1092 | if (async) { | 1098 | might_sleep(); |
1093 | down_read(¤t->mm->mmap_sem); | 1099 | |
1094 | npages = get_user_page_nowait(current, current->mm, | 1100 | if (writable) |
1095 | addr, write_fault, page); | 1101 | *writable = write_fault; |
1096 | up_read(¤t->mm->mmap_sem); | 1102 | |
1097 | } else | 1103 | if (async) { |
1098 | npages = get_user_pages_fast(addr, 1, write_fault, | 1104 | down_read(¤t->mm->mmap_sem); |
1099 | page); | 1105 | npages = get_user_page_nowait(current, current->mm, |
1100 | 1106 | addr, write_fault, page); | |
1101 | /* map read fault as writable if possible */ | 1107 | up_read(¤t->mm->mmap_sem); |
1102 | if (unlikely(!write_fault) && npages == 1) { | 1108 | } else |
1103 | struct page *wpage[1]; | 1109 | npages = get_user_pages_fast(addr, 1, write_fault, |
1104 | 1110 | page); | |
1105 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | 1111 | if (npages != 1) |
1106 | if (npages == 1) { | 1112 | return npages; |
1107 | *writable = true; | 1113 | |
1108 | put_page(page[0]); | 1114 | /* map read fault as writable if possible */ |
1109 | page[0] = wpage[0]; | 1115 | if (unlikely(!write_fault) && writable) { |
1110 | } | 1116 | struct page *wpage[1]; |
1111 | npages = 1; | 1117 | |
1118 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | ||
1119 | if (npages == 1) { | ||
1120 | *writable = true; | ||
1121 | put_page(page[0]); | ||
1122 | page[0] = wpage[0]; | ||
1112 | } | 1123 | } |
1124 | |||
1125 | npages = 1; | ||
1113 | } | 1126 | } |
1127 | *pfn = page_to_pfn(page[0]); | ||
1128 | return npages; | ||
1129 | } | ||
1114 | 1130 | ||
1115 | if (unlikely(npages != 1)) { | 1131 | static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) |
1116 | struct vm_area_struct *vma; | 1132 | { |
1133 | if (unlikely(!(vma->vm_flags & VM_READ))) | ||
1134 | return false; | ||
1117 | 1135 | ||
1118 | if (atomic) | 1136 | if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) |
1119 | return get_fault_pfn(); | 1137 | return false; |
1120 | 1138 | ||
1121 | down_read(¤t->mm->mmap_sem); | 1139 | return true; |
1122 | if (npages == -EHWPOISON || | 1140 | } |
1123 | (!async && check_user_page_hwpoison(addr))) { | ||
1124 | up_read(¤t->mm->mmap_sem); | ||
1125 | get_page(hwpoison_page); | ||
1126 | return page_to_pfn(hwpoison_page); | ||
1127 | } | ||
1128 | 1141 | ||
1129 | vma = find_vma_intersection(current->mm, addr, addr+1); | 1142 | /* |
1130 | 1143 | * Pin guest page in memory and return its pfn. | |
1131 | if (vma == NULL) | 1144 | * @addr: host virtual address which maps memory to the guest |
1132 | pfn = get_fault_pfn(); | 1145 | * @atomic: whether this function can sleep |
1133 | else if ((vma->vm_flags & VM_PFNMAP)) { | 1146 | * @async: whether this function need to wait IO complete if the |
1134 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + | 1147 | * host page is not in the memory |
1135 | vma->vm_pgoff; | 1148 | * @write_fault: whether we should get a writable host page |
1136 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | 1149 | * @writable: whether it allows to map a writable host page for !@write_fault |
1137 | } else { | 1150 | * |
1138 | if (async && (vma->vm_flags & VM_WRITE)) | 1151 | * The function will map a writable host page for these two cases: |
1139 | *async = true; | 1152 | * 1): @write_fault = true |
1140 | pfn = get_fault_pfn(); | 1153 | * 2): @write_fault = false && @writable, @writable will tell the caller |
1141 | } | 1154 | * whether the mapping is writable. |
1142 | up_read(¤t->mm->mmap_sem); | 1155 | */ |
1143 | } else | 1156 | static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, |
1144 | pfn = page_to_pfn(page[0]); | 1157 | bool write_fault, bool *writable) |
1158 | { | ||
1159 | struct vm_area_struct *vma; | ||
1160 | pfn_t pfn = 0; | ||
1161 | int npages; | ||
1162 | |||
1163 | /* we can do it either atomically or asynchronously, not both */ | ||
1164 | BUG_ON(atomic && async); | ||
1145 | 1165 | ||
1166 | if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) | ||
1167 | return pfn; | ||
1168 | |||
1169 | if (atomic) | ||
1170 | return KVM_PFN_ERR_FAULT; | ||
1171 | |||
1172 | npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); | ||
1173 | if (npages == 1) | ||
1174 | return pfn; | ||
1175 | |||
1176 | down_read(¤t->mm->mmap_sem); | ||
1177 | if (npages == -EHWPOISON || | ||
1178 | (!async && check_user_page_hwpoison(addr))) { | ||
1179 | pfn = KVM_PFN_ERR_HWPOISON; | ||
1180 | goto exit; | ||
1181 | } | ||
1182 | |||
1183 | vma = find_vma_intersection(current->mm, addr, addr + 1); | ||
1184 | |||
1185 | if (vma == NULL) | ||
1186 | pfn = KVM_PFN_ERR_FAULT; | ||
1187 | else if ((vma->vm_flags & VM_PFNMAP)) { | ||
1188 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + | ||
1189 | vma->vm_pgoff; | ||
1190 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | ||
1191 | } else { | ||
1192 | if (async && vma_is_valid(vma, write_fault)) | ||
1193 | *async = true; | ||
1194 | pfn = KVM_PFN_ERR_FAULT; | ||
1195 | } | ||
1196 | exit: | ||
1197 | up_read(¤t->mm->mmap_sem); | ||
1146 | return pfn; | 1198 | return pfn; |
1147 | } | 1199 | } |
1148 | 1200 | ||
1149 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) | 1201 | static pfn_t |
1202 | __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, | ||
1203 | bool *async, bool write_fault, bool *writable) | ||
1150 | { | 1204 | { |
1151 | return hva_to_pfn(kvm, addr, true, NULL, true, NULL); | 1205 | unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); |
1206 | |||
1207 | if (addr == KVM_HVA_ERR_RO_BAD) | ||
1208 | return KVM_PFN_ERR_RO_FAULT; | ||
1209 | |||
1210 | if (kvm_is_error_hva(addr)) | ||
1211 | return KVM_PFN_ERR_BAD; | ||
1212 | |||
1213 | /* Do not map writable pfn in the readonly memslot. */ | ||
1214 | if (writable && memslot_is_readonly(slot)) { | ||
1215 | *writable = false; | ||
1216 | writable = NULL; | ||
1217 | } | ||
1218 | |||
1219 | return hva_to_pfn(addr, atomic, async, write_fault, | ||
1220 | writable); | ||
1152 | } | 1221 | } |
1153 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); | ||
1154 | 1222 | ||
1155 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, | 1223 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, |
1156 | bool write_fault, bool *writable) | 1224 | bool write_fault, bool *writable) |
1157 | { | 1225 | { |
1158 | unsigned long addr; | 1226 | struct kvm_memory_slot *slot; |
1159 | 1227 | ||
1160 | if (async) | 1228 | if (async) |
1161 | *async = false; | 1229 | *async = false; |
1162 | 1230 | ||
1163 | addr = gfn_to_hva(kvm, gfn); | 1231 | slot = gfn_to_memslot(kvm, gfn); |
1164 | if (kvm_is_error_hva(addr)) { | ||
1165 | get_page(bad_page); | ||
1166 | return page_to_pfn(bad_page); | ||
1167 | } | ||
1168 | 1232 | ||
1169 | return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); | 1233 | return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, |
1234 | writable); | ||
1170 | } | 1235 | } |
1171 | 1236 | ||
1172 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) | 1237 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) |
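hva_to_pfn() is now structured as a lock-free fast path (hva_to_pfn_fast, built on __get_user_pages_fast), a sleeping slow path (hva_to_pfn_slow), and finally a VMA walk for VM_PFNMAP (MMIO) ranges. The shape of that "cheap attempt first, fall back, then inspect the mapping" flow can be sketched stand-alone; pin_fast(), pin_slow(), and addr_is_pfnmap() below are hypothetical stubs standing in for the real helpers, present only so the sketch compiles.

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t pfn_t;
#define PFN_ERR_FAULT ((pfn_t)-1)	/* stand-in for KVM_PFN_ERR_FAULT */

/* Stubs for illustration only; the kernel versions pin real pages. */
static bool pin_fast(unsigned long addr, pfn_t *pfn)
{ (void)addr; (void)pfn; return false; }
static int pin_slow(unsigned long addr, bool write, pfn_t *pfn)
{ (void)addr; (void)write; (void)pfn; return 0; }
static bool addr_is_pfnmap(unsigned long addr, pfn_t *pfn)
{ (void)addr; (void)pfn; return false; }

static pfn_t hva_to_pfn_model(unsigned long addr, bool atomic, bool write)
{
	pfn_t pfn;

	if (pin_fast(addr, &pfn))		/* lock-free, never sleeps       */
		return pfn;

	if (atomic)				/* caller may not sleep: give up */
		return PFN_ERR_FAULT;

	if (pin_slow(addr, write, &pfn) == 1)	/* may sleep / wait for IO       */
		return pfn;

	if (addr_is_pfnmap(addr, &pfn))		/* raw PFN (MMIO) mapping        */
		return pfn;

	return PFN_ERR_FAULT;
}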
@@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | |||
1195 | } | 1260 | } |
1196 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); | 1261 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); |
1197 | 1262 | ||
1198 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 1263 | pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) |
1199 | struct kvm_memory_slot *slot, gfn_t gfn) | 1264 | { |
1265 | return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); | ||
1266 | } | ||
1267 | |||
1268 | pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) | ||
1200 | { | 1269 | { |
1201 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); | 1270 | return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); |
1202 | return hva_to_pfn(kvm, addr, false, NULL, true, NULL); | ||
1203 | } | 1271 | } |
1272 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); | ||
1204 | 1273 | ||
1205 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | 1274 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, |
1206 | int nr_pages) | 1275 | int nr_pages) |
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | |||
1219 | } | 1288 | } |
1220 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); | 1289 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); |
1221 | 1290 | ||
1291 | static struct page *kvm_pfn_to_page(pfn_t pfn) | ||
1292 | { | ||
1293 | if (is_error_pfn(pfn)) | ||
1294 | return KVM_ERR_PTR_BAD_PAGE; | ||
1295 | |||
1296 | if (kvm_is_mmio_pfn(pfn)) { | ||
1297 | WARN_ON(1); | ||
1298 | return KVM_ERR_PTR_BAD_PAGE; | ||
1299 | } | ||
1300 | |||
1301 | return pfn_to_page(pfn); | ||
1302 | } | ||
1303 | |||
1222 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | 1304 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) |
1223 | { | 1305 | { |
1224 | pfn_t pfn; | 1306 | pfn_t pfn; |
1225 | 1307 | ||
1226 | pfn = gfn_to_pfn(kvm, gfn); | 1308 | pfn = gfn_to_pfn(kvm, gfn); |
1227 | if (!kvm_is_mmio_pfn(pfn)) | ||
1228 | return pfn_to_page(pfn); | ||
1229 | |||
1230 | WARN_ON(kvm_is_mmio_pfn(pfn)); | ||
1231 | 1309 | ||
1232 | get_page(bad_page); | 1310 | return kvm_pfn_to_page(pfn); |
1233 | return bad_page; | ||
1234 | } | 1311 | } |
1235 | 1312 | ||
1236 | EXPORT_SYMBOL_GPL(gfn_to_page); | 1313 | EXPORT_SYMBOL_GPL(gfn_to_page); |
1237 | 1314 | ||
1238 | void kvm_release_page_clean(struct page *page) | 1315 | void kvm_release_page_clean(struct page *page) |
1239 | { | 1316 | { |
1317 | WARN_ON(is_error_page(page)); | ||
1318 | |||
1240 | kvm_release_pfn_clean(page_to_pfn(page)); | 1319 | kvm_release_pfn_clean(page_to_pfn(page)); |
1241 | } | 1320 | } |
1242 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); | 1321 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); |
1243 | 1322 | ||
1244 | void kvm_release_pfn_clean(pfn_t pfn) | 1323 | void kvm_release_pfn_clean(pfn_t pfn) |
1245 | { | 1324 | { |
1325 | WARN_ON(is_error_pfn(pfn)); | ||
1326 | |||
1246 | if (!kvm_is_mmio_pfn(pfn)) | 1327 | if (!kvm_is_mmio_pfn(pfn)) |
1247 | put_page(pfn_to_page(pfn)); | 1328 | put_page(pfn_to_page(pfn)); |
1248 | } | 1329 | } |
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); | |||
1250 | 1331 | ||
1251 | void kvm_release_page_dirty(struct page *page) | 1332 | void kvm_release_page_dirty(struct page *page) |
1252 | { | 1333 | { |
1334 | WARN_ON(is_error_page(page)); | ||
1335 | |||
1253 | kvm_release_pfn_dirty(page_to_pfn(page)); | 1336 | kvm_release_pfn_dirty(page_to_pfn(page)); |
1254 | } | 1337 | } |
1255 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); | 1338 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); |
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | |||
1305 | int r; | 1388 | int r; |
1306 | unsigned long addr; | 1389 | unsigned long addr; |
1307 | 1390 | ||
1308 | addr = gfn_to_hva(kvm, gfn); | 1391 | addr = gfn_to_hva_read(kvm, gfn); |
1309 | if (kvm_is_error_hva(addr)) | 1392 | if (kvm_is_error_hva(addr)) |
1310 | return -EFAULT; | 1393 | return -EFAULT; |
1311 | r = __copy_from_user(data, (void __user *)addr + offset, len); | 1394 | r = kvm_read_hva(data, (void __user *)addr + offset, len); |
1312 | if (r) | 1395 | if (r) |
1313 | return -EFAULT; | 1396 | return -EFAULT; |
1314 | return 0; | 1397 | return 0; |
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | |||
1343 | gfn_t gfn = gpa >> PAGE_SHIFT; | 1426 | gfn_t gfn = gpa >> PAGE_SHIFT; |
1344 | int offset = offset_in_page(gpa); | 1427 | int offset = offset_in_page(gpa); |
1345 | 1428 | ||
1346 | addr = gfn_to_hva(kvm, gfn); | 1429 | addr = gfn_to_hva_read(kvm, gfn); |
1347 | if (kvm_is_error_hva(addr)) | 1430 | if (kvm_is_error_hva(addr)) |
1348 | return -EFAULT; | 1431 | return -EFAULT; |
1349 | pagefault_disable(); | 1432 | pagefault_disable(); |
1350 | r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); | 1433 | r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); |
1351 | pagefault_enable(); | 1434 | pagefault_enable(); |
1352 | if (r) | 1435 | if (r) |
1353 | return -EFAULT; | 1436 | return -EFAULT; |
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) | |||
1580 | } | 1663 | } |
1581 | EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); | 1664 | EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); |
1582 | 1665 | ||
1666 | #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT | ||
1667 | /* | ||
1668 | * Helper that checks whether a VCPU is eligible for directed yield. | ||
1669 | * Most eligible candidate to yield is decided by following heuristics: | ||
1670 | * | ||
1671 | * (a) VCPU which has not done pl-exit or cpu relax intercepted recently | ||
1672 | * (preempted lock holder), indicated by @in_spin_loop. | ||
1673 | * Set at the beginning and cleared at the end of interception/PLE handler. | ||
1674 | * | ||
1675 | * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get | ||
1676 | * chance last time (mostly it has become eligible now since we have probably | ||
1677 | * yielded to lockholder in last iteration. This is done by toggling | ||
1678 | * @dy_eligible each time a VCPU checked for eligibility.) | ||
1679 | * | ||
1680 | * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding | ||
1681 | * to preempted lock-holder could result in wrong VCPU selection and CPU | ||
1682 | * burning. Giving priority for a potential lock-holder increases lock | ||
1683 | * progress. | ||
1684 | * | ||
1685 | * Since algorithm is based on heuristics, accessing another VCPU data without | ||
1686 | * locking does not harm. It may result in trying to yield to same VCPU, fail | ||
1687 | * and continue with next VCPU and so on. | ||
1688 | */ | ||
1689 | bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) | ||
1690 | { | ||
1691 | bool eligible; | ||
1692 | |||
1693 | eligible = !vcpu->spin_loop.in_spin_loop || | ||
1694 | (vcpu->spin_loop.in_spin_loop && | ||
1695 | vcpu->spin_loop.dy_eligible); | ||
1696 | |||
1697 | if (vcpu->spin_loop.in_spin_loop) | ||
1698 | kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); | ||
1699 | |||
1700 | return eligible; | ||
1701 | } | ||
1702 | #endif | ||
1583 | void kvm_vcpu_on_spin(struct kvm_vcpu *me) | 1703 | void kvm_vcpu_on_spin(struct kvm_vcpu *me) |
1584 | { | 1704 | { |
1585 | struct kvm *kvm = me->kvm; | 1705 | struct kvm *kvm = me->kvm; |
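The comment block above describes the PLE directed-yield heuristic: a vcpu that is not in a spin loop is always an eligible yield target, while a spinning vcpu alternates between being skipped and being considered via the dy_eligible toggle. A small stand-alone model of that toggle (plain C, not kernel code) makes the alternation explicit:

#include <stdbool.h>
#include <stdio.h>

struct spin_loop_model {
	bool in_spin_loop;	/* set while handling a PLE/cpu-relax exit */
	bool dy_eligible;	/* toggled on every eligibility check      */
};

/* Mirrors kvm_vcpu_eligible_for_directed_yield(): eligible if not
 * spinning, or if spinning but skipped on the previous check. */
static bool eligible_for_directed_yield(struct spin_loop_model *s)
{
	bool eligible = !s->in_spin_loop ||
			(s->in_spin_loop && s->dy_eligible);

	if (s->in_spin_loop)
		s->dy_eligible = !s->dy_eligible;

	return eligible;
}

int main(void)
{
	struct spin_loop_model s = { .in_spin_loop = true, .dy_eligible = false };

	/* A spinning vcpu is skipped and considered on alternate checks. */
	for (int i = 0; i < 4; i++)
		printf("check %d: %s\n", i,
		       eligible_for_directed_yield(&s) ? "eligible" : "skip");
	return 0;
}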
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
1589 | int pass; | 1709 | int pass; |
1590 | int i; | 1710 | int i; |
1591 | 1711 | ||
1712 | kvm_vcpu_set_in_spin_loop(me, true); | ||
1592 | /* | 1713 | /* |
1593 | * We boost the priority of a VCPU that is runnable but not | 1714 | * We boost the priority of a VCPU that is runnable but not |
1594 | * currently running, because it got preempted by something | 1715 | * currently running, because it got preempted by something |
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
1607 | continue; | 1728 | continue; |
1608 | if (waitqueue_active(&vcpu->wq)) | 1729 | if (waitqueue_active(&vcpu->wq)) |
1609 | continue; | 1730 | continue; |
1731 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) | ||
1732 | continue; | ||
1610 | if (kvm_vcpu_yield_to(vcpu)) { | 1733 | if (kvm_vcpu_yield_to(vcpu)) { |
1611 | kvm->last_boosted_vcpu = i; | 1734 | kvm->last_boosted_vcpu = i; |
1612 | yielded = 1; | 1735 | yielded = 1; |
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
1614 | } | 1737 | } |
1615 | } | 1738 | } |
1616 | } | 1739 | } |
1740 | kvm_vcpu_set_in_spin_loop(me, false); | ||
1741 | |||
1742 | /* Ensure vcpu is not eligible during next spinloop */ | ||
1743 | kvm_vcpu_set_dy_eligible(me, false); | ||
1617 | } | 1744 | } |
1618 | EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); | 1745 | EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); |
1619 | 1746 | ||
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp, | |||
1766 | #endif | 1893 | #endif |
1767 | 1894 | ||
1768 | 1895 | ||
1769 | vcpu_load(vcpu); | 1896 | r = vcpu_load(vcpu); |
1897 | if (r) | ||
1898 | return r; | ||
1770 | switch (ioctl) { | 1899 | switch (ioctl) { |
1771 | case KVM_RUN: | 1900 | case KVM_RUN: |
1772 | r = -EINVAL; | 1901 | r = -EINVAL; |
@@ -2094,6 +2223,29 @@ static long kvm_vm_ioctl(struct file *filp, | |||
2094 | break; | 2223 | break; |
2095 | } | 2224 | } |
2096 | #endif | 2225 | #endif |
2226 | #ifdef __KVM_HAVE_IRQ_LINE | ||
2227 | case KVM_IRQ_LINE_STATUS: | ||
2228 | case KVM_IRQ_LINE: { | ||
2229 | struct kvm_irq_level irq_event; | ||
2230 | |||
2231 | r = -EFAULT; | ||
2232 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
2233 | goto out; | ||
2234 | |||
2235 | r = kvm_vm_ioctl_irq_line(kvm, &irq_event); | ||
2236 | if (r) | ||
2237 | goto out; | ||
2238 | |||
2239 | r = -EFAULT; | ||
2240 | if (ioctl == KVM_IRQ_LINE_STATUS) { | ||
2241 | if (copy_to_user(argp, &irq_event, sizeof irq_event)) | ||
2242 | goto out; | ||
2243 | } | ||
2244 | |||
2245 | r = 0; | ||
2246 | break; | ||
2247 | } | ||
2248 | #endif | ||
2097 | default: | 2249 | default: |
2098 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); | 2250 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); |
2099 | if (r == -ENOTTY) | 2251 | if (r == -ENOTTY) |
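Moving the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS handling into the generic kvm_vm_ioctl() lets every architecture that defines __KVM_HAVE_IRQ_LINE share it; the userspace call is unchanged. A minimal sketch of raising and lowering a GSI and reading back the coalescing status follows, with vm_fd and gsi assumed to be supplied by the caller.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Pulse a GSI: assert, read back the delivery status, then de-assert.
 * With KVM_IRQ_LINE_STATUS the kernel reports in irq_event.status
 * whether the interrupt was delivered (non-zero) or coalesced (0). */
static int pulse_irq_line(int vm_fd, uint32_t gsi)
{
	struct kvm_irq_level irq_event;

	memset(&irq_event, 0, sizeof(irq_event));
	irq_event.irq = gsi;
	irq_event.level = 1;
	if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq_event) < 0)
		return -1;
	/* irq_event.status is 0 here if the interrupt was coalesced */

	irq_event.irq = gsi;	/* status shares a union with irq, so reset it */
	irq_event.level = 0;
	return ioctl(vm_fd, KVM_IRQ_LINE, &irq_event);
}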
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = { | |||
2698 | .resume = kvm_resume, | 2850 | .resume = kvm_resume, |
2699 | }; | 2851 | }; |
2700 | 2852 | ||
2701 | struct page *bad_page; | ||
2702 | pfn_t bad_pfn; | ||
2703 | |||
2704 | static inline | 2853 | static inline |
2705 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | 2854 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) |
2706 | { | 2855 | { |
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2732 | if (r) | 2881 | if (r) |
2733 | goto out_fail; | 2882 | goto out_fail; |
2734 | 2883 | ||
2735 | bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
2736 | |||
2737 | if (bad_page == NULL) { | ||
2738 | r = -ENOMEM; | ||
2739 | goto out; | ||
2740 | } | ||
2741 | |||
2742 | bad_pfn = page_to_pfn(bad_page); | ||
2743 | |||
2744 | hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
2745 | |||
2746 | if (hwpoison_page == NULL) { | ||
2747 | r = -ENOMEM; | ||
2748 | goto out_free_0; | ||
2749 | } | ||
2750 | |||
2751 | hwpoison_pfn = page_to_pfn(hwpoison_page); | ||
2752 | |||
2753 | fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
2754 | |||
2755 | if (fault_page == NULL) { | ||
2756 | r = -ENOMEM; | ||
2757 | goto out_free_0; | ||
2758 | } | ||
2759 | |||
2760 | fault_pfn = page_to_pfn(fault_page); | ||
2761 | |||
2762 | if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { | 2884 | if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { |
2763 | r = -ENOMEM; | 2885 | r = -ENOMEM; |
2764 | goto out_free_0; | 2886 | goto out_free_0; |
@@ -2833,12 +2955,6 @@ out_free_1: | |||
2833 | out_free_0a: | 2955 | out_free_0a: |
2834 | free_cpumask_var(cpus_hardware_enabled); | 2956 | free_cpumask_var(cpus_hardware_enabled); |
2835 | out_free_0: | 2957 | out_free_0: |
2836 | if (fault_page) | ||
2837 | __free_page(fault_page); | ||
2838 | if (hwpoison_page) | ||
2839 | __free_page(hwpoison_page); | ||
2840 | __free_page(bad_page); | ||
2841 | out: | ||
2842 | kvm_arch_exit(); | 2958 | kvm_arch_exit(); |
2843 | out_fail: | 2959 | out_fail: |
2844 | return r; | 2960 | return r; |
@@ -2858,8 +2974,5 @@ void kvm_exit(void) | |||
2858 | kvm_arch_hardware_unsetup(); | 2974 | kvm_arch_hardware_unsetup(); |
2859 | kvm_arch_exit(); | 2975 | kvm_arch_exit(); |
2860 | free_cpumask_var(cpus_hardware_enabled); | 2976 | free_cpumask_var(cpus_hardware_enabled); |
2861 | __free_page(fault_page); | ||
2862 | __free_page(hwpoison_page); | ||
2863 | __free_page(bad_page); | ||
2864 | } | 2977 | } |
2865 | EXPORT_SYMBOL_GPL(kvm_exit); | 2978 | EXPORT_SYMBOL_GPL(kvm_exit); |