author     Linus Torvalds <torvalds@linux-foundation.org>   2010-08-04 13:43:01 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2010-08-04 13:43:01 -0400
commit     5e83f6fbdb020b70c0e413312801424d13c58d68 (patch)
tree       ca270178fa891813dbc47751c331fed975d3766c
parent     fe445c6e2cb62a566e1a89f8798de11459975710 (diff)
parent     3444d7da1839b851eefedd372978d8a982316c36 (diff)
Merge branch 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (198 commits)
  KVM: VMX: Fix host GDT.LIMIT corruption
  KVM: MMU: using __xchg_spte more smarter
  KVM: MMU: cleanup spte set and accssed/dirty tracking
  KVM: MMU: don't atomicly set spte if it's not present
  KVM: MMU: fix page dirty tracking lost while sync page
  KVM: MMU: fix broken page accessed tracking with ept enabled
  KVM: MMU: add missing reserved bits check in speculative path
  KVM: MMU: fix mmu notifier invalidate handler for huge spte
  KVM: x86 emulator: fix xchg instruction emulation
  KVM: x86: Call mask notifiers from pic
  KVM: x86: never re-execute instruction with enabled tdp
  KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl
  KVM: x86: emulator: inc/dec can have lock prefix
  KVM: MMU: Eliminate redundant temporaries in FNAME(fetch)
  KVM: MMU: Validate all gptes during fetch, not just those used for new pages
  KVM: MMU: Simplify spte fetch() function
  KVM: MMU: Add gpte_valid() helper
  KVM: MMU: Add validate_direct_spte() helper
  KVM: MMU: Add drop_large_spte() helper
  KVM: MMU: Use __set_spte to link shadow pages
  ...
-rw-r--r--  Documentation/feature-removal-schedule.txt |   21
-rw-r--r--  Documentation/kvm/api.txt                  |  208
-rw-r--r--  Documentation/kvm/mmu.txt                  |   52
-rw-r--r--  Documentation/kvm/msr.txt                  |  153
-rw-r--r--  Documentation/kvm/review-checklist.txt     |   38
-rw-r--r--  arch/ia64/include/asm/kvm_host.h           |    1
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c                   |   50
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h      |   10
-rw-r--r--  arch/powerpc/include/asm/kvm_fpu.h         |   27
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h        |   18
-rw-r--r--  arch/powerpc/kernel/ppc_ksyms.c            |    4
-rw-r--r--  arch/powerpc/kvm/44x_tlb.c                 |    3
-rw-r--r--  arch/powerpc/kvm/Makefile                  |    2
-rw-r--r--  arch/powerpc/kvm/book3s.c                  |   79
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu.c           |    8
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu_host.c      |  134
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c      |  129
-rw-r--r--  arch/powerpc/kvm/book3s_mmu_hpte.c         |  277
-rw-r--r--  arch/powerpc/kvm/book3s_paired_singles.c   |   94
-rw-r--r--  arch/powerpc/kvm/booke.c                   |   12
-rw-r--r--  arch/powerpc/kvm/fpu.S                     |   18
-rw-r--r--  arch/powerpc/kvm/powerpc.c                 |   14
-rw-r--r--  arch/s390/include/asm/kvm_host.h           |    5
-rw-r--r--  arch/s390/kvm/intercept.c                  |    2
-rw-r--r--  arch/s390/kvm/kvm-s390.c                   |   64
-rw-r--r--  arch/s390/kvm/kvm-s390.h                   |    2
-rw-r--r--  arch/x86/include/asm/i387.h                |    2
-rw-r--r--  arch/x86/include/asm/kvm.h                 |   22
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h         |   30
-rw-r--r--  arch/x86/include/asm/kvm_host.h            |   70
-rw-r--r--  arch/x86/include/asm/msr-index.h           |    2
-rw-r--r--  arch/x86/include/asm/vmx.h                 |    5
-rw-r--r--  arch/x86/include/asm/xsave.h               |    6
-rw-r--r--  arch/x86/kernel/i387.c                     |    3
-rw-r--r--  arch/x86/kernel/process.c                  |    1
-rw-r--r--  arch/x86/kvm/emulate.c                     |  749
-rw-r--r--  arch/x86/kvm/i8254.c                       |  146
-rw-r--r--  arch/x86/kvm/i8254.h                       |    4
-rw-r--r--  arch/x86/kvm/i8259.c                       |   48
-rw-r--r--  arch/x86/kvm/irq.c                         |    2
-rw-r--r--  arch/x86/kvm/irq.h                         |    4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h              |    8
-rw-r--r--  arch/x86/kvm/lapic.c                       |   17
-rw-r--r--  arch/x86/kvm/mmu.c                         |  807
-rw-r--r--  arch/x86/kvm/mmutrace.h                    |    2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h                 |  252
-rw-r--r--  arch/x86/kvm/svm.c                         |  138
-rw-r--r--  arch/x86/kvm/timer.c                       |   16
-rw-r--r--  arch/x86/kvm/vmx.c                         |  253
-rw-r--r--  arch/x86/kvm/x86.c                         | 1174
-rw-r--r--  arch/x86/kvm/x86.h                         |    7
-rw-r--r--  include/linux/kvm.h                        |   13
-rw-r--r--  include/linux/kvm_host.h                   |   35
-rw-r--r--  include/linux/kvm_types.h                  |    4
-rw-r--r--  include/linux/mm.h                         |    8
-rw-r--r--  mm/memory-failure.c                        |   33
-rw-r--r--  virt/kvm/assigned-dev.c                    |    7
-rw-r--r--  virt/kvm/coalesced_mmio.c                  |    1
-rw-r--r--  virt/kvm/eventfd.c                         |    1
-rw-r--r--  virt/kvm/ioapic.c                          |    3
-rw-r--r--  virt/kvm/iommu.c                           |   12
-rw-r--r--  virt/kvm/irq_comm.c                        |   15
-rw-r--r--  virt/kvm/kvm_main.c                        |  106
63 files changed, 3328 insertions(+), 2103 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 79cb554761af..b273d35039ed 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -487,17 +487,6 @@ Who: Jan Kiszka <jan.kiszka@web.de>
 
 ----------------------------
 
-What:   KVM memory aliases support
-When:   July 2010
-Why:    Memory aliasing support is used for speeding up guest vga access
-        through the vga windows.
-
-        Modern userspace no longer uses this feature, so it's just bitrotted
-        code and can be removed with no impact.
-Who:    Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:   xtime, wall_to_monotonic
 When:   2.6.36+
 Files:  kernel/time/timekeeping.c include/linux/time.h
@@ -508,16 +497,6 @@ Who: John Stultz <johnstul@us.ibm.com>
 
 ----------------------------
 
-What:   KVM kernel-allocated memory slots
-When:   July 2010
-Why:    Since 2.6.25, kvm supports user-allocated memory slots, which are
-        much more flexible than kernel-allocated slots.  All current userspace
-        supports the newer interface and this code can be removed with no
-        impact.
-Who:    Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:   KVM paravirt mmu host support
 When:   January 2011
 Why:    The paravirt mmu host support is slower than non-paravirt mmu, both
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index a237518e51b9..5f5b64982b1a 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -126,6 +126,10 @@ user fills in the size of the indices array in nmsrs, and in return
 kvm adjusts nmsrs to reflect the actual number of msrs and fills in
 the indices array with their numbers.
 
+Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are
+not returned in the MSR list, as different vcpus can have a different number
+of banks, as set via the KVM_X86_SETUP_MCE ioctl.
+
 4.4 KVM_CHECK_EXTENSION
 
 Capability: basic
@@ -160,29 +164,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_region (in)
 Returns: 0 on success, -1 on error
 
-struct kvm_memory_region {
-        __u32 slot;
-        __u32 flags;
-        __u64 guest_phys_addr;
-        __u64 memory_size; /* bytes */
-};
-
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-
-This ioctl allows the user to create or modify a guest physical memory
-slot.  When changing an existing slot, it may be moved in the guest
-physical memory space, or its flags may be modified.  It may not be
-resized.  Slots may not overlap.
-
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
-
-It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
-of this API, if available.  This newer API allows placing guest memory
-at specified locations in the host address space, yielding better
-control and easy access.
+This ioctl is obsolete and has been removed.
 
 4.6 KVM_CREATE_VCPU
 
@@ -226,17 +208,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_alias (in)
 Returns: 0 (success), -1 (error)
 
-struct kvm_memory_alias {
-        __u32 slot;  /* this has a different namespace than memory slots */
-        __u32 flags;
-        __u64 guest_phys_addr;
-        __u64 memory_size;
-        __u64 target_phys_addr;
-};
-
-Defines a guest physical address space region as an alias to another
-region.  Useful for aliased address, for example the VGA low memory
-window.  Should not be used with userspace memory.
+This ioctl is obsolete and has been removed.
 
 4.9 KVM_RUN
 
@@ -892,6 +864,174 @@ arguments.
 This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+4.39 KVM_SET_IDENTITY_MAP_ADDR
+
+Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long identity (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a one-page region in the guest
+physical address space.  The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address.  The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts because of a quirk in the
+virtualization implementation (see the internals documentation when it pops
+into existence).
+
+4.40 KVM_SET_BOOT_CPU_ID
+
+Capability: KVM_CAP_SET_BOOT_CPU_ID
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: unsigned long vcpu_id
+Returns: 0 on success, -1 on error
+
+Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
+as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
+is vcpu 0.
+
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+        __u32 region[1024];
+};
+
+This ioctl copies the current vcpu's xsave struct to userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+        __u32 region[1024];
+};
+
+This ioctl copies userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+        __u32 xcr;
+        __u32 reserved;
+        __u64 value;
+};
+
+struct kvm_xcrs {
+        __u32 nr_xcrs;
+        __u32 flags;
+        struct kvm_xcr xcrs[KVM_MAX_XCRS];
+        __u64 padding[16];
+};
+
+This ioctl copies the current vcpu's xcrs to userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+        __u32 xcr;
+        __u32 reserved;
+        __u64 value;
+};
+
+struct kvm_xcrs {
+        __u32 nr_xcrs;
+        __u32 flags;
+        struct kvm_xcr xcrs[KVM_MAX_XCRS];
+        __u64 padding[16];
+};
+
+This ioctl sets the vcpu's xcrs to the values userspace specified.
+
+4.45 KVM_GET_SUPPORTED_CPUID
+
+Capability: KVM_CAP_EXT_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+        __u32 nent;
+        __u32 padding;
+        struct kvm_cpuid_entry2 entries[0];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+struct kvm_cpuid_entry2 {
+        __u32 function;
+        __u32 index;
+        __u32 flags;
+        __u32 eax;
+        __u32 ebx;
+        __u32 ecx;
+        __u32 edx;
+        __u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are supported by both the hardware
+and kvm.  Userspace can use the information returned by this ioctl to
+construct cpuid information (for KVM_SET_CPUID2) that is consistent with
+hardware, kernel, and userspace capabilities, and with user requirements (for
+example, the user may wish to constrain cpuid to emulate older hardware,
+or for feature consistency across a cluster).
+
+Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'.  If the number of entries is too low to describe the cpu
+capabilities, an error (E2BIG) is returned.  If the number is too high,
+the 'nent' field is adjusted and an error (ENOMEM) is returned.  If the
+number is just right, the 'nent' field is adjusted to the number of valid
+entries in the 'entries' array, which is then filled.
+
+The entries returned are the host cpuid as returned by the cpuid instruction,
+with unknown or unsupported features masked out.  The fields in each entry
+are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+  eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
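
The E2BIG retry protocol above implies a short loop in userspace. Below is a
minimal sketch, not part of this patch: it assumes a descriptor opened from
/dev/kvm, the helper name get_supported_cpuid is illustrative, and error
handling is trimmed for brevity.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
{
        int nent = 8;
        struct kvm_cpuid2 *cpuid;

        for (;;) {
                /* kvm_cpuid2 carries a variable-size entries[] array */
                cpuid = calloc(1, sizeof(*cpuid) +
                                  nent * sizeof(struct kvm_cpuid_entry2));
                cpuid->nent = nent;
                if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
                        return cpuid;   /* nent now holds the valid count */
                if (errno != E2BIG) {   /* ENOMEM etc.: give up */
                        free(cpuid);
                        return NULL;
                }
                free(cpuid);            /* E2BIG: array too small, grow */
                nent *= 2;
        }
}

int main(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        struct kvm_cpuid2 *cpuid = get_supported_cpuid(kvm_fd);
        unsigned int i;

        for (i = 0; cpuid && i < cpuid->nent; i++)
                printf("func %#x idx %#x eax=%#x ebx=%#x ecx=%#x edx=%#x\n",
                       cpuid->entries[i].function, cpuid->entries[i].index,
                       cpuid->entries[i].eax, cpuid->entries[i].ebx,
                       cpuid->entries[i].ecx, cpuid->entries[i].edx);
        free(cpuid);
        return 0;
}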
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index aaed6ab9d7ab..142cc5136650 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -77,10 +77,10 @@ Memory
 
 Guest memory (gpa) is part of the user address space of the process that is
 using kvm.  Userspace defines the translation between guest addresses and user
-addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+addresses (gpa->hva); note that two gpas may alias to the same hva, but not
 vice versa.
 
-These gvas may be backed using any method available to the host: anonymous
+These hvas may be backed using any method available to the host: anonymous
 memory, file backed memory, and device memory.  Memory might be paged by the
 host at any time.
 
@@ -161,7 +161,7 @@ Shadow pages contain the following information:
   role.cr4_pae:
     Contains the value of cr4.pae for which the page is valid (e.g. whether
     32-bit or 64-bit gptes are in use).
-  role.cr4_nxe:
+  role.nxe:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
@@ -180,7 +180,9 @@ Shadow pages contain the following information:
     guest pages as leaves.
   gfns:
     An array of 512 guest frame numbers, one for each present pte.  Used to
-    perform a reverse map from a pte to a gfn.
+    perform a reverse map from a pte to a gfn.  When role.direct is set, any
+    element of this array can be calculated from the gfn field when used; in
+    this case, the array of gfns is not allocated.  See role.direct and gfn.
   slot_bitmap:
     A bitmap containing one bit per memory slot.  If the page contains a pte
     mapping a page from memory slot n, then bit n of slot_bitmap will be set
@@ -296,6 +298,48 @@ Host translation updates:
   - look up affected sptes through reverse map
   - drop (or update) translations
 
+Emulating cr0.wp
+================
+
+If tdp is not enabled, the host must keep cr0.wp=1 so page write protection
+works for the guest kernel, not guest userspace.  When the guest
+cr0.wp=1, this does not present a problem.  However when the guest cr0.wp=0,
+we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the
+semantics require allowing any guest kernel access plus user read access).
+
+We handle this by mapping the permissions to two possible sptes, depending
+on fault type:
+
+- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access,
+  disallows user access)
+- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel
+  write access)
+
+(user write faults generate a #PF)
+
+Large pages
+===========
+
+The mmu supports all combinations of large and small guest and host pages.
+Supported page sizes include 4k, 2M, 4M, and 1G.  4M pages are treated as
+two separate 2M pages, on both guest and host, since the mmu always uses PAE
+paging.
+
+To instantiate a large spte, four constraints must be satisfied:
+
+- the spte must point to a large host page
+- the guest pte must be a large pte of at least equivalent size (if tdp is
+  enabled, there is no guest pte and this condition is satisfied)
+- if the spte will be writeable, the large page frame may not overlap any
+  write-protected pages
+- the guest page must be wholly contained by a single memory slot
+
+To check the last two conditions, the mmu maintains a ->write_count set of
+arrays for each memory slot and large page size.  Every write-protected page
+causes its write_count to be incremented, thus preventing instantiation of
+a large spte.  The frames at the end of an unaligned memory slot have
+artificially inflated ->write_counts so they can never be instantiated.
+
 Further reading
 ===============
 
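
The cr0.wp=0 permission split described above reduces to a two-way decision on
fault type. A rough sketch of that mapping only, with illustrative names
(spte_bits, map_cr0_wp0) that do not appear in the kernel source:

/* Shadow pte permission bits, illustrative only. */
struct spte_bits {
        int user;       /* spte.u */
        int write;      /* spte.w */
};

/* Map a gpte.u=1, gpte.w=0 page while the guest runs with cr0.wp=0. */
static struct spte_bits map_cr0_wp0(int fault_is_write, int fault_is_user)
{
        struct spte_bits s;

        if (fault_is_write && !fault_is_user) {
                /* kernel write fault: full kernel access, no user access */
                s.user = 0;
                s.write = 1;
        } else {
                /* read fault: full read access, no kernel write access */
                s.user = 1;
                s.write = 0;
        }
        /* user write faults are reflected back to the guest as a #PF */
        return s;
}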
diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
new file mode 100644
index 000000000000..8ddcfe84c09a
--- /dev/null
+++ b/Documentation/kvm/msr.txt
@@ -0,0 +1,153 @@
+KVM-specific MSRs.
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+KVM makes use of some custom MSRs to service some requests.
+At present, this facility is only used by kvmclock.
+
+Custom MSRs have a range reserved for them, from 0x4b564d00
+to 0x4b564dff.  There are MSRs outside this area, but they are
+deprecated and their use is discouraged.
+
+Custom MSR list
+---------------
+
+The current supported Custom MSR list is:
+
+MSR_KVM_WALL_CLOCK_NEW:   0x4b564d00
+
+        data: 4-byte aligned physical address of a memory area which must be
+        in guest RAM.  This memory is expected to hold a copy of the following
+        structure:
+
+        struct pvclock_wall_clock {
+                u32   version;
+                u32   sec;
+                u32   nsec;
+        } __attribute__((__packed__));
+
+        whose data will be filled in by the hypervisor.  The hypervisor is only
+        guaranteed to update this data at the moment of MSR write.
+        Users that want to reliably query this information more than once have
+        to write more than once to this MSR.  Fields have the following meanings:
+
+                version: guest has to check version before and after grabbing
+                time information and check that they are both equal and even.
+                An odd version indicates an in-progress update.
+
+                sec: number of seconds for wallclock.
+
+                nsec: number of nanoseconds for wallclock.
+
+        Note that although MSRs are per-CPU entities, the effect of this
+        particular MSR is global.
+
+        Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid
+        leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
+
+        data: 4-byte aligned physical address of a memory area which must be in
+        guest RAM, plus an enable bit in bit 0.  This memory is expected to hold
+        a copy of the following structure:
+
+        struct pvclock_vcpu_time_info {
+                u32   version;
+                u32   pad0;
+                u64   tsc_timestamp;
+                u64   system_time;
+                u32   tsc_to_system_mul;
+                s8    tsc_shift;
+                u8    flags;
+                u8    pad[2];
+        } __attribute__((__packed__)); /* 32 bytes */
+
+        whose data will be filled in by the hypervisor periodically.  Only one
+        write, or registration, is needed for each VCPU.  The interval between
+        updates of this structure is arbitrary and implementation-dependent.
+        The hypervisor may update this structure at any time it sees fit until
+        a value with bit 0 == 0 is written to it.
+
+        Fields have the following meanings:
+
+                version: guest has to check version before and after grabbing
+                time information and check that they are both equal and even.
+                An odd version indicates an in-progress update.
+
+                tsc_timestamp: the tsc value at the current VCPU at the time
+                of the update of this structure.  Guests can subtract this value
+                from current tsc to derive a notion of elapsed time since the
+                structure update.
+
+                system_time: a host notion of monotonic time, including sleep
+                time at the time this structure was last updated.  Unit is
+                nanoseconds.
+
+                tsc_to_system_mul: a function of the tsc frequency.  One has
+                to multiply any tsc-related quantity by this value to get
+                a value in nanoseconds, besides dividing by 2^tsc_shift.
+
+                tsc_shift: cycle to nanosecond divider, as a power of two, to
+                allow for shift rights.  One has to shift right any tsc-related
+                quantity by this value to get a value in nanoseconds, besides
+                multiplying by tsc_to_system_mul.
+
+                With this information, guests can derive per-CPU time by
+                doing:
+
+                        time = (current_tsc - tsc_timestamp)
+                        time = (time * tsc_to_system_mul) >> tsc_shift
+                        time = time + system_time
+
+                flags: bits in this field indicate extended capabilities
+                coordinated between the guest and the hypervisor.  Availability
+                of specific flags has to be checked in the 0x40000001 cpuid leaf.
+                Current flags are:
+
+                         flag bit   | cpuid bit | meaning
+                        ------------------------------------------------------
+                                    |           | time measures taken across
+                             0      |    24     | multiple cpus are guaranteed
+                                    |           | to be monotonic
+                        ------------------------------------------------------
+
+        Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid
+        leaf prior to usage.
+
+
+MSR_KVM_WALL_CLOCK:  0x11
+
+        data and functioning: same as MSR_KVM_WALL_CLOCK_NEW.  Use that instead.
+
+        This MSR falls outside the reserved KVM range and may be removed in the
+        future.  Its usage is deprecated.
+
+        Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid
+        leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME: 0x12
+
+        data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW.  Use that instead.
+
+        This MSR falls outside the reserved KVM range and may be removed in the
+        future.  Its usage is deprecated.
+
+        Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid
+        leaf prior to usage.
+
+        The suggested algorithm for detecting kvmclock presence is then:
+
+                if (!kvm_para_available())    /* refer to cpuid.txt */
+                        return NON_PRESENT;
+
+                flags = cpuid_eax(0x40000001);
+                if (flags & (1 << 3)) {
+                        msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+                        msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+                        return PRESENT;
+                } else if (flags & (1 << 0)) {
+                        msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+                        msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+                        return PRESENT;
+                } else
+                        return NON_PRESENT;
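
The version protocol and time formula above amount to a seqcount-style read on
the guest side. A minimal sketch follows; it assumes a registered, mapped
pvclock_vcpu_time_info, a non-negative tsc_shift, and ignores 64-bit multiply
overflow. The names pvclock_read_ns and rdtsc_sketch are illustrative, not
part of the patch.

#include <stdint.h>

struct pvclock_vcpu_time_info {
        uint32_t version;
        uint32_t pad0;
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
        uint8_t  flags;
        uint8_t  pad[2];
} __attribute__((__packed__));

static inline uint64_t rdtsc_sketch(void)
{
        uint32_t lo, hi;

        __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
}

/* Retry while the hypervisor has an update in flight (odd version). */
uint64_t pvclock_read_ns(volatile struct pvclock_vcpu_time_info *ti)
{
        uint32_t version;
        uint64_t delta, ns;

        do {
                version = ti->version;
                delta = rdtsc_sketch() - ti->tsc_timestamp;
                ns = (delta * ti->tsc_to_system_mul) >> ti->tsc_shift;
                ns += ti->system_time;
        } while ((version & 1) || version != ti->version);

        return ns;
}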
diff --git a/Documentation/kvm/review-checklist.txt b/Documentation/kvm/review-checklist.txt
new file mode 100644
index 000000000000..730475ae1b8d
--- /dev/null
+++ b/Documentation/kvm/review-checklist.txt
@@ -0,0 +1,38 @@
+Review checklist for kvm patches
+================================
+
+1.  The patch must follow Documentation/CodingStyle and
+    Documentation/SubmittingPatches.
+
+2.  Patches should be against kvm.git master branch.
+
+3.  If the patch introduces a new userspace API or modifies an existing one:
+    - the API must be documented in Documentation/kvm/api.txt
+    - the API must be discoverable using KVM_CHECK_EXTENSION
+
+4.  New state must include support for save/restore.
+
+5.  New features must default to off (userspace should explicitly request them).
+    Performance improvements can and should default to on.
+
+6.  New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2.
+
+7.  Emulator changes should be accompanied by unit tests in the qemu-kvm.git
+    kvm/test directory.
+
+8.  Changes should be vendor-neutral when possible.  Changes to common code
+    are better than duplicating changes to vendor code.
+
+9.  Similarly, prefer changes to arch-independent code over arch-dependent
+    code.
+
+10. User/kernel interfaces and guest/host interfaces must be 64-bit clean
+    (all variables and sizes naturally aligned on 64-bit; use specific types
+    only - u64 rather than ulong).
+
+11. New guest visible features must either be documented in a hardware manual
+    or be accompanied by documentation.
+
+12. Features must be robust against reset and kexec - for example, shared
+    host/guest memory must be unshared to prevent the host from writing to
+    guest memory that the guest has not reserved for this purpose.
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index a362e67e0ca6..2f229e5de498 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,6 +235,7 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G           32
 #define KVM_REQ_RESUME          33
 
+#define KVM_HPAGE_GFN_SHIFT(x)  0
 #define KVM_NR_PAGE_SIZES       1
 #define KVM_PAGES_PER_HPAGE(x)  1
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 21b701374f72..5cb58655cd5f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -725,8 +725,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int r;
        sigset_t sigsaved;
 
-       vcpu_load(vcpu);
-
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -748,7 +746,6 @@ out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-       vcpu_put(vcpu);
        return r;
 }
 
@@ -883,8 +880,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
        int i;
 
-       vcpu_load(vcpu);
-
        for (i = 0; i < 16; i++) {
                vpd->vgr[i] = regs->vpd.vgr[i];
                vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -931,8 +926,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
        set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1802,35 +1795,24 @@ void kvm_arch_exit(void)
        kvm_vmm_info = NULL;
 }
 
-static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
-               struct kvm_dirty_log *log)
+static void kvm_ia64_sync_dirty_log(struct kvm *kvm,
+               struct kvm_memory_slot *memslot)
 {
-       struct kvm_memory_slot *memslot;
-       int r, i;
+       int i;
        long base;
        unsigned long n;
        unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
                        offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
-       r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-
-       memslot = &kvm->memslots->memslots[log->slot];
-       r = -ENOENT;
-       if (!memslot->dirty_bitmap)
-               goto out;
-
        n = kvm_dirty_bitmap_bytes(memslot);
        base = memslot->base_gfn / BITS_PER_LONG;
 
+       spin_lock(&kvm->arch.dirty_log_lock);
        for (i = 0; i < n/sizeof(long); ++i) {
                memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
                dirty_bitmap[base + i] = 0;
        }
-       r = 0;
-out:
-       return r;
+       spin_unlock(&kvm->arch.dirty_log_lock);
 }
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
@@ -1842,12 +1824,17 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        int is_dirty = 0;
 
        mutex_lock(&kvm->slots_lock);
-       spin_lock(&kvm->arch.dirty_log_lock);
 
-       r = kvm_ia64_sync_dirty_log(kvm, log);
-       if (r)
+       r = -EINVAL;
+       if (log->slot >= KVM_MEMORY_SLOTS)
+               goto out;
+
+       memslot = &kvm->memslots->memslots[log->slot];
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
                goto out;
 
+       kvm_ia64_sync_dirty_log(kvm, memslot);
        r = kvm_get_dirty_log(kvm, log, &is_dirty);
        if (r)
                goto out;
@@ -1855,14 +1842,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
                kvm_flush_remote_tlbs(kvm);
-               memslot = &kvm->memslots->memslots[log->slot];
                n = kvm_dirty_bitmap_bytes(memslot);
                memset(memslot->dirty_bitmap, 0, n);
        }
        r = 0;
 out:
        mutex_unlock(&kvm->slots_lock);
-       spin_unlock(&kvm->arch.dirty_log_lock);
        return r;
 }
 
@@ -1953,11 +1938,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
        return vcpu->arch.timer_fired;
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
@@ -1967,9 +1947,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                struct kvm_mp_state *mp_state)
 {
-       vcpu_load(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
-       vcpu_put(vcpu);
        return 0;
 }
 
@@ -2000,10 +1978,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
        int r = 0;
 
-       vcpu_load(vcpu);
        vcpu->arch.mp_state = mp_state->mp_state;
        if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
                r = vcpu_reset(vcpu);
-       vcpu_put(vcpu);
        return r;
 }
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 6f74d93725a0..8274a2d43925 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -115,7 +115,15 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
-extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
+
+extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern int kvmppc_mmu_hpte_sysinit(void);
+extern void kvmppc_mmu_hpte_sysexit(void);
+
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
diff --git a/arch/powerpc/include/asm/kvm_fpu.h b/arch/powerpc/include/asm/kvm_fpu.h
index 94f05de9ad04..c3d4f0518a67 100644
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -22,24 +22,24 @@
 
 #include <linux/types.h>
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0c9ad869decd..b0b23c007d6e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,10 +35,17 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
+#define KVM_HPAGE_GFN_SHIFT(x)  0
 #define KVM_NR_PAGE_SIZES       1
 #define KVM_PAGES_PER_HPAGE(x)  (1UL<<31)
 
-#define HPTEG_CACHE_NUM 1024
+#define HPTEG_CACHE_NUM                 (1 << 15)
+#define HPTEG_HASH_BITS_PTE             13
+#define HPTEG_HASH_BITS_VPTE            13
+#define HPTEG_HASH_BITS_VPTE_LONG       5
+#define HPTEG_HASH_NUM_PTE              (1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_VPTE             (1 << HPTEG_HASH_BITS_VPTE)
+#define HPTEG_HASH_NUM_VPTE_LONG        (1 << HPTEG_HASH_BITS_VPTE_LONG)
 
 struct kvm;
 struct kvm_run;
@@ -151,6 +158,9 @@ struct kvmppc_mmu {
 };
 
 struct hpte_cache {
+       struct hlist_node list_pte;
+       struct hlist_node list_vpte;
+       struct hlist_node list_vpte_long;
        u64 host_va;
        u64 pfn;
        ulong slot;
@@ -282,8 +292,10 @@ struct kvm_vcpu_arch {
        unsigned long pending_exceptions;
 
 #ifdef CONFIG_PPC_BOOK3S
-       struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
-       int hpte_cache_offset;
+       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+       int hpte_cache_count;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 3b4dcc82a4c1..ab3e392ac63c 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifdef CONFIG_PPC_FPU
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 812312542e50..9b9b5cdea840 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
        gfn = gpaddr >> PAGE_SHIFT;
        new_page = gfn_to_page(vcpu->kvm, gfn);
        if (is_error_page(new_page)) {
-               printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+               printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
+                      (unsigned long long)gfn);
                kvm_release_page_clean(new_page);
                return;
        }
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ff436066bf77..d45c818a384c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -45,6 +45,7 @@ kvm-book3s_64-objs := \
        book3s.o \
        book3s_emulate.o \
        book3s_interrupts.o \
+       book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
@@ -57,6 +58,7 @@ kvm-book3s_32-objs := \
        book3s.o \
        book3s_emulate.o \
        book3s_interrupts.o \
+       book3s_mmu_hpte.o \
        book3s_32_mmu_host.o \
        book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf1a63d..a3cef30d1d42 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        regs->pc = kvmppc_get_pc(vcpu);
        regs->cr = kvmppc_get_cr(vcpu);
        regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        kvmppc_set_pc(vcpu, regs->pc);
        kvmppc_set_cr(vcpu, regs->cr);
        kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
 
-       vcpu_load(vcpu);
-
        sregs->pvr = vcpu->arch.pvr;
 
        sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                }
        }
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
 
-       vcpu_load(vcpu);
-
        kvmppc_set_pvr(vcpu, sregs->pvr);
 
        vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        /* Flush the MMU after messing with the segments */
        kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1309,12 +1293,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        int ret;
-       struct thread_struct ext_bkp;
+       double fpr[32][TS_FPRWIDTH];
+       unsigned int fpscr;
+       int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
-       bool save_vec = current->thread.used_vr;
+       vector128 vr[32];
+       vector128 vscr;
+       unsigned long uninitialized_var(vrsave);
+       int used_vr;
 #endif
 #ifdef CONFIG_VSX
-       bool save_vsx = current->thread.used_vsr;
+       int used_vsr;
 #endif
        ulong ext_msr;
 
@@ -1327,27 +1316,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        /* Save FPU state in stack */
        if (current->thread.regs->msr & MSR_FP)
                giveup_fpu(current);
-       memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
-       ext_bkp.fpscr = current->thread.fpscr;
-       ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+       memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+       fpscr = current->thread.fpscr.val;
+       fpexc_mode = current->thread.fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
        /* Save Altivec state in stack */
-       if (save_vec) {
+       used_vr = current->thread.used_vr;
+       if (used_vr) {
                if (current->thread.regs->msr & MSR_VEC)
                        giveup_altivec(current);
-               memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
-               ext_bkp.vscr = current->thread.vscr;
-               ext_bkp.vrsave = current->thread.vrsave;
+               memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+               vscr = current->thread.vscr;
+               vrsave = current->thread.vrsave;
        }
-       ext_bkp.used_vr = current->thread.used_vr;
 #endif
 
 #ifdef CONFIG_VSX
        /* Save VSX state in stack */
-       if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+       used_vsr = current->thread.used_vsr;
+       if (used_vsr && (current->thread.regs->msr & MSR_VSX))
                __giveup_vsx(current);
-       ext_bkp.used_vsr = current->thread.used_vsr;
 #endif
 
        /* Remember the MSR with disabled extensions */
@@ -1372,22 +1361,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                kvmppc_giveup_ext(vcpu, MSR_VSX);
 
        /* Restore FPU state from stack */
-       memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
-       current->thread.fpscr = ext_bkp.fpscr;
-       current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+       memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+       current->thread.fpscr.val = fpscr;
+       current->thread.fpexc_mode = fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
        /* Restore Altivec state from stack */
-       if (save_vec && current->thread.used_vr) {
-               memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
-               current->thread.vscr = ext_bkp.vscr;
-               current->thread.vrsave= ext_bkp.vrsave;
+       if (used_vr && current->thread.used_vr) {
+               memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+               current->thread.vscr = vscr;
+               current->thread.vrsave = vrsave;
        }
-       current->thread.used_vr = ext_bkp.used_vr;
+       current->thread.used_vr = used_vr;
 #endif
 
 #ifdef CONFIG_VSX
-       current->thread.used_vsr = ext_bkp.used_vsr;
+       current->thread.used_vsr = used_vsr;
 #endif
 
        return ret;
@@ -1395,12 +1384,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 static int kvmppc_book3s_init(void)
 {
-       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-                       THIS_MODULE);
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+                    THIS_MODULE);
+
+       if (r)
+               return r;
+
+       r = kvmppc_mmu_hpte_sysinit();
+
+       return r;
 }
 
 static void kvmppc_book3s_exit(void)
 {
+       kvmppc_mmu_hpte_sysexit();
        kvm_exit();
 }
 
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 0b10503c8a4a..3292d76101d2 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -354,10 +354,10 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                *vsid = VSID_REAL_DR | gvsid;
                break;
        case MSR_DR|MSR_IR:
-               if (!sr->valid)
-                       return -1;
-
-               *vsid = sr->vsid;
+               if (sr->valid)
+                       *vsid = sr->vsid;
+               else
+                       *vsid = VSID_BAT | gvsid;
                break;
        default:
                BUG();
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0bb66005338f..0b51ef872c1e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -57,139 +58,26 @@
 static ulong htab;
 static u32 htabmask;
 
-static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        volatile u32 *pteg;
 
-       dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n",
-                   pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
+       /* Remove from host HTAB */
        pteg = (u32*)pte->slot;
-
        pteg[0] = 0;
+
+       /* And make sure it's gone from the TLB too */
        asm volatile ("sync");
        asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
        asm volatile ("sync");
        asm volatile ("tlbsync");
-
-       pte->host_va = 0;
-
-       if (pte->pte.may_write)
-               kvm_release_pfn_dirty(pte->pfn);
-       else
-               kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
-                   vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_ea &= ea_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-
-       /* Doing a complete flush -> start from scratch */
-       if (!ea_mask)
-               vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-                   vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_vp &= vp_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.vpage & vp_mask) == guest_vp) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
-                   vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.raddr >= pa_start) &&
-                   (pte->pte.raddr < pa_end)) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-}
-
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-       int i;
-       u64 guest_vp;
-
-       guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-       for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if (pte->pte.vpage == guest_vp)
-                       return &pte->pte;
-       }
-
-       return NULL;
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-               kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-       return vcpu->arch.hpte_cache_offset++;
 }
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-       return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+       return hash_64(gvsid, SID_MAP_BITS);
 }
 
 
@@ -256,7 +144,6 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
        register int rr = 0;
        bool primary = false;
        bool evict = false;
-       int hpte_id;
        struct hpte_cache *pte;
 
        /* Get host physical address for gpa */
@@ -341,8 +228,7 @@ next_pteg:
 
        /* Now tell our Shadow PTE code about the new page */
 
-       hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-       pte = &vcpu->arch.hpte_cache[hpte_id];
+       pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
        dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
                    orig_pte->may_write ? 'w' : '-',
@@ -355,6 +241,8 @@ next_pteg:
        pte->pte = *orig_pte;
        pte->pfn = hpaddr >> PAGE_SHIFT;
 
+       kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
        return 0;
 }
 
@@ -439,7 +327,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+       kvmppc_mmu_hpte_destroy(vcpu);
        preempt_disable();
        __destroy_context(to_book3s(vcpu)->context_id);
        preempt_enable();
@@ -479,5 +367,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
479 htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0; 367 htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
480 htab = (ulong)__va(sdr1 & 0xffff0000); 368 htab = (ulong)__va(sdr1 & 0xffff0000);
481 369
370 kvmppc_mmu_hpte_init(vcpu);
371
482 return 0; 372 return 0;
483} 373}
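
From here on the 32-bit backend delegates shadow-PTE bookkeeping to the new common code: kvmppc_mmu_hpte_init() at MMU setup, kvmppc_mmu_hpte_cache_next() plus kvmppc_mmu_hpte_cache_map() for every new mapping, and kvmppc_mmu_hpte_destroy() at teardown. A stubbed userspace model of that call order (the real implementations appear in book3s_mmu_hpte.c further down in this patch):

    #include <stdio.h>

    struct kvm_vcpu { int id; };
    struct hpte_cache { int used; };

    /* Stubs standing in for the book3s_mmu_hpte.c entry points. */
    static void hpte_init(struct kvm_vcpu *v)    { puts("init lookup hash lists"); }
    static struct hpte_cache *hpte_cache_next(struct kvm_vcpu *v)
    {
            static struct hpte_cache pte;
            puts("allocate cache entry (flush all if full)");
            return &pte;
    }
    static void hpte_cache_map(struct kvm_vcpu *v, struct hpte_cache *p)
    {
            puts("hash entry into the pte/vpte/vpte_long lists");
    }
    static void hpte_destroy(struct kvm_vcpu *v) { puts("flush everything"); }

    int main(void)
    {
            struct kvm_vcpu vcpu = { 0 };
            struct hpte_cache *pte;

            hpte_init(&vcpu);             /* kvmppc_mmu_init() */
            pte = hpte_cache_next(&vcpu); /* kvmppc_mmu_map_page() ... */
            hpte_cache_map(&vcpu, pte);   /* ... after filling pte->pte, pfn, slot */
            hpte_destroy(&vcpu);          /* kvmppc_mmu_destroy() */
            return 0;
    }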
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index e4b5744977f6..384179a5002b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <linux/hash.h>
23 24
24#include <asm/kvm_ppc.h> 25#include <asm/kvm_ppc.h>
25#include <asm/kvm_book3s.h> 26#include <asm/kvm_book3s.h>
@@ -46,135 +47,20 @@
46#define dprintk_slb(a, ...) do { } while(0) 47#define dprintk_slb(a, ...) do { } while(0)
47#endif 48#endif
48 49
49static void invalidate_pte(struct hpte_cache *pte) 50void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
50{ 51{
51 dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
52 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
53
54 ppc_md.hpte_invalidate(pte->slot, pte->host_va, 52 ppc_md.hpte_invalidate(pte->slot, pte->host_va,
55 MMU_PAGE_4K, MMU_SEGSIZE_256M, 53 MMU_PAGE_4K, MMU_SEGSIZE_256M,
56 false); 54 false);
57 pte->host_va = 0;
58
59 if (pte->pte.may_write)
60 kvm_release_pfn_dirty(pte->pfn);
61 else
62 kvm_release_pfn_clean(pte->pfn);
63}
64
65void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
66{
67 int i;
68
69 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
70 vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
71 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
72
73 guest_ea &= ea_mask;
74 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
75 struct hpte_cache *pte;
76
77 pte = &vcpu->arch.hpte_cache[i];
78 if (!pte->host_va)
79 continue;
80
81 if ((pte->pte.eaddr & ea_mask) == guest_ea) {
82 invalidate_pte(pte);
83 }
84 }
85
86 /* Doing a complete flush -> start from scratch */
87 if (!ea_mask)
88 vcpu->arch.hpte_cache_offset = 0;
89}
90
91void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
92{
93 int i;
94
95 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
96 vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
97 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
98
99 guest_vp &= vp_mask;
100 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
101 struct hpte_cache *pte;
102
103 pte = &vcpu->arch.hpte_cache[i];
104 if (!pte->host_va)
105 continue;
106
107 if ((pte->pte.vpage & vp_mask) == guest_vp) {
108 invalidate_pte(pte);
109 }
110 }
111}
112
113void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
114{
115 int i;
116
117 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
118 vcpu->arch.hpte_cache_offset, pa_start, pa_end);
119 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
120
121 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
122 struct hpte_cache *pte;
123
124 pte = &vcpu->arch.hpte_cache[i];
125 if (!pte->host_va)
126 continue;
127
128 if ((pte->pte.raddr >= pa_start) &&
129 (pte->pte.raddr < pa_end)) {
130 invalidate_pte(pte);
131 }
132 }
133}
134
135struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
136{
137 int i;
138 u64 guest_vp;
139
140 guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
141 for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
142 struct hpte_cache *pte;
143
144 pte = &vcpu->arch.hpte_cache[i];
145 if (!pte->host_va)
146 continue;
147
148 if (pte->pte.vpage == guest_vp)
149 return &pte->pte;
150 }
151
152 return NULL;
153}
154
155static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
156{
157 if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
158 kvmppc_mmu_pte_flush(vcpu, 0, 0);
159
160 return vcpu->arch.hpte_cache_offset++;
161} 55}
162 56
163/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using 57/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
164 * a hash, so we don't waste cycles on looping */ 58 * a hash, so we don't waste cycles on looping */
165static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 59static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
166{ 60{
167 return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ 61 return hash_64(gvsid, SID_MAP_BITS);
168 ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
169 ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
170 ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
171 ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
172 ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
173 ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
174 ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
175} 62}
176 63
177
178static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) 64static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
179{ 65{
180 struct kvmppc_sid_map *map; 66 struct kvmppc_sid_map *map;
@@ -273,8 +159,7 @@ map_again:
273 attempt++; 159 attempt++;
274 goto map_again; 160 goto map_again;
275 } else { 161 } else {
276 int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); 162 struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
277 struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
278 163
279 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n", 164 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
280 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w', 165 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
@@ -292,6 +177,8 @@ map_again:
292 pte->host_va = va; 177 pte->host_va = va;
293 pte->pte = *orig_pte; 178 pte->pte = *orig_pte;
294 pte->pfn = hpaddr >> PAGE_SHIFT; 179 pte->pfn = hpaddr >> PAGE_SHIFT;
180
181 kvmppc_mmu_hpte_cache_map(vcpu, pte);
295 } 182 }
296 183
297 return 0; 184 return 0;
@@ -418,7 +305,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
418 305
419void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 306void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
420{ 307{
421 kvmppc_mmu_pte_flush(vcpu, 0, 0); 308 kvmppc_mmu_hpte_destroy(vcpu);
422 __destroy_context(to_book3s(vcpu)->context_id); 309 __destroy_context(to_book3s(vcpu)->context_id);
423} 310}
424 311
@@ -436,5 +323,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
436 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS; 323 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
437 vcpu3s->vsid_next = vcpu3s->vsid_first; 324 vcpu3s->vsid_next = vcpu3s->vsid_first;
438 325
326 kvmppc_mmu_hpte_init(vcpu);
327
439 return 0; 328 return 0;
440} 329}
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 000000000000..4868d4a7ebc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,277 @@
1/*
2 * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License, version 2, as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21#include <linux/kvm_host.h>
22#include <linux/hash.h>
23#include <linux/slab.h>
24
25#include <asm/kvm_ppc.h>
26#include <asm/kvm_book3s.h>
27#include <asm/machdep.h>
28#include <asm/mmu_context.h>
29#include <asm/hw_irq.h>
30
31#define PTE_SIZE 12
32
33/* #define DEBUG_MMU */
34
35#ifdef DEBUG_MMU
36#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
37#else
38#define dprintk_mmu(a, ...) do { } while(0)
39#endif
40
41static struct kmem_cache *hpte_cache;
42
43static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
44{
45 return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
46}
47
48static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
49{
50 return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
51}
52
53static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
54{
55 return hash_64((vpage & 0xffffff000ULL) >> 12,
56 HPTEG_HASH_BITS_VPTE_LONG);
57}
58
59void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
60{
61 u64 index;
62
63 /* Add to ePTE list */
64 index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
65 hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
66
67 /* Add to vPTE list */
68 index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
69 hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
70
71 /* Add to vPTE_long list */
72 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
73 hlist_add_head(&pte->list_vpte_long,
74 &vcpu->arch.hpte_hash_vpte_long[index]);
75}
76
77static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
78{
79 dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
80 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
81
82 /* Different for 32 and 64 bit */
83 kvmppc_mmu_invalidate_pte(vcpu, pte);
84
85 if (pte->pte.may_write)
86 kvm_release_pfn_dirty(pte->pfn);
87 else
88 kvm_release_pfn_clean(pte->pfn);
89
90 hlist_del(&pte->list_pte);
91 hlist_del(&pte->list_vpte);
92 hlist_del(&pte->list_vpte_long);
93
94 vcpu->arch.hpte_cache_count--;
95 kmem_cache_free(hpte_cache, pte);
96}
97
98static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
99{
100 struct hpte_cache *pte;
101 struct hlist_node *node, *tmp;
102 int i;
103
104 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
105 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
106
107 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
108 invalidate_pte(vcpu, pte);
109 }
110}
111
112static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
113{
114 struct hlist_head *list;
115 struct hlist_node *node, *tmp;
116 struct hpte_cache *pte;
117
118 /* Find the list of entries in the map */
119 list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
120
121 /* Check the list for matching entries and invalidate */
122 hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
123 if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
124 invalidate_pte(vcpu, pte);
125}
126
127void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
128{
129 u64 i;
130
131 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
132 vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
133
134 guest_ea &= ea_mask;
135
136 switch (ea_mask) {
137 case ~0xfffUL:
138 kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
139 break;
140 case 0x0ffff000:
141 /* 32-bit flush w/o segment, go through all possible segments */
142 for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
143 kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
144 break;
145 case 0:
146 /* Doing a complete flush -> start from scratch */
147 kvmppc_mmu_pte_flush_all(vcpu);
148 break;
149 default:
150 WARN_ON(1);
151 break;
152 }
153}
154
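
The flush path now dispatches on the mask instead of scanning a flat array. A userspace model of that dispatch; ea_mask is an unsigned long in the kernel, modeled as u64 here:

    #include <stdint.h>
    #include <stdio.h>

    static void flush(uint64_t guest_ea, uint64_t ea_mask)
    {
            guest_ea &= ea_mask;

            switch (ea_mask) {
            case ~0xfffULL:             /* one page: walk a single ePTE bucket */
                    printf("page flush at 0x%llx\n", (unsigned long long)guest_ea);
                    break;
            case 0x0ffff000:            /* 32-bit flush without segment info */
                    for (uint64_t i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
                            flush(guest_ea | i, ~0xfffULL);
                    break;
            case 0:                     /* complete flush */
                    puts("full flush");
                    break;
            default:                    /* the kernel WARN_ON()s here */
                    puts("unexpected mask");
            }
    }

    int main(void)
    {
            flush(0xc0001234, ~0xfffULL);   /* single page */
            flush(0x00012000, 0x0ffff000);  /* 16 segment-sized retries */
            flush(0, 0);                    /* everything */
            return 0;
    }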
155/* Flush with mask 0xfffffffff */
156static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
157{
158 struct hlist_head *list;
159 struct hlist_node *node, *tmp;
160 struct hpte_cache *pte;
161 u64 vp_mask = 0xfffffffffULL;
162
163 list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
164
165 /* Check the list for matching entries and invalidate */
166 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
167 if ((pte->pte.vpage & vp_mask) == guest_vp)
168 invalidate_pte(vcpu, pte);
169}
170
171/* Flush with mask 0xffffff000 */
172static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
173{
174 struct hlist_head *list;
175 struct hlist_node *node, *tmp;
176 struct hpte_cache *pte;
177 u64 vp_mask = 0xffffff000ULL;
178
179 list = &vcpu->arch.hpte_hash_vpte_long[
180 kvmppc_mmu_hash_vpte_long(guest_vp)];
181
182 /* Check the list for matching entries and invalidate */
183 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
184 if ((pte->pte.vpage & vp_mask) == guest_vp)
185 invalidate_pte(vcpu, pte);
186}
187
188void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
189{
190 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
191 vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
192 guest_vp &= vp_mask;
193
194 switch(vp_mask) {
195 case 0xfffffffffULL:
196 kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
197 break;
198 case 0xffffff000ULL:
199 kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
200 break;
201 default:
202 WARN_ON(1);
203 return;
204 }
205}
206
207void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
208{
209 struct hlist_node *node, *tmp;
210 struct hpte_cache *pte;
211 int i;
212
213 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
214 vcpu->arch.hpte_cache_count, pa_start, pa_end);
215
216 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
217 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
218
219 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
220 if ((pte->pte.raddr >= pa_start) &&
221 (pte->pte.raddr < pa_end))
222 invalidate_pte(vcpu, pte);
223 }
224}
225
226struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
227{
228 struct hpte_cache *pte;
229
230 pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
231 vcpu->arch.hpte_cache_count++;
232
233 if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
234 kvmppc_mmu_pte_flush_all(vcpu);
235
236 return pte;
237}
238
239void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
240{
241 kvmppc_mmu_pte_flush(vcpu, 0, 0);
242}
243
244static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
245{
246 int i;
247
248 for (i = 0; i < len; i++)
249 INIT_HLIST_HEAD(&hash_list[i]);
250}
251
252int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
253{
254 /* init hpte lookup hashes */
255 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
256 ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
257 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
258 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
259 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
260 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
261
262 return 0;
263}
264
265int kvmppc_mmu_hpte_sysinit(void)
266{
267 /* init hpte slab cache */
268 hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
269 sizeof(struct hpte_cache), 0, NULL);
270
271 return 0;
272}
273
274void kvmppc_mmu_hpte_sysexit(void)
275{
276 kmem_cache_destroy(hpte_cache);
277}
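
The three hash helpers at the top of this file give each shadow PTE three bucket indices, one per flush key: effective address, 4k virtual page, and a coarser virtual-page group. A userspace model of just the key and mask math; the HPTEG_HASH_BITS_* widths below are assumptions, the real values live in asm/kvm_host.h:

    #include <stdint.h>
    #include <stdio.h>

    #define PTE_SIZE                  12   /* 4k pages */
    #define HPTEG_HASH_BITS_PTE        8   /* assumed */
    #define HPTEG_HASH_BITS_VPTE       8   /* assumed */
    #define HPTEG_HASH_BITS_VPTE_LONG  5   /* assumed */

    static uint64_t hash_64_model(uint64_t val, unsigned int bits)
    {
            return (val * 0x9E3779B97F4A7C15ULL) >> (64 - bits);
    }

    int main(void)
    {
            uint64_t eaddr = 0xc0001000ULL, vpage = 0x123456789ULL;

            /* one shadow PTE lands in three buckets, one per flush key */
            printf("pte bucket       %llu\n", (unsigned long long)
                   hash_64_model(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE));
            printf("vpte bucket      %llu\n", (unsigned long long)
                   hash_64_model(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE));
            printf("vpte_long bucket %llu\n", (unsigned long long)
                   hash_64_model((vpage & 0xffffff000ULL) >> 12,
                                 HPTEG_HASH_BITS_VPTE_LONG));
            return 0;
    }

Keeping all three lists per entry is what lets the flush routines above walk only the matching bucket instead of every cached PTE.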
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index a9f66abafcb3..474f2e24050a 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -159,10 +159,7 @@
159 159
160static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) 160static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
161{ 161{
162 struct thread_struct t; 162 kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr);
163
164 t.fpscr.val = vcpu->arch.fpscr;
165 cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
166} 163}
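
The conversion helpers now take the live FPSCR by pointer instead of a copy staged in an on-stack thread_struct; with the old pattern, any status bits the FPU raised during the conversion were written into the temporary and then discarded. A toy model of the difference:

    #include <stdint.h>
    #include <stdio.h>

    static void cvt_model(uint64_t *fpscr)
    {
            *fpscr |= 0x1;   /* pretend the conversion set a status bit */
    }

    int main(void)
    {
            uint64_t vcpu_fpscr = 0;
            uint64_t t = vcpu_fpscr;   /* old: cvt_df(..., &t), t then dropped */

            cvt_model(&t);
            printf("old pattern, vcpu fpscr = %llx (bit lost)\n",
                   (unsigned long long)vcpu_fpscr);

            cvt_model(&vcpu_fpscr);    /* new: kvm_cvt_df(..., &vcpu->arch.fpscr) */
            printf("new pattern, vcpu fpscr = %llx\n",
                   (unsigned long long)vcpu_fpscr);
            return 0;
    }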
167 164
168static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) 165static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -183,7 +180,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
183 int rs, ulong addr, int ls_type) 180 int rs, ulong addr, int ls_type)
184{ 181{
185 int emulated = EMULATE_FAIL; 182 int emulated = EMULATE_FAIL;
186 struct thread_struct t;
187 int r; 183 int r;
188 char tmp[8]; 184 char tmp[8];
189 int len = sizeof(u32); 185 int len = sizeof(u32);
@@ -191,8 +187,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
191 if (ls_type == FPU_LS_DOUBLE) 187 if (ls_type == FPU_LS_DOUBLE)
192 len = sizeof(u64); 188 len = sizeof(u64);
193 189
194 t.fpscr.val = vcpu->arch.fpscr;
195
196 /* read from memory */ 190 /* read from memory */
197 r = kvmppc_ld(vcpu, &addr, len, tmp, true); 191 r = kvmppc_ld(vcpu, &addr, len, tmp, true);
198 vcpu->arch.paddr_accessed = addr; 192 vcpu->arch.paddr_accessed = addr;
@@ -210,7 +204,7 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
210 /* put in registers */ 204 /* put in registers */
211 switch (ls_type) { 205 switch (ls_type) {
212 case FPU_LS_SINGLE: 206 case FPU_LS_SINGLE:
213 cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t); 207 kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
214 vcpu->arch.qpr[rs] = *((u32*)tmp); 208 vcpu->arch.qpr[rs] = *((u32*)tmp);
215 break; 209 break;
216 case FPU_LS_DOUBLE: 210 case FPU_LS_DOUBLE:
@@ -229,17 +223,14 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
229 int rs, ulong addr, int ls_type) 223 int rs, ulong addr, int ls_type)
230{ 224{
231 int emulated = EMULATE_FAIL; 225 int emulated = EMULATE_FAIL;
232 struct thread_struct t;
233 int r; 226 int r;
234 char tmp[8]; 227 char tmp[8];
235 u64 val; 228 u64 val;
236 int len; 229 int len;
237 230
238 t.fpscr.val = vcpu->arch.fpscr;
239
240 switch (ls_type) { 231 switch (ls_type) {
241 case FPU_LS_SINGLE: 232 case FPU_LS_SINGLE:
242 cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t); 233 kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr);
243 val = *((u32*)tmp); 234 val = *((u32*)tmp);
244 len = sizeof(u32); 235 len = sizeof(u32);
245 break; 236 break;
@@ -278,13 +269,10 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
278 int rs, ulong addr, bool w, int i) 269 int rs, ulong addr, bool w, int i)
279{ 270{
280 int emulated = EMULATE_FAIL; 271 int emulated = EMULATE_FAIL;
281 struct thread_struct t;
282 int r; 272 int r;
283 float one = 1.0; 273 float one = 1.0;
284 u32 tmp[2]; 274 u32 tmp[2];
285 275
286 t.fpscr.val = vcpu->arch.fpscr;
287
288 /* read from memory */ 276 /* read from memory */
289 if (w) { 277 if (w) {
290 r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true); 278 r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
@@ -308,7 +296,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
308 emulated = EMULATE_DONE; 296 emulated = EMULATE_DONE;
309 297
310 /* put in registers */ 298 /* put in registers */
311 cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t); 299 kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
312 vcpu->arch.qpr[rs] = tmp[1]; 300 vcpu->arch.qpr[rs] = tmp[1];
313 301
314 dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], 302 dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -322,14 +310,11 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
322 int rs, ulong addr, bool w, int i) 310 int rs, ulong addr, bool w, int i)
323{ 311{
324 int emulated = EMULATE_FAIL; 312 int emulated = EMULATE_FAIL;
325 struct thread_struct t;
326 int r; 313 int r;
327 u32 tmp[2]; 314 u32 tmp[2];
328 int len = w ? sizeof(u32) : sizeof(u64); 315 int len = w ? sizeof(u32) : sizeof(u64);
329 316
330 t.fpscr.val = vcpu->arch.fpscr; 317 kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr);
331
332 cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
333 tmp[1] = vcpu->arch.qpr[rs]; 318 tmp[1] = vcpu->arch.qpr[rs];
334 319
335 r = kvmppc_st(vcpu, &addr, len, tmp, true); 320 r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -517,7 +502,7 @@ static int get_d_signext(u32 inst)
517static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, 502static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
518 int reg_out, int reg_in1, int reg_in2, 503 int reg_out, int reg_in1, int reg_in2,
519 int reg_in3, int scalar, 504 int reg_in3, int scalar,
520 void (*func)(struct thread_struct *t, 505 void (*func)(u64 *fpscr,
521 u32 *dst, u32 *src1, 506 u32 *dst, u32 *src1,
522 u32 *src2, u32 *src3)) 507 u32 *src2, u32 *src3))
523{ 508{
@@ -526,27 +511,25 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
526 u32 ps0_out; 511 u32 ps0_out;
527 u32 ps0_in1, ps0_in2, ps0_in3; 512 u32 ps0_in1, ps0_in2, ps0_in3;
528 u32 ps1_in1, ps1_in2, ps1_in3; 513 u32 ps1_in1, ps1_in2, ps1_in3;
529 struct thread_struct t;
530 t.fpscr.val = vcpu->arch.fpscr;
531 514
532 /* RC */ 515 /* RC */
533 WARN_ON(rc); 516 WARN_ON(rc);
534 517
535 /* PS0 */ 518 /* PS0 */
536 cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 519 kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
537 cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 520 kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
538 cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t); 521 kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr);
539 522
540 if (scalar & SCALAR_LOW) 523 if (scalar & SCALAR_LOW)
541 ps0_in2 = qpr[reg_in2]; 524 ps0_in2 = qpr[reg_in2];
542 525
543 func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); 526 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
544 527
545 dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 528 dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
546 ps0_in1, ps0_in2, ps0_in3, ps0_out); 529 ps0_in1, ps0_in2, ps0_in3, ps0_out);
547 530
548 if (!(scalar & SCALAR_NO_PS0)) 531 if (!(scalar & SCALAR_NO_PS0))
549 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 532 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
550 533
551 /* PS1 */ 534 /* PS1 */
552 ps1_in1 = qpr[reg_in1]; 535 ps1_in1 = qpr[reg_in1];
@@ -557,7 +540,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
557 ps1_in2 = ps0_in2; 540 ps1_in2 = ps0_in2;
558 541
559 if (!(scalar & SCALAR_NO_PS1)) 542 if (!(scalar & SCALAR_NO_PS1))
560 func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); 543 func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
561 544
562 dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 545 dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
563 ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); 546 ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -568,7 +551,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
568static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, 551static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
569 int reg_out, int reg_in1, int reg_in2, 552 int reg_out, int reg_in1, int reg_in2,
570 int scalar, 553 int scalar,
571 void (*func)(struct thread_struct *t, 554 void (*func)(u64 *fpscr,
572 u32 *dst, u32 *src1, 555 u32 *dst, u32 *src1,
573 u32 *src2)) 556 u32 *src2))
574{ 557{
@@ -578,27 +561,25 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
578 u32 ps0_in1, ps0_in2; 561 u32 ps0_in1, ps0_in2;
579 u32 ps1_out; 562 u32 ps1_out;
580 u32 ps1_in1, ps1_in2; 563 u32 ps1_in1, ps1_in2;
581 struct thread_struct t;
582 t.fpscr.val = vcpu->arch.fpscr;
583 564
584 /* RC */ 565 /* RC */
585 WARN_ON(rc); 566 WARN_ON(rc);
586 567
587 /* PS0 */ 568 /* PS0 */
588 cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 569 kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
589 570
590 if (scalar & SCALAR_LOW) 571 if (scalar & SCALAR_LOW)
591 ps0_in2 = qpr[reg_in2]; 572 ps0_in2 = qpr[reg_in2];
592 else 573 else
593 cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 574 kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
594 575
595 func(&t, &ps0_out, &ps0_in1, &ps0_in2); 576 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
596 577
597 if (!(scalar & SCALAR_NO_PS0)) { 578 if (!(scalar & SCALAR_NO_PS0)) {
598 dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", 579 dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
599 ps0_in1, ps0_in2, ps0_out); 580 ps0_in1, ps0_in2, ps0_out);
600 581
601 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 582 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
602 } 583 }
603 584
604 /* PS1 */ 585 /* PS1 */
@@ -608,7 +589,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
608 if (scalar & SCALAR_HIGH) 589 if (scalar & SCALAR_HIGH)
609 ps1_in2 = ps0_in2; 590 ps1_in2 = ps0_in2;
610 591
611 func(&t, &ps1_out, &ps1_in1, &ps1_in2); 592 func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
612 593
613 if (!(scalar & SCALAR_NO_PS1)) { 594 if (!(scalar & SCALAR_NO_PS1)) {
614 qpr[reg_out] = ps1_out; 595 qpr[reg_out] = ps1_out;
@@ -622,31 +603,29 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
622 603
623static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, 604static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
624 int reg_out, int reg_in, 605 int reg_out, int reg_in,
625 void (*func)(struct thread_struct *t, 606 void (*func)(u64 *t,
626 u32 *dst, u32 *src1)) 607 u32 *dst, u32 *src1))
627{ 608{
628 u32 *qpr = vcpu->arch.qpr; 609 u32 *qpr = vcpu->arch.qpr;
629 u64 *fpr = vcpu->arch.fpr; 610 u64 *fpr = vcpu->arch.fpr;
630 u32 ps0_out, ps0_in; 611 u32 ps0_out, ps0_in;
631 u32 ps1_in; 612 u32 ps1_in;
632 struct thread_struct t;
633 t.fpscr.val = vcpu->arch.fpscr;
634 613
635 /* RC */ 614 /* RC */
636 WARN_ON(rc); 615 WARN_ON(rc);
637 616
638 /* PS0 */ 617 /* PS0 */
639 cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t); 618 kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr);
640 func(&t, &ps0_out, &ps0_in); 619 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
641 620
642 dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", 621 dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
643 ps0_in, ps0_out); 622 ps0_in, ps0_out);
644 623
645 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 624 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
646 625
647 /* PS1 */ 626 /* PS1 */
648 ps1_in = qpr[reg_in]; 627 ps1_in = qpr[reg_in];
649 func(&t, &qpr[reg_out], &ps1_in); 628 func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
650 629
651 dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", 630 dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
652 ps1_in, qpr[reg_out]); 631 ps1_in, qpr[reg_out]);
@@ -672,13 +651,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
672 651
673 bool rcomp = (inst & 1) ? true : false; 652 bool rcomp = (inst & 1) ? true : false;
674 u32 cr = kvmppc_get_cr(vcpu); 653 u32 cr = kvmppc_get_cr(vcpu);
675 struct thread_struct t;
676#ifdef DEBUG 654#ifdef DEBUG
677 int i; 655 int i;
678#endif 656#endif
679 657
680 t.fpscr.val = vcpu->arch.fpscr;
681
682 if (!kvmppc_inst_is_paired_single(vcpu, inst)) 658 if (!kvmppc_inst_is_paired_single(vcpu, inst))
683 return EMULATE_FAIL; 659 return EMULATE_FAIL;
684 660
@@ -695,7 +671,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
695#ifdef DEBUG 671#ifdef DEBUG
696 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 672 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
697 u32 f; 673 u32 f;
698 cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 674 kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
699 dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", 675 dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n",
700 i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); 676 i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
701 } 677 }
@@ -819,8 +795,9 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
819 WARN_ON(rcomp); 795 WARN_ON(rcomp);
820 vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; 796 vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
821 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 797 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
822 cvt_df((double*)&vcpu->arch.fpr[ax_rb], 798 kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
823 (float*)&vcpu->arch.qpr[ax_rd], &t); 799 &vcpu->arch.qpr[ax_rd],
800 &vcpu->arch.fpscr);
824 break; 801 break;
825 case OP_4X_PS_MERGE01: 802 case OP_4X_PS_MERGE01:
826 WARN_ON(rcomp); 803 WARN_ON(rcomp);
@@ -830,17 +807,20 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
830 case OP_4X_PS_MERGE10: 807 case OP_4X_PS_MERGE10:
831 WARN_ON(rcomp); 808 WARN_ON(rcomp);
832 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 809 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
833 cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 810 kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
834 (double*)&vcpu->arch.fpr[ax_rd], &t); 811 &vcpu->arch.fpr[ax_rd],
812 &vcpu->arch.fpscr);
835 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 813 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
836 cvt_df((double*)&vcpu->arch.fpr[ax_rb], 814 kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
837 (float*)&vcpu->arch.qpr[ax_rd], &t); 815 &vcpu->arch.qpr[ax_rd],
816 &vcpu->arch.fpscr);
838 break; 817 break;
839 case OP_4X_PS_MERGE11: 818 case OP_4X_PS_MERGE11:
840 WARN_ON(rcomp); 819 WARN_ON(rcomp);
841 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 820 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
842 cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 821 kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
843 (double*)&vcpu->arch.fpr[ax_rd], &t); 822 &vcpu->arch.fpr[ax_rd],
823 &vcpu->arch.fpscr);
844 vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; 824 vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
845 break; 825 break;
846 } 826 }
@@ -1275,7 +1255,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
1275#ifdef DEBUG 1255#ifdef DEBUG
1276 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 1256 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
1277 u32 f; 1257 u32 f;
1278 cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 1258 kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
1279 dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); 1259 dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
1280 } 1260 }
1281#endif 1261#endif
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a33ab8cc2ccc..8d4e35f5372c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -144,7 +144,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
144 unsigned int priority) 144 unsigned int priority)
145{ 145{
146 int allowed = 0; 146 int allowed = 0;
147 ulong msr_mask; 147 ulong uninitialized_var(msr_mask);
148 bool update_esr = false, update_dear = false; 148 bool update_esr = false, update_dear = false;
149 149
150 switch (priority) { 150 switch (priority) {
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
485{ 485{
486 int i; 486 int i;
487 487
488 vcpu_load(vcpu);
489
490 regs->pc = vcpu->arch.pc; 488 regs->pc = vcpu->arch.pc;
491 regs->cr = kvmppc_get_cr(vcpu); 489 regs->cr = kvmppc_get_cr(vcpu);
492 regs->ctr = vcpu->arch.ctr; 490 regs->ctr = vcpu->arch.ctr;
@@ -507,8 +505,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
507 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 505 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
508 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 506 regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
509 507
510 vcpu_put(vcpu);
511
512 return 0; 508 return 0;
513} 509}
514 510
@@ -516,8 +512,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
516{ 512{
517 int i; 513 int i;
518 514
519 vcpu_load(vcpu);
520
521 vcpu->arch.pc = regs->pc; 515 vcpu->arch.pc = regs->pc;
522 kvmppc_set_cr(vcpu, regs->cr); 516 kvmppc_set_cr(vcpu, regs->cr);
523 vcpu->arch.ctr = regs->ctr; 517 vcpu->arch.ctr = regs->ctr;
@@ -537,8 +531,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
537 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 531 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
538 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 532 kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
539 533
540 vcpu_put(vcpu);
541
542 return 0; 534 return 0;
543} 535}
544 536
@@ -569,9 +561,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
569{ 561{
570 int r; 562 int r;
571 563
572 vcpu_load(vcpu);
573 r = kvmppc_core_vcpu_translate(vcpu, tr); 564 r = kvmppc_core_vcpu_translate(vcpu, tr);
574 vcpu_put(vcpu);
575 return r; 565 return r;
576} 566}
577 567
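
booke.c (and the s390 code below) can drop the per-handler vcpu_load()/vcpu_put() pairs because, as of this series, the generic vcpu ioctl path appears to take the vcpu lock once around the arch handler. A stubbed sketch of that shape; the names are modeled on virt/kvm/kvm_main.c, not copied from it:

    #include <stdio.h>

    struct kvm_vcpu { int id; };
    struct file { void *private_data; };

    static void vcpu_load(struct kvm_vcpu *v) { printf("load vcpu %d\n", v->id); }
    static void vcpu_put(struct kvm_vcpu *v)  { printf("put vcpu %d\n", v->id); }

    /* Arch handler: runs with the vcpu already loaded, so no load/put. */
    static long arch_vcpu_ioctl(struct kvm_vcpu *v, unsigned int ioctl)
    {
            return 0;
    }

    static long vcpu_ioctl(struct file *filp, unsigned int ioctl)
    {
            struct kvm_vcpu *vcpu = filp->private_data;
            long r;

            vcpu_load(vcpu);            /* taken once for every vcpu ioctl */
            r = arch_vcpu_ioctl(vcpu, ioctl);
            vcpu_put(vcpu);
            return r;
    }

    int main(void)
    {
            struct kvm_vcpu vcpu = { .id = 0 };
            struct file f = { .private_data = &vcpu };

            return (int)vcpu_ioctl(&f, 0);
    }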
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
index 2b340a3eee90..cb34bbe16113 100644
--- a/arch/powerpc/kvm/fpu.S
+++ b/arch/powerpc/kvm/fpu.S
@@ -271,3 +271,21 @@ FPD_THREE_IN(fmsub)
271FPD_THREE_IN(fmadd) 271FPD_THREE_IN(fmadd)
272FPD_THREE_IN(fnmsub) 272FPD_THREE_IN(fnmsub)
273FPD_THREE_IN(fnmadd) 273FPD_THREE_IN(fnmadd)
274
275_GLOBAL(kvm_cvt_fd)
276 lfd 0,0(r5) /* load up fpscr value */
277 MTFSF_L(0)
278 lfs 0,0(r3)
279 stfd 0,0(r4)
280 mffs 0
281 stfd 0,0(r5) /* save new fpscr value */
282 blr
283
284_GLOBAL(kvm_cvt_df)
285 lfd 0,0(r5) /* load up fpscr value */
286 MTFSF_L(0)
287 lfd 0,0(r3)
288 stfs 0,0(r4)
289 mffs 0
290 stfd 0,0(r5) /* save new fpscr value */
291 blr
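
In the two new helpers, r3 is the source, r4 the destination, and r5 the guest FPSCR, which is loaded before and stored back after the conversion, so FPSCR side effects survive. The C prototypes implied by the call sites in book3s_paired_singles.c above (a sketch, not copied from the header):

    #include <stdint.h>

    typedef uint32_t u32;
    typedef uint64_t u64;

    void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr); /* lfs/stfd: single to double */
    void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr); /* lfd/stfs: double to single */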
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9b8683f39e05..72a4ad86ee91 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,11 +36,6 @@
36#define CREATE_TRACE_POINTS 36#define CREATE_TRACE_POINTS
37#include "trace.h" 37#include "trace.h"
38 38
39gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
40{
41 return gfn;
42}
43
44int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
45{ 40{
46 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); 41 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
@@ -287,7 +282,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
287static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, 282static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
288 struct kvm_run *run) 283 struct kvm_run *run)
289{ 284{
290 u64 gpr; 285 u64 uninitialized_var(gpr);
291 286
292 if (run->mmio.len > sizeof(gpr)) { 287 if (run->mmio.len > sizeof(gpr)) {
293 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); 288 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -423,8 +418,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
423 int r; 418 int r;
424 sigset_t sigsaved; 419 sigset_t sigsaved;
425 420
426 vcpu_load(vcpu);
427
428 if (vcpu->sigset_active) 421 if (vcpu->sigset_active)
429 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 422 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
430 423
@@ -456,8 +449,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
456 if (vcpu->sigset_active) 449 if (vcpu->sigset_active)
457 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 450 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
458 451
459 vcpu_put(vcpu);
460
461 return r; 452 return r;
462} 453}
463 454
@@ -523,8 +514,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
523 if (copy_from_user(&irq, argp, sizeof(irq))) 514 if (copy_from_user(&irq, argp, sizeof(irq)))
524 goto out; 515 goto out;
525 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 516 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
526 break; 517 goto out;
527 } 518 }
519
528 case KVM_ENABLE_CAP: 520 case KVM_ENABLE_CAP:
529 { 521 {
530 struct kvm_enable_cap cap; 522 struct kvm_enable_cap cap;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 27605b62b980..cef7dbf69dfc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -26,7 +26,7 @@
26 26
27struct sca_entry { 27struct sca_entry {
28 atomic_t scn; 28 atomic_t scn;
29 __u64 reserved; 29 __u32 reserved;
30 __u64 sda; 30 __u64 sda;
31 __u64 reserved2[2]; 31 __u64 reserved2[2];
32} __attribute__((packed)); 32} __attribute__((packed));
@@ -41,7 +41,8 @@ struct sca_block {
41} __attribute__((packed)); 41} __attribute__((packed));
42 42
43#define KVM_NR_PAGE_SIZES 2 43#define KVM_NR_PAGE_SIZES 2
44#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) 44#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
45#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
45#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 46#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
46#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 47#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
47#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 48#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 3ddc30895e31..f7b6df45d8be 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -135,7 +135,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
135 spin_lock_bh(&vcpu->arch.local_int.lock); 135 spin_lock_bh(&vcpu->arch.local_int.lock);
136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) { 136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP; 137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
138 rc = __kvm_s390_vcpu_store_status(vcpu, 138 rc = kvm_s390_vcpu_store_status(vcpu,
139 KVM_S390_STORE_STATUS_NOADDR); 139 KVM_S390_STORE_STATUS_NOADDR);
140 if (rc >= 0) 140 if (rc >= 0)
141 rc = -EOPNOTSUPP; 141 rc = -EOPNOTSUPP;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ae3705816878..4fe68650535c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -207,6 +207,7 @@ out_nokvm:
207void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 207void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
208{ 208{
209 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 209 VCPU_EVENT(vcpu, 3, "%s", "free cpu");
210 clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
210 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == 211 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
211 (__u64) vcpu->arch.sie_block) 212 (__u64) vcpu->arch.sie_block)
212 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; 213 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
@@ -296,7 +297,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
296{ 297{
297 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); 298 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
298 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests); 299 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
299 vcpu->arch.sie_block->ecb = 2; 300 vcpu->arch.sie_block->ecb = 6;
300 vcpu->arch.sie_block->eca = 0xC1002001U; 301 vcpu->arch.sie_block->eca = 0xC1002001U;
301 vcpu->arch.sie_block->fac = (int) (long) facilities; 302 vcpu->arch.sie_block->fac = (int) (long) facilities;
302 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 303 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -329,6 +330,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
329 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; 330 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
330 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); 331 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
331 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 332 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
333 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
332 334
333 spin_lock_init(&vcpu->arch.local_int.lock); 335 spin_lock_init(&vcpu->arch.local_int.lock);
334 INIT_LIST_HEAD(&vcpu->arch.local_int.list); 336 INIT_LIST_HEAD(&vcpu->arch.local_int.list);
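
The new set_bit(63 - id, ...) here and the matching clear_bit() in vcpu destroy appear to keep an in-use mask for the SCA entries, using s390's MSB-first bit numbering: CPU 0 owns the most significant bit of the 64-bit word. A quick model of that index:

    #include <stdint.h>
    #include <stdio.h>

    static void set_msb_first(uint64_t *word, unsigned int cpu_id)
    {
            *word |= 1ULL << (63 - cpu_id);
    }

    int main(void)
    {
            uint64_t mcn = 0;

            set_msb_first(&mcn, 0);   /* cpu 0 -> 0x8000000000000000 */
            set_msb_first(&mcn, 1);   /* cpu 1 -> 0x4000000000000000 */
            printf("mcn = %016llx\n", (unsigned long long)mcn);
            return 0;
    }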
@@ -363,63 +365,49 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
363 365
364static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) 366static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
365{ 367{
366 vcpu_load(vcpu);
367 kvm_s390_vcpu_initial_reset(vcpu); 368 kvm_s390_vcpu_initial_reset(vcpu);
368 vcpu_put(vcpu);
369 return 0; 369 return 0;
370} 370}
371 371
372int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 372int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
373{ 373{
374 vcpu_load(vcpu);
375 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs)); 374 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
376 vcpu_put(vcpu);
377 return 0; 375 return 0;
378} 376}
379 377
380int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 378int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
381{ 379{
382 vcpu_load(vcpu);
383 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs)); 380 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
384 vcpu_put(vcpu);
385 return 0; 381 return 0;
386} 382}
387 383
388int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 384int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
389 struct kvm_sregs *sregs) 385 struct kvm_sregs *sregs)
390{ 386{
391 vcpu_load(vcpu);
392 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 387 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
393 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 388 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
394 vcpu_put(vcpu);
395 return 0; 389 return 0;
396} 390}
397 391
398int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 392int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
399 struct kvm_sregs *sregs) 393 struct kvm_sregs *sregs)
400{ 394{
401 vcpu_load(vcpu);
402 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs)); 395 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
403 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); 396 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
404 vcpu_put(vcpu);
405 return 0; 397 return 0;
406} 398}
407 399
408int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 400int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
409{ 401{
410 vcpu_load(vcpu);
411 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 402 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
412 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 403 vcpu->arch.guest_fpregs.fpc = fpu->fpc;
413 vcpu_put(vcpu);
414 return 0; 404 return 0;
415} 405}
416 406
417int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 407int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
418{ 408{
419 vcpu_load(vcpu);
420 memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); 409 memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
421 fpu->fpc = vcpu->arch.guest_fpregs.fpc; 410 fpu->fpc = vcpu->arch.guest_fpregs.fpc;
422 vcpu_put(vcpu);
423 return 0; 411 return 0;
424} 412}
425 413
@@ -427,14 +415,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
427{ 415{
428 int rc = 0; 416 int rc = 0;
429 417
430 vcpu_load(vcpu);
431 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) 418 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
432 rc = -EBUSY; 419 rc = -EBUSY;
433 else { 420 else {
434 vcpu->run->psw_mask = psw.mask; 421 vcpu->run->psw_mask = psw.mask;
435 vcpu->run->psw_addr = psw.addr; 422 vcpu->run->psw_addr = psw.addr;
436 } 423 }
437 vcpu_put(vcpu);
438 return rc; 424 return rc;
439} 425}
440 426
@@ -498,8 +484,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
498 int rc; 484 int rc;
499 sigset_t sigsaved; 485 sigset_t sigsaved;
500 486
501 vcpu_load(vcpu);
502
503rerun_vcpu: 487rerun_vcpu:
504 if (vcpu->requests) 488 if (vcpu->requests)
505 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 489 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -568,8 +552,6 @@ rerun_vcpu:
568 if (vcpu->sigset_active) 552 if (vcpu->sigset_active)
569 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 553 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
570 554
571 vcpu_put(vcpu);
572
573 vcpu->stat.exit_userspace++; 555 vcpu->stat.exit_userspace++;
574 return rc; 556 return rc;
575} 557}
@@ -589,7 +571,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
589 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit 571 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
590 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix 572 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
591 */ 573 */
592int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) 574int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
593{ 575{
594 const unsigned char archmode = 1; 576 const unsigned char archmode = 1;
595 int prefix; 577 int prefix;
@@ -651,45 +633,42 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
651 return 0; 633 return 0;
652} 634}
653 635
654static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
655{
656 int rc;
657
658 vcpu_load(vcpu);
659 rc = __kvm_s390_vcpu_store_status(vcpu, addr);
660 vcpu_put(vcpu);
661 return rc;
662}
663
664long kvm_arch_vcpu_ioctl(struct file *filp, 636long kvm_arch_vcpu_ioctl(struct file *filp,
665 unsigned int ioctl, unsigned long arg) 637 unsigned int ioctl, unsigned long arg)
666{ 638{
667 struct kvm_vcpu *vcpu = filp->private_data; 639 struct kvm_vcpu *vcpu = filp->private_data;
668 void __user *argp = (void __user *)arg; 640 void __user *argp = (void __user *)arg;
641 long r;
669 642
670 switch (ioctl) { 643 switch (ioctl) {
671 case KVM_S390_INTERRUPT: { 644 case KVM_S390_INTERRUPT: {
672 struct kvm_s390_interrupt s390int; 645 struct kvm_s390_interrupt s390int;
673 646
647 r = -EFAULT;
674 if (copy_from_user(&s390int, argp, sizeof(s390int))) 648 if (copy_from_user(&s390int, argp, sizeof(s390int)))
675 return -EFAULT; 649 break;
676 return kvm_s390_inject_vcpu(vcpu, &s390int); 650 r = kvm_s390_inject_vcpu(vcpu, &s390int);
651 break;
677 } 652 }
678 case KVM_S390_STORE_STATUS: 653 case KVM_S390_STORE_STATUS:
679 return kvm_s390_vcpu_store_status(vcpu, arg); 654 r = kvm_s390_vcpu_store_status(vcpu, arg);
655 break;
680 case KVM_S390_SET_INITIAL_PSW: { 656 case KVM_S390_SET_INITIAL_PSW: {
681 psw_t psw; 657 psw_t psw;
682 658
659 r = -EFAULT;
683 if (copy_from_user(&psw, argp, sizeof(psw))) 660 if (copy_from_user(&psw, argp, sizeof(psw)))
684 return -EFAULT; 661 break;
685 return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); 662 r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
663 break;
686 } 664 }
687 case KVM_S390_INITIAL_RESET: 665 case KVM_S390_INITIAL_RESET:
688 return kvm_arch_vcpu_ioctl_initial_reset(vcpu); 666 r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
667 break;
689 default: 668 default:
690 ; 669 r = -EINVAL;
691 } 670 }
692 return -EINVAL; 671 return r;
693} 672}
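
The reworked vcpu ioctl switch moves from per-case returns to the usual kernel pattern: preload r with the failure code, break out on error, and overwrite r with the handler's result on success, leaving a single exit point. A userspace model of the idiom, with copy_from_user() replaced by a stub:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    static int copy_from_user_stub(void *dst, const void *src, size_t n)
    {
            memcpy(dst, src, n);
            return 0;                 /* 0 = success, nonzero = fault */
    }

    static long vcpu_ioctl(unsigned int ioctl_nr, const void *argp)
    {
            long r;

            switch (ioctl_nr) {
            case 1: {                 /* e.g. an "inject interrupt" request */
                    int req;

                    r = -EFAULT;      /* assume the copy fails ... */
                    if (copy_from_user_stub(&req, argp, sizeof(req)))
                            break;    /* ... so a fault falls out with -EFAULT */
                    r = 0;            /* handler result on success */
                    break;
            }
            default:
                    r = -EINVAL;
            }
            return r;                 /* single exit, like the reworked switch */
    }

    int main(void)
    {
            int arg = 42;

            printf("rc=%ld\n", vcpu_ioctl(1, &arg));
            printf("rc=%ld\n", vcpu_ioctl(99, &arg));
            return 0;
    }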
694 673
695/* Section: memory related */ 674/* Section: memory related */
@@ -744,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
744{ 723{
745} 724}
746 725
747gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
748{
749 return gfn;
750}
751
752static int __init kvm_s390_init(void) 726static int __init kvm_s390_init(void)
753{ 727{
754 int ret; 728 int ret;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index cfa9d1777457..a7b7586626db 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -92,7 +92,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
92int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 92int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
93 93
94/* implemented in kvm-s390.c */ 94/* implemented in kvm-s390.c */
95int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 95int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
96 unsigned long addr); 96 unsigned long addr);
97/* implemented in diag.c */ 97/* implemented in diag.c */
98int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 98int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..815c5b2b9f57 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
482 memcpy(dst->state, src->state, xstate_size); 482 memcpy(dst->state, src->state, xstate_size);
483} 483}
484 484
485extern void fpu_finit(struct fpu *fpu);
486
485#endif /* __ASSEMBLY__ */ 487#endif /* __ASSEMBLY__ */
486 488
487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 489#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS 24#define __KVM_HAVE_DEBUGREGS
25#define __KVM_HAVE_XSAVE
26#define __KVM_HAVE_XCRS
25 27
26/* Architectural interrupt line count. */ 28/* Architectural interrupt line count. */
27#define KVM_NR_INTERRUPTS 256 29#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
299 __u64 reserved[9]; 301 __u64 reserved[9];
300}; 302};
301 303
304/* for KVM_CAP_XSAVE */
305struct kvm_xsave {
306 __u32 region[1024];
307};
308
309#define KVM_MAX_XCRS 16
310
311struct kvm_xcr {
312 __u32 xcr;
313 __u32 reserved;
314 __u64 value;
315};
316
317struct kvm_xcrs {
318 __u32 nr_xcrs;
319 __u32 flags;
320 struct kvm_xcr xcrs[KVM_MAX_XCRS];
321 __u64 padding[16];
322};
323
302#endif /* _ASM_X86_KVM_H */ 324#endif /* _ASM_X86_KVM_H */
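
struct kvm_xsave reserves 1024 32-bit words, i.e. exactly one 4KiB page for the processor's XSAVE area exposed via KVM_CAP_XSAVE, with kvm_xcrs carrying the extended control registers alongside it. A one-line sanity check of the size:

    #include <stdint.h>

    struct kvm_xsave_model { uint32_t region[1024]; };
    _Static_assert(sizeof(struct kvm_xsave_model) == 4096,
                   "one page of XSAVE state");

    int main(void) { return 0; }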
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
51#define X86EMUL_UNHANDLEABLE 1 51#define X86EMUL_UNHANDLEABLE 1
52/* Terminate emulation but return success to the caller. */ 52/* Terminate emulation but return success to the caller. */
53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
54#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
57
56struct x86_emulate_ops { 58struct x86_emulate_ops {
57 /* 59 /*
58 * read_std: Read bytes of standard (non-emulated/special) memory. 60 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
92 int (*read_emulated)(unsigned long addr, 94 int (*read_emulated)(unsigned long addr,
93 void *val, 95 void *val,
94 unsigned int bytes, 96 unsigned int bytes,
97 unsigned int *error,
95 struct kvm_vcpu *vcpu); 98 struct kvm_vcpu *vcpu);
96 99
97 /* 100 /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
104 int (*write_emulated)(unsigned long addr, 107 int (*write_emulated)(unsigned long addr,
105 const void *val, 108 const void *val,
106 unsigned int bytes, 109 unsigned int bytes,
110 unsigned int *error,
107 struct kvm_vcpu *vcpu); 111 struct kvm_vcpu *vcpu);
108 112
109 /* 113 /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
118 const void *old, 122 const void *old,
119 const void *new, 123 const void *new,
120 unsigned int bytes, 124 unsigned int bytes,
125 unsigned int *error,
121 struct kvm_vcpu *vcpu); 126 struct kvm_vcpu *vcpu);
122 127
123 int (*pio_in_emulated)(int size, unsigned short port, void *val, 128 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
132 int seg, struct kvm_vcpu *vcpu); 137 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu); 144 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
140}; 149};
141 150
142/* Type, address-of, and value of an instruction's operand. */ 151/* Type, address-of, and value of an instruction's operand. */
143struct operand { 152struct operand {
144 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
145 unsigned int bytes; 154 unsigned int bytes;
146 unsigned long val, orig_val, *ptr; 155 unsigned long orig_val, *ptr;
156 union {
157 unsigned long val;
158 char valptr[sizeof(unsigned long) + 2];
159 };
147}; 160};
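
The operand value gains a byte-addressable view two bytes longer than a long; judging from the size alone this plausibly accommodates far-pointer operands, where a 16-bit selector rides after the offset, though that is a guess. A model of staging such a value:

    /* Userspace model of the new operand union (C11 anonymous union). */
    #include <stdio.h>
    #include <string.h>

    struct operand_model {
            unsigned int bytes;
            union {
                    unsigned long val;
                    char valptr[sizeof(unsigned long) + 2];
            };
    };

    int main(void)
    {
            struct operand_model op;
            unsigned short sel = 0x10;
            unsigned long off = 0xdeadbeef;

            /* e.g. what a far jmp/call operand fetch might stage */
            memcpy(op.valptr, &off, sizeof(off));
            memcpy(op.valptr + sizeof(off), &sel, sizeof(sel));
            op.bytes = sizeof(off) + sizeof(sel);
            printf("staged %u bytes\n", op.bytes);
            return 0;
    }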
148 161
149struct fetch_cache { 162struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
186 unsigned long modrm_val; 199 unsigned long modrm_val;
187 struct fetch_cache fetch; 200 struct fetch_cache fetch;
188 struct read_cache io_read; 201 struct read_cache io_read;
202 struct read_cache mem_read;
189}; 203};
190 204
191struct x86_emulate_ctxt { 205struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
202 int interruptibility; 216 int interruptibility;
203 217
204 bool restart; /* restart string instruction after writeback */ 218 bool restart; /* restart string instruction after writeback */
219
220 int exception; /* exception that happens during emulation or -1 */
221 u32 error_code; /* error code for exception */
222 bool error_code_valid;
223 unsigned long cr2; /* faulted address in case of #PF */
224
205 /* decode cache */ 225 /* decode cache */
206 struct decode_cache decode; 226 struct decode_cache decode;
207}; 227};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h>
18 19
19#include <linux/kvm.h> 20#include <linux/kvm.h>
20#include <linux/kvm_para.h> 21#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
39 0xFFFFFF0000000000ULL) 40 0xFFFFFF0000000000ULL)
40 41
41#define INVALID_PAGE (~(hpa_t)0) 42#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE)
44
42#define UNMAPPED_GVA (~(gpa_t)0) 45#define UNMAPPED_GVA (~(gpa_t)0)
43 46
44/* KVM Hugepage definitions for x86 */ 47/* KVM Hugepage definitions for x86 */
45#define KVM_NR_PAGE_SIZES 3 48#define KVM_NR_PAGE_SIZES 3
46#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 49#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
50#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
47#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 51#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
48#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 52#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
49#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 53#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
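
Factoring KVM_HPAGE_GFN_SHIFT out of KVM_HPAGE_SHIFT, here and in the s390 hunk above, lets common code reason about hugepages in guest-frame-number units. Plugging in the per-architecture step sizes, assuming PAGE_SHIFT of 12 and an LP64 host:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define GFN_SHIFT(x, step)   (((x) - 1) * (step))
    #define HPAGE_SHIFT(x, step) (PAGE_SHIFT + GFN_SHIFT(x, step))

    int main(void)
    {
            printf("x86  level 2: %d -> %luK\n", HPAGE_SHIFT(2, 9),
                   (1UL << HPAGE_SHIFT(2, 9)) >> 10);    /* 2M */
            printf("x86  level 3: %d -> %luK\n", HPAGE_SHIFT(3, 9),
                   (1UL << HPAGE_SHIFT(3, 9)) >> 10);    /* 1G */
            printf("s390 level 2: %d -> %luK\n", HPAGE_SHIFT(2, 8),
                   (1UL << HPAGE_SHIFT(2, 8)) >> 10);    /* 1M */
            return 0;
    }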
@@ -69,8 +73,6 @@
69 73
70#define IOPL_SHIFT 12 74#define IOPL_SHIFT 12
71 75
72#define KVM_ALIAS_SLOTS 4
73
74#define KVM_PERMILLE_MMU_PAGES 20 76#define KVM_PERMILLE_MMU_PAGES 20
75#define KVM_MIN_ALLOC_MMU_PAGES 64 77#define KVM_MIN_ALLOC_MMU_PAGES 64
76#define KVM_MMU_HASH_SHIFT 10 78#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 243 void (*prefetch_page)(struct kvm_vcpu *vcpu,
242 struct kvm_mmu_page *page); 244 struct kvm_mmu_page *page);
243 int (*sync_page)(struct kvm_vcpu *vcpu, 245 int (*sync_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *sp); 246 struct kvm_mmu_page *sp, bool clear_unsync);
245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
246 hpa_t root_hpa; 248 hpa_t root_hpa;
247 int root_level; 249 int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
301 unsigned long mmu_seq; 303 unsigned long mmu_seq;
302 } update_pte; 304 } update_pte;
303 305
304 struct i387_fxsave_struct host_fx_image; 306 struct fpu guest_fpu;
305 struct i387_fxsave_struct guest_fx_image; 307 u64 xcr0;
306 308
307 gva_t mmio_fault_cr2; 309 gva_t mmio_fault_cr2;
308 struct kvm_pio_request pio; 310 struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
360 362
361 /* fields used by HYPER-V emulation */ 363 /* fields used by HYPER-V emulation */
362 u64 hv_vapic; 364 u64 hv_vapic;
363};
364
365struct kvm_mem_alias {
366 gfn_t base_gfn;
367 unsigned long npages;
368 gfn_t target_gfn;
369#define KVM_ALIAS_INVALID 1UL
370 unsigned long flags;
371};
372 365
373#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 366 cpumask_var_t wbinvd_dirty_mask;
374
375struct kvm_mem_aliases {
376 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
377 int naliases;
378}; 367};
379 368
380struct kvm_arch { 369struct kvm_arch {
381 struct kvm_mem_aliases *aliases;
382
383 unsigned int n_free_mmu_pages; 370 unsigned int n_free_mmu_pages;
384 unsigned int n_requested_mmu_pages; 371 unsigned int n_requested_mmu_pages;
385 unsigned int n_alloc_mmu_pages; 372 unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
533 520
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535 522
523 bool (*has_wbinvd_exit)(void);
524
536 const struct trace_print_flags *exit_reasons_str; 525 const struct trace_print_flags *exit_reasons_str;
537}; 526};
538 527
@@ -576,7 +565,6 @@ enum emulation_result {
576#define EMULTYPE_SKIP (1 << 2) 565#define EMULTYPE_SKIP (1 << 2)
577int emulate_instruction(struct kvm_vcpu *vcpu, 566int emulate_instruction(struct kvm_vcpu *vcpu,
578 unsigned long cr2, u16 error_code, int emulation_type); 567 unsigned long cr2, u16 error_code, int emulation_type);
579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
582 570
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
591int kvm_emulate_halt(struct kvm_vcpu *vcpu); 579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
593int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
594int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
595 unsigned long *dest);
596int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
597 unsigned long value);
598 583
599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 585int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 587int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code); 588 bool has_error_code, u32 error_code);
604 589
605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 597void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 598void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
599int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
614 600
615int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
616int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 602int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
630 616
631void kvm_inject_nmi(struct kvm_vcpu *vcpu); 617void kvm_inject_nmi(struct kvm_vcpu *vcpu);
632 618
633void fx_init(struct kvm_vcpu *vcpu); 619int fx_init(struct kvm_vcpu *vcpu);
634
635int emulator_write_emulated(unsigned long addr,
636 const void *val,
637 unsigned int bytes,
638 struct kvm_vcpu *vcpu);
639 620
640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 621void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 622void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
664int complete_pio(struct kvm_vcpu *vcpu); 645int complete_pio(struct kvm_vcpu *vcpu);
665bool kvm_check_iopl(struct kvm_vcpu *vcpu); 646bool kvm_check_iopl(struct kvm_vcpu *vcpu);
666 647
667struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
668
669static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 648static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
670{ 649{
671 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 650 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
719} 698}
720#endif 699#endif
721 700
722static inline void kvm_fx_save(struct i387_fxsave_struct *image)
723{
724 asm("fxsave (%0)":: "r" (image));
725}
726
727static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
728{
729 asm("fxrstor (%0)":: "r" (image));
730}
731
732static inline void kvm_fx_finit(void)
733{
734 asm("finit");
735}
736
737static inline u32 get_rdx_init_val(void) 701static inline u32 get_rdx_init_val(void)
738{ 702{
739 return 0x600; /* P6 family */ 703 return 0x600; /* P6 family */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..509a42187dc2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
20#define _EFER_LMA 10 /* Long mode active (read-only) */ 20#define _EFER_LMA 10 /* Long mode active (read-only) */
21#define _EFER_NX 11 /* No execute enable */ 21#define _EFER_NX 11 /* No execute enable */
22#define _EFER_SVME 12 /* Enable virtualization */ 22#define _EFER_SVME 12 /* Enable virtualization */
23#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
23#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
24 25
25#define EFER_SCE (1<<_EFER_SCE) 26#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
27#define EFER_LMA (1<<_EFER_LMA) 28#define EFER_LMA (1<<_EFER_LMA)
28#define EFER_NX (1<<_EFER_NX) 29#define EFER_NX (1<<_EFER_NX)
29#define EFER_SVME (1<<_EFER_SVME) 30#define EFER_SVME (1<<_EFER_SVME)
31#define EFER_LMSLE (1<<_EFER_LMSLE)
30#define EFER_FFXSR (1<<_EFER_FFXSR) 32#define EFER_FFXSR (1<<_EFER_FFXSR)
31 33
32/* Intel MSRs. Some also available on other CPUs */ 34/* Intel MSRs. Some also available on other CPUs */
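
The new bit follows the header's existing pairing of a `_EFER_x` bit position with an `EFER_x` mask, slotting LMSLE (AMD's Long Mode Segment Limit Enable) into the gap between SVME (bit 12) and FFXSR (bit 14). A tiny usage sketch, assuming `efer` holds a value read from MSR_EFER:

    /* EFER_LMSLE == 1 << 13 == 0x2000 */
    if (efer & EFER_LMSLE)
        ;	/* long-mode segment limit checks are enabled */
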
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
257#define EXIT_REASON_IO_INSTRUCTION 30 257#define EXIT_REASON_IO_INSTRUCTION 30
258#define EXIT_REASON_MSR_READ 31 258#define EXIT_REASON_MSR_READ 31
259#define EXIT_REASON_MSR_WRITE 32 259#define EXIT_REASON_MSR_WRITE 32
260#define EXIT_REASON_INVALID_STATE 33
260#define EXIT_REASON_MWAIT_INSTRUCTION 36 261#define EXIT_REASON_MWAIT_INSTRUCTION 36
261#define EXIT_REASON_MONITOR_INSTRUCTION 39 262#define EXIT_REASON_MONITOR_INSTRUCTION 39
262#define EXIT_REASON_PAUSE_INSTRUCTION 40 263#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
266#define EXIT_REASON_EPT_VIOLATION 48 267#define EXIT_REASON_EPT_VIOLATION 48
267#define EXIT_REASON_EPT_MISCONFIG 49 268#define EXIT_REASON_EPT_MISCONFIG 49
268#define EXIT_REASON_WBINVD 54 269#define EXIT_REASON_WBINVD 54
270#define EXIT_REASON_XSETBV 55
269 271
270/* 272/*
271 * Interruption-information format 273 * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
375#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 377#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
376#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 378#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
377 379
380#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
381#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
382
378#define VMX_EPT_DEFAULT_GAW 3 383#define VMX_EPT_DEFAULT_GAW 3
379#define VMX_EPT_MAX_GAW 0x4 384#define VMX_EPT_MAX_GAW 0x4
380#define VMX_EPT_MT_EPTE_SHIFT 3 385#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..32c36668fa7b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,12 @@
13 13
14#define FXSAVE_SIZE 512 14#define FXSAVE_SIZE 512
15 15
16#define XSAVE_HDR_SIZE 64
17#define XSAVE_HDR_OFFSET FXSAVE_SIZE
18
19#define XSAVE_YMM_SIZE 256
20#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
21
16/* 22/*
17 * These are the features that the OS can handle currently. 23 * These are the features that the OS can handle currently.
18 */ 24 */
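
These constants spell out the XSAVE area layout: the legacy FXSAVE image fills the first 512 bytes, the 64-byte XSAVE header sits right after it at offset 512, and the 256-byte YMM state follows the header at offset 576. A small sketch of locating the YMM bytes in a raw save buffer (where the buffer comes from, e.g. an ioctl, is assumed):

    /* xsave_buf points at a complete XSAVE area. */
    static const unsigned char *ymm_state(const unsigned char *xsave_buf)
    {
        /* XSAVE_YMM_OFFSET == 512 + 64 == 576 */
        return xsave_buf + XSAVE_YMM_OFFSET;
    }
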
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..c4444bce8469 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
107} 107}
108#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
109 109
110static void fpu_finit(struct fpu *fpu) 110void fpu_finit(struct fpu *fpu)
111{ 111{
112#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
113 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
132 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
133 } 133 }
134} 134}
135EXPORT_SYMBOL_GPL(fpu_finit);
135 136
136/* 137/*
137 * The _current_ task is using the FPU for the first time 138 * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..ebcfcceccc72 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
29 29
30struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
31 32
32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
33{ 34{
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * 13 *
13 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
68#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 69#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */
70#define SrcMask (0xf<<4) 74#define SrcMask (0xf<<4)
71/* Generic ModRM decode. */ 75/* Generic ModRM decode. */
72#define ModRM (1<<8) 76#define ModRM (1<<8)
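
The three new source types extend the 4-bit Src nibble (values 0xb through 0xd, shifted by 4). SrcImmFAddr and SrcMemFAddr describe a complete far pointer (op_bytes of offset plus a 2-byte selector) as a single operand, which is what lets the two-part Src2Imm16/Src2Mem16 encoding be deleted further down; SrcAcc names the accumulator explicitly. A fragment sketching how decode dispatches on them, mirroring the cases added later in this patch:

    switch (c->d & SrcMask) {               /* SrcMask == 0xf << 4 */
    case SrcImmFAddr:                       /* far ptr in the insn stream */
        c->src.bytes = c->op_bytes + 2;     /* offset + 16-bit selector */
        break;
    case SrcMemFAddr:                       /* far ptr in memory, e.g. ljmp m16:32 */
        c->src.bytes = c->op_bytes + 2;
        break;
    case SrcAcc:                            /* AL/AX/EAX/RAX */
        c->src.ptr = &c->regs[VCPU_REGS_RAX];
        break;
    }
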
@@ -88,10 +92,6 @@
88#define Src2CL (1<<29) 92#define Src2CL (1<<29)
89#define Src2ImmByte (2<<29) 93#define Src2ImmByte (2<<29)
90#define Src2One (3<<29) 94#define Src2One (3<<29)
91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
95#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
96 96
97enum { 97enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
124 /* 0x20 - 0x27 */ 124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */ 128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 0, 0, 0, 0, 131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */ 132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 0, 0, 0, 0, 135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */ 136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
170 /* 0x88 - 0x8F */ 170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 DstReg | SrcMem | ModRM | Mov, Group | Group1A, 174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */ 175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */ 177 /* 0x98 - 0x9F */
178 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String, 188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
337 [Group1A*8] = 337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] = 339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, 0, 340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0, 342 0, 0, 0, 0,
343 [Group3*8] = 343 [Group3*8] =
344 DstMem | SrcImm | ModRM, 0, 344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0, 346 0, 0, 0, 0,
347 [Group4*8] = 347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0,
350 [Group5*8] = 350 [Group5*8] =
351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0, 354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] = 355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
576 (_type)_x; \ 576 (_type)_x; \
577}) 577})
578 578
579#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \
582 goto done; \
583 (_eip) += (_size); \
584})
585
579static inline unsigned long ad_mask(struct decode_cache *c) 586static inline unsigned long ad_mask(struct decode_cache *c)
580{ 587{
581 return (1UL << (c->ad_bytes << 3)) - 1; 588 return (1UL << (c->ad_bytes << 3)) - 1;
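
insn_fetch() casts the fetched bytes to a scalar type; the new insn_fetch_arr() variant fills a caller-supplied buffer instead, which is exactly what the far-pointer operands need, since op_bytes + 2 bytes do not fit a single scalar cast. Its intended use, as in the SrcImmFAddr decode case later in this patch:

    c->src.bytes = c->op_bytes + 2;                      /* offset + selector */
    insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); /* raw bytes into valptr */
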
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
617 c->seg_override = seg; 624 c->seg_override = seg;
618} 625}
619 626
620static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 627static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
628 struct x86_emulate_ops *ops, int seg)
621{ 629{
622 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
623 return 0; 631 return 0;
624 632
625 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 633 return ops->get_cached_segment_base(seg, ctxt->vcpu);
626} 634}
627 635
628static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops,
629 struct decode_cache *c) 638 struct decode_cache *c)
630{ 639{
631 if (!c->has_seg_override) 640 if (!c->has_seg_override)
632 return 0; 641 return 0;
633 642
634 return seg_base(ctxt, c->seg_override); 643 return seg_base(ctxt, ops, c->seg_override);
644}
645
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
647 struct x86_emulate_ops *ops)
648{
649 return seg_base(ctxt, ops, VCPU_SREG_ES);
650}
651
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
653 struct x86_emulate_ops *ops)
654{
655 return seg_base(ctxt, ops, VCPU_SREG_SS);
656}
657
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
659 u32 error, bool valid)
660{
661 ctxt->exception = vec;
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665}
666
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
668{
669 emulate_exception(ctxt, GP_VECTOR, err, true);
635} 670}
636 671
637static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
673 int err)
638{ 674{
639 return seg_base(ctxt, VCPU_SREG_ES); 675 ctxt->cr2 = addr;
676 emulate_exception(ctxt, PF_VECTOR, err, true);
640} 677}
641 678
642static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 679static void emulate_ud(struct x86_emulate_ctxt *ctxt)
643{ 680{
644 return seg_base(ctxt, VCPU_SREG_SS); 681 emulate_exception(ctxt, UD_VECTOR, 0, false);
682}
683
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
685{
686 emulate_exception(ctxt, TS_VECTOR, err, true);
645} 687}
646 688
647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
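
Each helper records the vector and error code in the context, and emulate_exception() also clears ctxt->restart, so a faulting string instruction is not resumed after the fault is delivered. Typical call sites look like the conversions throughout this patch:

    if (iopl < 3) {
        emulate_gp(ctxt, 0);                /* record #GP(0) in ctxt */
        return X86EMUL_PROPAGATE_FAULT;     /* caller injects it */
    }
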
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
932 /* we cannot decode insn before we complete previous rep insn */ 974 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart); 975 WARN_ON(ctxt->restart);
934 976
935 /* Shadow copy of register state. Committed on successful emulation. */
936 memset(c, 0, sizeof(struct decode_cache));
937 c->eip = ctxt->eip; 977 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip; 978 c->fetch.start = c->fetch.end = c->eip;
939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
941 980
942 switch (mode) { 981 switch (mode) {
943 case X86EMUL_MODE_REAL: 982 case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
1060 set_seg_override(c, VCPU_SREG_DS); 1099 set_seg_override(c, VCPU_SREG_DS);
1061 1100
1062 if (!(!c->twobyte && c->b == 0x8d)) 1101 if (!(!c->twobyte && c->b == 0x8d))
1063 c->modrm_ea += seg_override_base(ctxt, c); 1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1064 1103
1065 if (c->ad_bytes != 8) 1104 if (c->ad_bytes != 8)
1066 c->modrm_ea = (u32)c->modrm_ea; 1105 c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
1148 else 1187 else
1149 c->src.val = insn_fetch(u8, 1, c->eip); 1188 c->src.val = insn_fetch(u8, 1, c->eip);
1150 break; 1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1151 case SrcOne: 1209 case SrcOne:
1152 c->src.bytes = 1; 1210 c->src.bytes = 1;
1153 c->src.val = 1; 1211 c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
1156 c->src.type = OP_MEM; 1214 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *) 1216 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c), 1217 register_address(c, seg_override_base(ctxt, ops, c),
1160 c->regs[VCPU_REGS_RSI]); 1218 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0; 1219 c->src.val = 0;
1162 break; 1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1163 } 1232 }
1164 1233
1165 /* 1234 /*
@@ -1179,22 +1248,10 @@ done_prefixes:
1179 c->src2.bytes = 1; 1248 c->src2.bytes = 1;
1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1181 break; 1250 break;
1182 case Src2Imm16:
1183 c->src2.type = OP_IMM;
1184 c->src2.ptr = (unsigned long *)c->eip;
1185 c->src2.bytes = 2;
1186 c->src2.val = insn_fetch(u16, 2, c->eip);
1187 break;
1188 case Src2One: 1251 case Src2One:
1189 c->src2.bytes = 1; 1252 c->src2.bytes = 1;
1190 c->src2.val = 1; 1253 c->src2.val = 1;
1191 break; 1254 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1198 } 1255 }
1199 1256
1200 /* Decode and fetch the destination operand: register or memory. */ 1257 /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
1253 c->dst.type = OP_MEM; 1310 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *) 1312 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt), 1313 register_address(c, es_base(ctxt, ops),
1257 c->regs[VCPU_REGS_RDI]); 1314 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0; 1315 c->dst.val = 0;
1259 break; 1316 break;
@@ -1263,6 +1320,37 @@ done:
1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1264} 1321}
1265 1322
1323static int read_emulated(struct x86_emulate_ctxt *ctxt,
1324 struct x86_emulate_ops *ops,
1325 unsigned long addr, void *dest, unsigned size)
1326{
1327 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330
1331 while (size) {
1332 int n = min(size, 8u);
1333 size -= n;
1334 if (mc->pos < mc->end)
1335 goto read_cached;
1336
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE)
1342 return rc;
1343 mc->end += n;
1344
1345 read_cached:
1346 memcpy(dest, mc->data + mc->pos, n);
1347 mc->pos += n;
1348 dest += n;
1349 addr += n;
1350 }
1351 return X86EMUL_CONTINUE;
1352}
1353
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops, 1355 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port, 1356 unsigned int size, unsigned short port,
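
read_emulated() funnels all data reads through the per-instruction mem_read cache in at most 8-byte chunks. The point is restartability: if emulation bails out to userspace and the instruction is re-executed, pos is reset while end still marks what was already fetched, so the replay sees byte-for-byte identical data. A simplified standalone model of the cache (no fault handling or chunk splitting, and it assumes the same read sequence on each attempt):

    #include <string.h>

    struct read_cache {
        unsigned char data[1024];
        unsigned long pos;      /* read cursor, reset each attempt */
        unsigned long end;      /* bytes actually fetched so far */
    };

    /* fetch() stands in for ops->read_emulated(). */
    static void cached_read(struct read_cache *mc, void *dest, unsigned n,
                            void (*fetch)(void *buf, unsigned len))
    {
        if (mc->pos >= mc->end) {               /* nothing cached left */
            fetch(mc->data + mc->end, n);       /* fetch for real */
            mc->end += n;
        }
        memcpy(dest, mc->data + mc->pos, n);    /* replay cached bytes */
        mc->pos += n;
    }
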
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331 1419
1332 if (dt.size < index * 8 + 7) { 1420 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1421 emulate_gp(ctxt, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT; 1422 return X86EMUL_PROPAGATE_FAULT;
1335 } 1423 }
1336 addr = dt.address + index * 8; 1424 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1427 emulate_pf(ctxt, addr, err);
1340 1428
1341 return ret; 1429 return ret;
1342} 1430}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356 1444
1357 if (dt.size < index * 8 + 7) { 1445 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1446 emulate_gp(ctxt, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT; 1447 return X86EMUL_PROPAGATE_FAULT;
1360 } 1448 }
1361 1449
1362 addr = dt.address + index * 8; 1450 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1453 emulate_pf(ctxt, addr, err);
1366 1454
1367 return ret; 1455 return ret;
1368} 1456}
@@ -1481,11 +1569,70 @@ load:
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE; 1570 return X86EMUL_CONTINUE;
1483exception: 1571exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1572 emulate_exception(ctxt, err_vec, err_code, true);
1485 return X86EMUL_PROPAGATE_FAULT; 1573 return X86EMUL_PROPAGATE_FAULT;
1486} 1574}
1487 1575
1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1576static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops)
1578{
1579 int rc;
1580 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582
1583 switch (c->dst.type) {
1584 case OP_REG:
1585 /* The 4-byte case *is* correct:
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break;
1603 case OP_MEM:
1604 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr,
1607 &c->dst.orig_val,
1608 &c->dst.val,
1609 c->dst.bytes,
1610 &err,
1611 ctxt->vcpu);
1612 else
1613 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr,
1615 &c->dst.val,
1616 c->dst.bytes,
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE)
1623 return rc;
1624 break;
1625 case OP_NONE:
1626 /* no writeback */
1627 break;
1628 default:
1629 break;
1630 }
1631 return X86EMUL_CONTINUE;
1632}
1633
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1635 struct x86_emulate_ops *ops)
1489{ 1636{
1490 struct decode_cache *c = &ctxt->decode; 1637 struct decode_cache *c = &ctxt->decode;
1491 1638
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1493 c->dst.bytes = c->op_bytes; 1640 c->dst.bytes = c->op_bytes;
1494 c->dst.val = c->src.val; 1641 c->dst.val = c->src.val;
1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1496 c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
1497 c->regs[VCPU_REGS_RSP]); 1644 c->regs[VCPU_REGS_RSP]);
1498} 1645}
1499 1646
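
Moving writeback() above its new callers also highlights its two subtleties: a lock-prefixed memory destination is committed with cmpxchg against dst.orig_val so the store is atomic with respect to the guest, and a 4-byte register destination is stored through the full-width pointer because 32-bit writes zero-extend in 64-bit mode, while 8- and 16-bit writes leave the upper bytes alone. For instance, on a little-endian 64-bit host:

    unsigned long rax = 0xffffffffffffffffUL;
    *(unsigned long *)&rax = (unsigned int)0x12345678;  /* like case 4 */
    /* rax == 0x0000000012345678: upper half cleared, as on hardware */
    *(unsigned short *)&rax = 0xbeef;                   /* like case 2 */
    /* rax == 0x000000001234beef: only the low 16 bits change */
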
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1504 struct decode_cache *c = &ctxt->decode; 1651 struct decode_cache *c = &ctxt->decode;
1505 int rc; 1652 int rc;
1506 1653
1507 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
1508 c->regs[VCPU_REGS_RSP]), 1655 c->regs[VCPU_REGS_RSP]),
1509 dest, len, ctxt->vcpu); 1656 dest, len);
1510 if (rc != X86EMUL_CONTINUE) 1657 if (rc != X86EMUL_CONTINUE)
1511 return rc; 1658 return rc;
1512 1659
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1541 break; 1688 break;
1542 case X86EMUL_MODE_VM86: 1689 case X86EMUL_MODE_VM86:
1543 if (iopl < 3) { 1690 if (iopl < 3) {
1544 kvm_inject_gp(ctxt->vcpu, 0); 1691 emulate_gp(ctxt, 0);
1545 return X86EMUL_PROPAGATE_FAULT; 1692 return X86EMUL_PROPAGATE_FAULT;
1546 } 1693 }
1547 change_mask |= EFLG_IF; 1694 change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1557 return rc; 1704 return rc;
1558} 1705}
1559 1706
1560static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1708 struct x86_emulate_ops *ops, int seg)
1561{ 1709{
1562 struct decode_cache *c = &ctxt->decode; 1710 struct decode_cache *c = &ctxt->decode;
1563 struct kvm_segment segment;
1564 1711
1565 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
1566 1713
1567 c->src.val = segment.selector; 1714 emulate_push(ctxt, ops);
1568 emulate_push(ctxt);
1569} 1715}
1570 1716
1571static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1583 return rc; 1729 return rc;
1584} 1730}
1585 1731
1586static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1733 struct x86_emulate_ops *ops)
1587{ 1734{
1588 struct decode_cache *c = &ctxt->decode; 1735 struct decode_cache *c = &ctxt->decode;
1589 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1737 int rc = X86EMUL_CONTINUE;
1590 int reg = VCPU_REGS_RAX; 1738 int reg = VCPU_REGS_RAX;
1591 1739
1592 while (reg <= VCPU_REGS_RDI) { 1740 while (reg <= VCPU_REGS_RDI) {
1593 (reg == VCPU_REGS_RSP) ? 1741 (reg == VCPU_REGS_RSP) ?
1594 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1595 1743
1596 emulate_push(ctxt); 1744 emulate_push(ctxt, ops);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE)
1748 return rc;
1749
1597 ++reg; 1750 ++reg;
1598 } 1751 }
1752
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc;
1599} 1757}
1600 1758
1601static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1759static int emulate_popa(struct x86_emulate_ctxt *ctxt,
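
emulate_pusha() now commits each of the eight pushes itself: every iteration queues one stack store via emulate_push() and immediately runs writeback(), and dst.type is then set to OP_NONE so the generic writeback at the end of x86_emulate_insn() doesn't store a ninth time. In outline:

    for (reg = VCPU_REGS_RAX; reg <= VCPU_REGS_RDI; reg++) {
        c->src.val = (reg == VCPU_REGS_RSP) ? old_esp : c->regs[reg];
        emulate_push(ctxt, ops);        /* queue one store in c->dst */
        rc = writeback(ctxt, ops);      /* commit it now */
        if (rc != X86EMUL_CONTINUE)
            return rc;
    }
    c->dst.type = OP_NONE;              /* nothing left to commit */
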
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1695 old_eip = c->eip; 1853 old_eip = c->eip;
1696 c->eip = c->src.val; 1854 c->eip = c->src.val;
1697 c->src.val = old_eip; 1855 c->src.val = old_eip;
1698 emulate_push(ctxt); 1856 emulate_push(ctxt, ops);
1699 break; 1857 break;
1700 } 1858 }
1701 case 4: /* jmp abs */ 1859 case 4: /* jmp abs */
1702 c->eip = c->src.val; 1860 c->eip = c->src.val;
1703 break; 1861 break;
1704 case 6: /* push */ 1862 case 6: /* push */
1705 emulate_push(ctxt); 1863 emulate_push(ctxt, ops);
1706 break; 1864 break;
1707 } 1865 }
1708 return X86EMUL_CONTINUE; 1866 return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1748 return rc; 1906 return rc;
1749} 1907}
1750 1908
1751static inline int writeback(struct x86_emulate_ctxt *ctxt,
1752 struct x86_emulate_ops *ops)
1753{
1754 int rc;
1755 struct decode_cache *c = &ctxt->decode;
1756
1757 switch (c->dst.type) {
1758 case OP_REG:
1759 /* The 4-byte case *is* correct:
1760 * in 64-bit mode we zero-extend.
1761 */
1762 switch (c->dst.bytes) {
1763 case 1:
1764 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1765 break;
1766 case 2:
1767 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1768 break;
1769 case 4:
1770 *c->dst.ptr = (u32)c->dst.val;
1771 break; /* 64b: zero-ext */
1772 case 8:
1773 *c->dst.ptr = c->dst.val;
1774 break;
1775 }
1776 break;
1777 case OP_MEM:
1778 if (c->lock_prefix)
1779 rc = ops->cmpxchg_emulated(
1780 (unsigned long)c->dst.ptr,
1781 &c->dst.orig_val,
1782 &c->dst.val,
1783 c->dst.bytes,
1784 ctxt->vcpu);
1785 else
1786 rc = ops->write_emulated(
1787 (unsigned long)c->dst.ptr,
1788 &c->dst.val,
1789 c->dst.bytes,
1790 ctxt->vcpu);
1791 if (rc != X86EMUL_CONTINUE)
1792 return rc;
1793 break;
1794 case OP_NONE:
1795 /* no writeback */
1796 break;
1797 default:
1798 break;
1799 }
1800 return X86EMUL_CONTINUE;
1801}
1802
1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1804{
1805 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1806 /*
1807 * an sti; sti; sequence only disables interrupts for the first
1808 * instruction. So, if the last instruction, be it emulated or
1809 * not, left the system with the INT_STI flag enabled, it
1810 * means that the last instruction is an sti. We should not
1811 * leave the flag on in this case. The same goes for mov ss
1812 */
1813 if (!(int_shadow & mask))
1814 ctxt->interruptibility = mask;
1815}
1816
1817static inline void 1909static inline void
1818setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1910setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1819 struct kvm_segment *cs, struct kvm_segment *ss) 1911 struct x86_emulate_ops *ops, struct desc_struct *cs,
1912 struct desc_struct *ss)
1820{ 1913{
1821 memset(cs, 0, sizeof(struct kvm_segment)); 1914 memset(cs, 0, sizeof(struct desc_struct));
1822 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1915 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
1823 memset(ss, 0, sizeof(struct kvm_segment)); 1916 memset(ss, 0, sizeof(struct desc_struct));
1824 1917
1825 cs->l = 0; /* will be adjusted later */ 1918 cs->l = 0; /* will be adjusted later */
1826 cs->base = 0; /* flat segment */ 1919 set_desc_base(cs, 0); /* flat segment */
1827 cs->g = 1; /* 4kb granularity */ 1920 cs->g = 1; /* 4kb granularity */
1828 cs->limit = 0xffffffff; /* 4GB limit */ 1921 set_desc_limit(cs, 0xfffff); /* 4GB limit */
1829 cs->type = 0x0b; /* Read, Execute, Accessed */ 1922 cs->type = 0x0b; /* Read, Execute, Accessed */
1830 cs->s = 1; 1923 cs->s = 1;
1831 cs->dpl = 0; /* will be adjusted later */ 1924 cs->dpl = 0; /* will be adjusted later */
1832 cs->present = 1; 1925 cs->p = 1;
1833 cs->db = 1; 1926 cs->d = 1;
1834 1927
1835 ss->unusable = 0; 1928 set_desc_base(ss, 0); /* flat segment */
1836 ss->base = 0; /* flat segment */ 1929 set_desc_limit(ss, 0xfffff); /* 4GB limit */
1837 ss->limit = 0xffffffff; /* 4GB limit */
1838 ss->g = 1; /* 4kb granularity */ 1930 ss->g = 1; /* 4kb granularity */
1839 ss->s = 1; 1931 ss->s = 1;
1840 ss->type = 0x03; /* Read/Write, Accessed */ 1932 ss->type = 0x03; /* Read/Write, Accessed */
1841 ss->db = 1; /* 32bit stack segment */ 1933 ss->d = 1; /* 32bit stack segment */
1842 ss->dpl = 0; 1934 ss->dpl = 0;
1843 ss->present = 1; 1935 ss->p = 1;
1844} 1936}
1845 1937
1846static int 1938static int
1847emulate_syscall(struct x86_emulate_ctxt *ctxt) 1939emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1848{ 1940{
1849 struct decode_cache *c = &ctxt->decode; 1941 struct decode_cache *c = &ctxt->decode;
1850 struct kvm_segment cs, ss; 1942 struct desc_struct cs, ss;
1851 u64 msr_data; 1943 u64 msr_data;
1944 u16 cs_sel, ss_sel;
1852 1945
1853 /* syscall is not available in real mode */ 1946 /* syscall is not available in real mode */
1854 if (ctxt->mode == X86EMUL_MODE_REAL || 1947 if (ctxt->mode == X86EMUL_MODE_REAL ||
1855 ctxt->mode == X86EMUL_MODE_VM86) { 1948 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1949 emulate_ud(ctxt);
1857 return X86EMUL_PROPAGATE_FAULT; 1950 return X86EMUL_PROPAGATE_FAULT;
1858 } 1951 }
1859 1952
1860 setup_syscalls_segments(ctxt, &cs, &ss); 1953 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1861 1954
1862 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1955 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1863 msr_data >>= 32; 1956 msr_data >>= 32;
1864 cs.selector = (u16)(msr_data & 0xfffc); 1957 cs_sel = (u16)(msr_data & 0xfffc);
1865 ss.selector = (u16)(msr_data + 8); 1958 ss_sel = (u16)(msr_data + 8);
1866 1959
1867 if (is_long_mode(ctxt->vcpu)) { 1960 if (is_long_mode(ctxt->vcpu)) {
1868 cs.db = 0; 1961 cs.d = 0;
1869 cs.l = 1; 1962 cs.l = 1;
1870 } 1963 }
1871 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 1964 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1872 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1965 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1966 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1967 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1873 1968
1874 c->regs[VCPU_REGS_RCX] = c->eip; 1969 c->regs[VCPU_REGS_RCX] = c->eip;
1875 if (is_long_mode(ctxt->vcpu)) { 1970 if (is_long_mode(ctxt->vcpu)) {
1876#ifdef CONFIG_X86_64 1971#ifdef CONFIG_X86_64
1877 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1972 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1878 1973
1879 kvm_x86_ops->get_msr(ctxt->vcpu, 1974 ops->get_msr(ctxt->vcpu,
1880 ctxt->mode == X86EMUL_MODE_PROT64 ? 1975 ctxt->mode == X86EMUL_MODE_PROT64 ?
1881 MSR_LSTAR : MSR_CSTAR, &msr_data); 1976 MSR_LSTAR : MSR_CSTAR, &msr_data);
1882 c->eip = msr_data; 1977 c->eip = msr_data;
1883 1978
1884 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1979 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1885 ctxt->eflags &= ~(msr_data | EFLG_RF); 1980 ctxt->eflags &= ~(msr_data | EFLG_RF);
1886#endif 1981#endif
1887 } else { 1982 } else {
1888 /* legacy mode */ 1983 /* legacy mode */
1889 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1984 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1890 c->eip = (u32)msr_data; 1985 c->eip = (u32)msr_data;
1891 1986
1892 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1987 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
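
The switch from struct kvm_segment to struct desc_struct means this code now fills in raw descriptor fields: a flat 4 GiB segment is the 20-bit limit 0xfffff with g=1, which scales in 4 KiB units, rather than the pre-scaled 0xffffffff that kvm_segment carried. The scaling itself (what desc_limit_scaled(), used elsewhere in this patch, computes for g=1):

    unsigned raw_limit = 0xfffff;                            /* 20-bit field */
    unsigned long bytes = ((unsigned long)raw_limit << 12) | 0xfff;
    /* bytes == 0xffffffff: a flat 4 GiB segment */
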
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1896} 1991}
1897 1992
1898static int 1993static int
1899emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1994emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1900{ 1995{
1901 struct decode_cache *c = &ctxt->decode; 1996 struct decode_cache *c = &ctxt->decode;
1902 struct kvm_segment cs, ss; 1997 struct desc_struct cs, ss;
1903 u64 msr_data; 1998 u64 msr_data;
1999 u16 cs_sel, ss_sel;
1904 2000
1905 /* inject #GP if in real mode */ 2001 /* inject #GP if in real mode */
1906 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 if (ctxt->mode == X86EMUL_MODE_REAL) {
1907 kvm_inject_gp(ctxt->vcpu, 0); 2003 emulate_gp(ctxt, 0);
1908 return X86EMUL_PROPAGATE_FAULT; 2004 return X86EMUL_PROPAGATE_FAULT;
1909 } 2005 }
1910 2006
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1912 * Therefore, we inject an #UD. 2008 * Therefore, we inject an #UD.
1913 */ 2009 */
1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2010 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2011 emulate_ud(ctxt);
1916 return X86EMUL_PROPAGATE_FAULT; 2012 return X86EMUL_PROPAGATE_FAULT;
1917 } 2013 }
1918 2014
1919 setup_syscalls_segments(ctxt, &cs, &ss); 2015 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1920 2016
1921 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2017 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1922 switch (ctxt->mode) { 2018 switch (ctxt->mode) {
1923 case X86EMUL_MODE_PROT32: 2019 case X86EMUL_MODE_PROT32:
1924 if ((msr_data & 0xfffc) == 0x0) { 2020 if ((msr_data & 0xfffc) == 0x0) {
1925 kvm_inject_gp(ctxt->vcpu, 0); 2021 emulate_gp(ctxt, 0);
1926 return X86EMUL_PROPAGATE_FAULT; 2022 return X86EMUL_PROPAGATE_FAULT;
1927 } 2023 }
1928 break; 2024 break;
1929 case X86EMUL_MODE_PROT64: 2025 case X86EMUL_MODE_PROT64:
1930 if (msr_data == 0x0) { 2026 if (msr_data == 0x0) {
1931 kvm_inject_gp(ctxt->vcpu, 0); 2027 emulate_gp(ctxt, 0);
1932 return X86EMUL_PROPAGATE_FAULT; 2028 return X86EMUL_PROPAGATE_FAULT;
1933 } 2029 }
1934 break; 2030 break;
1935 } 2031 }
1936 2032
1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2033 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1938 cs.selector = (u16)msr_data; 2034 cs_sel = (u16)msr_data;
1939 cs.selector &= ~SELECTOR_RPL_MASK; 2035 cs_sel &= ~SELECTOR_RPL_MASK;
1940 ss.selector = cs.selector + 8; 2036 ss_sel = cs_sel + 8;
1941 ss.selector &= ~SELECTOR_RPL_MASK; 2037 ss_sel &= ~SELECTOR_RPL_MASK;
1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2038 if (ctxt->mode == X86EMUL_MODE_PROT64
1943 || is_long_mode(ctxt->vcpu)) { 2039 || is_long_mode(ctxt->vcpu)) {
1944 cs.db = 0; 2040 cs.d = 0;
1945 cs.l = 1; 2041 cs.l = 1;
1946 } 2042 }
1947 2043
1948 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2044 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1949 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2045 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2046 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2047 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1950 2048
1951 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2049 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1952 c->eip = msr_data; 2050 c->eip = msr_data;
1953 2051
1954 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2052 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1955 c->regs[VCPU_REGS_RSP] = msr_data; 2053 c->regs[VCPU_REGS_RSP] = msr_data;
1956 2054
1957 return X86EMUL_CONTINUE; 2055 return X86EMUL_CONTINUE;
1958} 2056}
1959 2057
1960static int 2058static int
1961emulate_sysexit(struct x86_emulate_ctxt *ctxt) 2059emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1962{ 2060{
1963 struct decode_cache *c = &ctxt->decode; 2061 struct decode_cache *c = &ctxt->decode;
1964 struct kvm_segment cs, ss; 2062 struct desc_struct cs, ss;
1965 u64 msr_data; 2063 u64 msr_data;
1966 int usermode; 2064 int usermode;
2065 u16 cs_sel, ss_sel;
1967 2066
1968 /* inject #GP if in real mode or Virtual 8086 mode */ 2067 /* inject #GP if in real mode or Virtual 8086 mode */
1969 if (ctxt->mode == X86EMUL_MODE_REAL || 2068 if (ctxt->mode == X86EMUL_MODE_REAL ||
1970 ctxt->mode == X86EMUL_MODE_VM86) { 2069 ctxt->mode == X86EMUL_MODE_VM86) {
1971 kvm_inject_gp(ctxt->vcpu, 0); 2070 emulate_gp(ctxt, 0);
1972 return X86EMUL_PROPAGATE_FAULT; 2071 return X86EMUL_PROPAGATE_FAULT;
1973 } 2072 }
1974 2073
1975 setup_syscalls_segments(ctxt, &cs, &ss); 2074 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1976 2075
1977 if ((c->rex_prefix & 0x8) != 0x0) 2076 if ((c->rex_prefix & 0x8) != 0x0)
1978 usermode = X86EMUL_MODE_PROT64; 2077 usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1981 2080
1982 cs.dpl = 3; 2081 cs.dpl = 3;
1983 ss.dpl = 3; 2082 ss.dpl = 3;
1984 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2083 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1985 switch (usermode) { 2084 switch (usermode) {
1986 case X86EMUL_MODE_PROT32: 2085 case X86EMUL_MODE_PROT32:
1987 cs.selector = (u16)(msr_data + 16); 2086 cs_sel = (u16)(msr_data + 16);
1988 if ((msr_data & 0xfffc) == 0x0) { 2087 if ((msr_data & 0xfffc) == 0x0) {
1989 kvm_inject_gp(ctxt->vcpu, 0); 2088 emulate_gp(ctxt, 0);
1990 return X86EMUL_PROPAGATE_FAULT; 2089 return X86EMUL_PROPAGATE_FAULT;
1991 } 2090 }
1992 ss.selector = (u16)(msr_data + 24); 2091 ss_sel = (u16)(msr_data + 24);
1993 break; 2092 break;
1994 case X86EMUL_MODE_PROT64: 2093 case X86EMUL_MODE_PROT64:
1995 cs.selector = (u16)(msr_data + 32); 2094 cs_sel = (u16)(msr_data + 32);
1996 if (msr_data == 0x0) { 2095 if (msr_data == 0x0) {
1997 kvm_inject_gp(ctxt->vcpu, 0); 2096 emulate_gp(ctxt, 0);
1998 return X86EMUL_PROPAGATE_FAULT; 2097 return X86EMUL_PROPAGATE_FAULT;
1999 } 2098 }
2000 ss.selector = cs.selector + 8; 2099 ss_sel = cs_sel + 8;
2001 cs.db = 0; 2100 cs.d = 0;
2002 cs.l = 1; 2101 cs.l = 1;
2003 break; 2102 break;
2004 } 2103 }
2005 cs.selector |= SELECTOR_RPL_MASK; 2104 cs_sel |= SELECTOR_RPL_MASK;
2006 ss.selector |= SELECTOR_RPL_MASK; 2105 ss_sel |= SELECTOR_RPL_MASK;
2007 2106
2008 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2107 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
2009 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2108 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2109 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2110 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2010 2111
2011 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2112 c->eip = c->regs[VCPU_REGS_RDX];
2012 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2113 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
2013 2114
2014 return X86EMUL_CONTINUE; 2115 return X86EMUL_CONTINUE;
2015} 2116}
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2030 struct x86_emulate_ops *ops, 2131 struct x86_emulate_ops *ops,
2031 u16 port, u16 len) 2132 u16 port, u16 len)
2032{ 2133{
2033 struct kvm_segment tr_seg; 2134 struct desc_struct tr_seg;
2034 int r; 2135 int r;
2035 u16 io_bitmap_ptr; 2136 u16 io_bitmap_ptr;
2036 u8 perm, bit_idx = port & 0x7; 2137 u8 perm, bit_idx = port & 0x7;
2037 unsigned mask = (1 << len) - 1; 2138 unsigned mask = (1 << len) - 1;
2038 2139
2039 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2140 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
2040 if (tr_seg.unusable) 2141 if (!tr_seg.p)
2041 return false; 2142 return false;
2042 if (tr_seg.limit < 103) 2143 if (desc_limit_scaled(&tr_seg) < 103)
2043 return false; 2144 return false;
2044 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2145 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
2045 NULL); 2146 ctxt->vcpu, NULL);
2046 if (r != X86EMUL_CONTINUE) 2147 if (r != X86EMUL_CONTINUE)
2047 return false; 2148 return false;
2048 if (io_bitmap_ptr + port/8 > tr_seg.limit) 2149 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2049 return false; 2150 return false;
2050 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2151 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
2051 ctxt->vcpu, NULL); 2152 &perm, 1, ctxt->vcpu, NULL);
2052 if (r != X86EMUL_CONTINUE) 2153 if (r != X86EMUL_CONTINUE)
2053 return false; 2154 return false;
2054 if ((perm >> bit_idx) & mask) 2155 if ((perm >> bit_idx) & mask)
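
The rewritten check walks the TSS I/O permission machinery straight from the cached descriptor: the 16-bit I/O map base is read from TSS offset 102, then the bits covering the accessed port range are tested; a clear bit permits the access. The index arithmetic for, say, a 2-byte access to port 0x3f9:

    unsigned short port = 0x3f9;
    unsigned len = 2;                   /* bytes accessed */
    unsigned byte_off = port / 8;       /* 127: byte within the bitmap */
    unsigned bit_idx = port & 0x7;      /* 1: first bit within that byte */
    unsigned mask = (1u << len) - 1;    /* 0x3: bits for ports 0x3f9-0x3fa */
    /* denied iff ((perm >> bit_idx) & mask) != 0 */
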
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2066 return true; 2167 return true;
2067} 2168}
2068 2169
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2170static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops, 2171 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss) 2172 struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2165 &err); 2255 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2256 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */ 2257 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2258 emulate_pf(ctxt, old_tss_base, err);
2169 return ret; 2259 return ret;
2170 } 2260 }
2171 2261
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2175 &err); 2265 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2266 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */ 2267 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2268 emulate_pf(ctxt, old_tss_base, err);
2179 return ret; 2269 return ret;
2180 } 2270 }
2181 2271
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2183 &err); 2273 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2274 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */ 2275 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2276 emulate_pf(ctxt, new_tss_base, err);
2187 return ret; 2277 return ret;
2188 } 2278 }
2189 2279
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2196 ctxt->vcpu, &err); 2286 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2287 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */ 2288 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2289 emulate_pf(ctxt, new_tss_base, err);
2200 return ret; 2290 return ret;
2201 } 2291 }
2202 } 2292 }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2238 struct decode_cache *c = &ctxt->decode; 2328 struct decode_cache *c = &ctxt->decode;
2239 int ret; 2329 int ret;
2240 2330
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu); 2331 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
2332 emulate_gp(ctxt, 0);
2333 return X86EMUL_PROPAGATE_FAULT;
2334 }
2242 c->eip = tss->eip; 2335 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2; 2336 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax; 2337 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2304 &err); 2397 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2398 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */ 2399 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2400 emulate_pf(ctxt, old_tss_base, err);
2308 return ret; 2401 return ret;
2309 } 2402 }
2310 2403
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2314 &err); 2407 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2408 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */ 2409 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2410 emulate_pf(ctxt, old_tss_base, err);
2318 return ret; 2411 return ret;
2319 } 2412 }
2320 2413
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2322 &err); 2415 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2416 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */ 2417 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2418 emulate_pf(ctxt, new_tss_base, err);
2326 return ret; 2419 return ret;
2327 } 2420 }
2328 2421
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2335 ctxt->vcpu, &err); 2428 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2429 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */ 2430 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2431 emulate_pf(ctxt, new_tss_base, err);
2339 return ret; 2432 return ret;
2340 } 2433 }
2341 } 2434 }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2352 int ret; 2445 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2446 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base = 2447 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2448 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
2356 u32 desc_limit; 2449 u32 desc_limit;
2357 2450
2358 /* FIXME: old_tss_base == ~0 ? */ 2451 /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2369 if (reason != TASK_SWITCH_IRET) { 2462 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2463 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2464 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0); 2465 emulate_gp(ctxt, 0);
2373 return X86EMUL_PROPAGATE_FAULT; 2466 return X86EMUL_PROPAGATE_FAULT;
2374 } 2467 }
2375 } 2468 }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 if (!next_tss_desc.p || 2471 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2472 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) { 2473 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2474 emulate_ts(ctxt, tss_selector & 0xfffc);
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT; 2475 return X86EMUL_PROPAGATE_FAULT;
2384 } 2476 }
2385 2477
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2517 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0; 2518 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code; 2519 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt); 2520 emulate_push(ctxt, ops);
2429 } 2521 }
2430 2522
2431 return ret; 2523 return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2439 struct decode_cache *c = &ctxt->decode; 2531 struct decode_cache *c = &ctxt->decode;
2440 int rc; 2532 int rc;
2441 2533
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip; 2534 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE; 2535 c->dst.type = OP_NONE;
2446 2536
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2537 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code); 2538 has_error_code, error_code);
2449 2539
2450 if (rc == X86EMUL_CONTINUE) { 2540 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops); 2541 rc = writeback(ctxt, ops);
2542 if (rc == X86EMUL_CONTINUE)
2543 ctxt->eip = c->eip;
2454 } 2544 }
2455 2545
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2546 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
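The hunks above are part of a broader conversion: instead of injecting faults straight into the vcpu (kvm_inject_page_fault, kvm_inject_gp, kvm_queue_exception_e), the emulator now records them through emulate_pf/emulate_gp/emulate_ts and lets the caller deliver them. A minimal userspace sketch of that record-then-deliver pattern follows; the struct layout and helper bodies here are illustrative assumptions, not the patch's exact code.

#include <stdbool.h>
#include <stdint.h>

#define GP_VECTOR 13
#define PF_VECTOR 14

/* Illustrative stand-in for the exception state this series
 * keeps in struct x86_emulate_ctxt. */
struct emu_ctxt {
	int exception;          /* pending vector, or -1 for none */
	uint32_t error_code;
	bool error_code_valid;
};

static void emulate_exception(struct emu_ctxt *ctxt, int vec,
			      uint32_t error, bool valid)
{
	/* Only record the fault; delivery happens later, outside
	 * the emulator, once the caller inspects the context. */
	ctxt->exception = vec;
	ctxt->error_code = error;
	ctxt->error_code_valid = valid;
}

static void emulate_gp(struct emu_ctxt *ctxt, uint32_t err)
{
	emulate_exception(ctxt, GP_VECTOR, err, true);
}

static void emulate_pf(struct emu_ctxt *ctxt, uint64_t addr, uint32_t err)
{
	(void)addr;	/* a real version would latch the fault address too */
	emulate_exception(ctxt, PF_VECTOR, err, true);
}

Keeping the fault in the emulator context rather than pushing it into the vcpu is what allows the remaining FIXMEs about precise fault addresses to be resolved later without touching every call site.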
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2474 int rc = X86EMUL_CONTINUE; 2564 int rc = X86EMUL_CONTINUE;
2475 int saved_dst_type = c->dst.type; 2565 int saved_dst_type = c->dst.type;
2476 2566
2477 ctxt->interruptibility = 0; 2567 ctxt->decode.mem_read.pos = 0;
2478
2479 /* Shadow copy of register state. Committed on successful emulation.
2480 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
2481 * modify them.
2482 */
2483
2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2485 2568
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2569 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2570 emulate_ud(ctxt);
2488 goto done; 2571 goto done;
2489 } 2572 }
2490 2573
2491 /* LOCK prefix is allowed only with some instructions */ 2574 /* LOCK prefix is allowed only with some instructions */
2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2575 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2576 emulate_ud(ctxt);
2494 goto done; 2577 goto done;
2495 } 2578 }
2496 2579
2497 /* Privileged instruction can be executed only in CPL=0 */ 2580 /* Privileged instruction can be executed only in CPL=0 */
2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2581 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2499 kvm_inject_gp(ctxt->vcpu, 0); 2582 emulate_gp(ctxt, 0);
2500 goto done; 2583 goto done;
2501 } 2584 }
2502 2585
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2589 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done: 2590 string_done:
2508 ctxt->restart = false; 2591 ctxt->restart = false;
2509 kvm_rip_write(ctxt->vcpu, c->eip); 2592 ctxt->eip = c->eip;
2510 goto done; 2593 goto done;
2511 } 2594 }
2512 /* The second termination condition only applies for REPE 2595 /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2529 } 2612 }
2530 2613
2531 if (c->src.type == OP_MEM) { 2614 if (c->src.type == OP_MEM) {
2532 rc = ops->read_emulated((unsigned long)c->src.ptr, 2615 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
2533 &c->src.val, 2616 c->src.valptr, c->src.bytes);
2534 c->src.bytes,
2535 ctxt->vcpu);
2536 if (rc != X86EMUL_CONTINUE) 2617 if (rc != X86EMUL_CONTINUE)
2537 goto done; 2618 goto done;
2538 c->src.orig_val = c->src.val; 2619 c->src.orig_val = c->src.val;
2539 } 2620 }
2540 2621
2541 if (c->src2.type == OP_MEM) { 2622 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr, 2623 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
2543 &c->src2.val, 2624 &c->src2.val, c->src2.bytes);
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE) 2625 if (rc != X86EMUL_CONTINUE)
2547 goto done; 2626 goto done;
2548 } 2627 }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2553 2632
2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2633 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2555 /* optimisation - avoid slow emulated read if Mov */ 2634 /* optimisation - avoid slow emulated read if Mov */
2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2635 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
2557 c->dst.bytes, ctxt->vcpu); 2636 &c->dst.val, c->dst.bytes);
2558 if (rc != X86EMUL_CONTINUE) 2637 if (rc != X86EMUL_CONTINUE)
2559 goto done; 2638 goto done;
2560 } 2639 }
@@ -2571,7 +2650,7 @@ special_insn:
2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2650 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2572 break; 2651 break;
2573 case 0x06: /* push es */ 2652 case 0x06: /* push es */
2574 emulate_push_sreg(ctxt, VCPU_SREG_ES); 2653 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2575 break; 2654 break;
2576 case 0x07: /* pop es */ 2655 case 0x07: /* pop es */
2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2656 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2662 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2584 break; 2663 break;
2585 case 0x0e: /* push cs */ 2664 case 0x0e: /* push cs */
2586 emulate_push_sreg(ctxt, VCPU_SREG_CS); 2665 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2587 break; 2666 break;
2588 case 0x10 ... 0x15: 2667 case 0x10 ... 0x15:
2589 adc: /* adc */ 2668 adc: /* adc */
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2669 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 break; 2670 break;
2592 case 0x16: /* push ss */ 2671 case 0x16: /* push ss */
2593 emulate_push_sreg(ctxt, VCPU_SREG_SS); 2672 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2594 break; 2673 break;
2595 case 0x17: /* pop ss */ 2674 case 0x17: /* pop ss */
2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2675 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2681 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2603 break; 2682 break;
2604 case 0x1e: /* push ds */ 2683 case 0x1e: /* push ds */
2605 emulate_push_sreg(ctxt, VCPU_SREG_DS); 2684 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2606 break; 2685 break;
2607 case 0x1f: /* pop ds */ 2686 case 0x1f: /* pop ds */
2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2687 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
2632 emulate_1op("dec", c->dst, ctxt->eflags); 2711 emulate_1op("dec", c->dst, ctxt->eflags);
2633 break; 2712 break;
2634 case 0x50 ... 0x57: /* push reg */ 2713 case 0x50 ... 0x57: /* push reg */
2635 emulate_push(ctxt); 2714 emulate_push(ctxt, ops);
2636 break; 2715 break;
2637 case 0x58 ... 0x5f: /* pop reg */ 2716 case 0x58 ... 0x5f: /* pop reg */
2638 pop_instruction: 2717 pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
2641 goto done; 2720 goto done;
2642 break; 2721 break;
2643 case 0x60: /* pusha */ 2722 case 0x60: /* pusha */
2644 emulate_pusha(ctxt); 2723 rc = emulate_pusha(ctxt, ops);
2724 if (rc != X86EMUL_CONTINUE)
2725 goto done;
2645 break; 2726 break;
2646 case 0x61: /* popa */ 2727 case 0x61: /* popa */
2647 rc = emulate_popa(ctxt, ops); 2728 rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
2655 break; 2736 break;
2656 case 0x68: /* push imm */ 2737 case 0x68: /* push imm */
2657 case 0x6a: /* push imm8 */ 2738 case 0x6a: /* push imm8 */
2658 emulate_push(ctxt); 2739 emulate_push(ctxt, ops);
2659 break; 2740 break;
2660 case 0x6c: /* insb */ 2741 case 0x6c: /* insb */
2661 case 0x6d: /* insw/insd */ 2742 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u); 2743 c->dst.bytes = min(c->dst.bytes, 4u);
2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2744 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2664 c->dst.bytes)) { 2745 c->dst.bytes)) {
2665 kvm_inject_gp(ctxt->vcpu, 0); 2746 emulate_gp(ctxt, 0);
2666 goto done; 2747 goto done;
2667 } 2748 }
2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, 2749 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
2674 c->src.bytes = min(c->src.bytes, 4u); 2755 c->src.bytes = min(c->src.bytes, 4u);
2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2756 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2676 c->src.bytes)) { 2757 c->src.bytes)) {
2677 kvm_inject_gp(ctxt->vcpu, 0); 2758 emulate_gp(ctxt, 0);
2678 goto done; 2759 goto done;
2679 } 2760 }
2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], 2761 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
2707 } 2788 }
2708 break; 2789 break;
2709 case 0x84 ... 0x85: 2790 case 0x84 ... 0x85:
2791 test:
2710 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
2711 break; 2793 break;
2712 case 0x86 ... 0x87: /* xchg */ 2794 case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
2735 break; 2817 break;
2736 case 0x88 ... 0x8b: /* mov */ 2818 case 0x88 ... 0x8b: /* mov */
2737 goto mov; 2819 goto mov;
2738 case 0x8c: { /* mov r/m, sreg */ 2820 case 0x8c: /* mov r/m, sreg */
2739 struct kvm_segment segreg; 2821 if (c->modrm_reg > VCPU_SREG_GS) {
2740 2822 emulate_ud(ctxt);
2741 if (c->modrm_reg <= VCPU_SREG_GS)
2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2743 else {
2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2745 goto done; 2823 goto done;
2746 } 2824 }
2747 c->dst.val = segreg.selector; 2825 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2748 break; 2826 break;
2749 }
2750 case 0x8d: /* lea r16/r32, m */ 2827 case 0x8d: /* lea r16/r32, m */
2751 c->dst.val = c->modrm_ea; 2828 c->dst.val = c->modrm_ea;
2752 break; 2829 break;
@@ -2757,12 +2834,12 @@ special_insn:
2757 2834
2758 if (c->modrm_reg == VCPU_SREG_CS || 2835 if (c->modrm_reg == VCPU_SREG_CS ||
2759 c->modrm_reg > VCPU_SREG_GS) { 2836 c->modrm_reg > VCPU_SREG_GS) {
2760 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2837 emulate_ud(ctxt);
2761 goto done; 2838 goto done;
2762 } 2839 }
2763 2840
2764 if (c->modrm_reg == VCPU_SREG_SS) 2841 if (c->modrm_reg == VCPU_SREG_SS)
2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2842 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2766 2843
2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2844 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2768 2845
@@ -2775,19 +2852,19 @@ special_insn:
2775 goto done; 2852 goto done;
2776 break; 2853 break;
2777 case 0x90: /* nop / xchg r8,rax */ 2854 case 0x90: /* nop / xchg r8,rax */
2778 if (!(c->rex_prefix & 1)) { /* nop */ 2855 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
2779 c->dst.type = OP_NONE; 2856 c->dst.type = OP_NONE; /* nop */
2780 break; 2857 break;
2781 } 2858 }
2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2859 case 0x91 ... 0x97: /* xchg reg,rax */
2783 c->src.type = c->dst.type = OP_REG; 2860 c->src.type = OP_REG;
2784 c->src.bytes = c->dst.bytes = c->op_bytes; 2861 c->src.bytes = c->op_bytes;
2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2862 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2786 c->src.val = *(c->src.ptr); 2863 c->src.val = *(c->src.ptr);
2787 goto xchg; 2864 goto xchg;
2788 case 0x9c: /* pushf */ 2865 case 0x9c: /* pushf */
2789 c->src.val = (unsigned long) ctxt->eflags; 2866 c->src.val = (unsigned long) ctxt->eflags;
2790 emulate_push(ctxt); 2867 emulate_push(ctxt, ops);
2791 break; 2868 break;
2792 case 0x9d: /* popf */ 2869 case 0x9d: /* popf */
2793 c->dst.type = OP_REG; 2870 c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
2797 if (rc != X86EMUL_CONTINUE) 2874 if (rc != X86EMUL_CONTINUE)
2798 goto done; 2875 goto done;
2799 break; 2876 break;
2800 case 0xa0 ... 0xa1: /* mov */ 2877 case 0xa0 ... 0xa3: /* mov */
2801 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2802 c->dst.val = c->src.val;
2803 break;
2804 case 0xa2 ... 0xa3: /* mov */
2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2806 break;
2807 case 0xa4 ... 0xa5: /* movs */ 2878 case 0xa4 ... 0xa5: /* movs */
2808 goto mov; 2879 goto mov;
2809 case 0xa6 ... 0xa7: /* cmps */ 2880 case 0xa6 ... 0xa7: /* cmps */
2810 c->dst.type = OP_NONE; /* Disable writeback. */ 2881 c->dst.type = OP_NONE; /* Disable writeback. */
2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2882 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2812 goto cmp; 2883 goto cmp;
2884 case 0xa8 ... 0xa9: /* test ax, imm */
2885 goto test;
2813 case 0xaa ... 0xab: /* stos */ 2886 case 0xaa ... 0xab: /* stos */
2814 c->dst.val = c->regs[VCPU_REGS_RAX]; 2887 c->dst.val = c->regs[VCPU_REGS_RAX];
2815 break; 2888 break;
@@ -2855,19 +2928,23 @@ special_insn:
2855 long int rel = c->src.val; 2928 long int rel = c->src.val;
2856 c->src.val = (unsigned long) c->eip; 2929 c->src.val = (unsigned long) c->eip;
2857 jmp_rel(c, rel); 2930 jmp_rel(c, rel);
2858 emulate_push(ctxt); 2931 emulate_push(ctxt, ops);
2859 break; 2932 break;
2860 } 2933 }
2861 case 0xe9: /* jmp rel */ 2934 case 0xe9: /* jmp rel */
2862 goto jmp; 2935 goto jmp;
2863 case 0xea: /* jmp far */ 2936 case 0xea: { /* jmp far */
2937 unsigned short sel;
2864 jump_far: 2938 jump_far:
2865 if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2866 VCPU_SREG_CS)) 2940
2941 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2867 goto done; 2942 goto done;
2868 2943
2869 c->eip = c->src.val; 2944 c->eip = 0;
2945 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2870 break; 2946 break;
2947 }
2871 case 0xeb: 2948 case 0xeb:
2872 jmp: /* jmp rel short */ 2949 jmp: /* jmp rel short */
2873 jmp_rel(c, c->src.val); 2950 jmp_rel(c, c->src.val);
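The jmp-far rewrite above reads the ptr16:32 immediate in its architectural layout: the first op_bytes hold the new instruction pointer and the following two bytes hold the CS selector, which the old code conflated in src.val/src2.val. A self-contained check of that decode, under the same little-endian assumption the emulator itself makes:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* jmp far 0x08:0x00401000 with 32-bit operand size:
	 * four offset bytes first, then a two-byte selector. */
	uint8_t imm[6] = { 0x00, 0x10, 0x40, 0x00, 0x08, 0x00 };
	unsigned op_bytes = 4;
	uint64_t eip = 0;
	uint16_t sel;

	memcpy(&eip, imm, op_bytes);       /* offset */
	memcpy(&sel, imm + op_bytes, 2);   /* selector */

	assert(eip == 0x00401000 && sel == 0x08);
	return 0;
}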
@@ -2879,20 +2956,20 @@ special_insn:
2879 do_io_in: 2956 do_io_in:
2880 c->dst.bytes = min(c->dst.bytes, 4u); 2957 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2958 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0); 2959 emulate_gp(ctxt, 0);
2883 goto done; 2960 goto done;
2884 } 2961 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2962 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val)) 2963 &c->dst.val))
2887 goto done; /* IO is needed */ 2964 goto done; /* IO is needed */
2888 break; 2965 break;
2889 case 0xee: /* out al,dx */ 2966 case 0xee: /* out dx,al */
2890 case 0xef: /* out (e/r)ax,dx */ 2967 case 0xef: /* out dx,(e/r)ax */
2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2968 c->src.val = c->regs[VCPU_REGS_RDX];
2892 do_io_out: 2969 do_io_out:
2893 c->dst.bytes = min(c->dst.bytes, 4u); 2970 c->dst.bytes = min(c->dst.bytes, 4u);
2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2971 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2895 kvm_inject_gp(ctxt->vcpu, 0); 2972 emulate_gp(ctxt, 0);
2896 goto done; 2973 goto done;
2897 } 2974 }
2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 2975 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2993 c->dst.type = OP_NONE; /* Disable writeback. */
2917 break; 2994 break;
2918 case 0xfa: /* cli */ 2995 case 0xfa: /* cli */
2919 if (emulator_bad_iopl(ctxt, ops)) 2996 if (emulator_bad_iopl(ctxt, ops)) {
2920 kvm_inject_gp(ctxt->vcpu, 0); 2997 emulate_gp(ctxt, 0);
2921 else { 2998 goto done;
2999 } else {
2922 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 ctxt->eflags &= ~X86_EFLAGS_IF;
2923 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 c->dst.type = OP_NONE; /* Disable writeback. */
2924 } 3002 }
2925 break; 3003 break;
2926 case 0xfb: /* sti */ 3004 case 0xfb: /* sti */
2927 if (emulator_bad_iopl(ctxt, ops)) 3005 if (emulator_bad_iopl(ctxt, ops)) {
2928 kvm_inject_gp(ctxt->vcpu, 0); 3006 emulate_gp(ctxt, 0);
2929 else { 3007 goto done;
2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 3008 } else {
3009 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2931 ctxt->eflags |= X86_EFLAGS_IF; 3010 ctxt->eflags |= X86_EFLAGS_IF;
2932 c->dst.type = OP_NONE; /* Disable writeback. */ 3011 c->dst.type = OP_NONE; /* Disable writeback. */
2933 } 3012 }
@@ -2964,11 +3043,12 @@ writeback:
2964 c->dst.type = saved_dst_type; 3043 c->dst.type = saved_dst_type;
2965 3044
2966 if ((c->d & SrcMask) == SrcSI) 3045 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3046 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
2968 &c->src); 3047 VCPU_REGS_RSI, &c->src);
2969 3048
2970 if ((c->d & DstMask) == DstDI) 3049 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 3050 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
3051 &c->dst);
2972 3052
2973 if (c->rep_prefix && (c->d & String)) { 3053 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read; 3054 struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
2981 (rc->end != 0 && rc->end == rc->pos)) 3061 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false; 3062 ctxt->restart = false;
2983 } 3063 }
2984 3064 /*
2985 /* Commit shadow register state. */ 3065 * reset read cache here in case string instruction is restarted
2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3066 * without decoding
2987 kvm_rip_write(ctxt->vcpu, c->eip); 3067 */
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags); 3068 ctxt->decode.mem_read.end = 0;
3069 ctxt->eip = c->eip;
2989 3070
2990done: 3071done:
2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3072 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
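mem_read is a small per-instruction read cache: pos is rewound to 0 each time emulation (re)starts, and end is cleared only at writeback, so a string instruction that exits to userspace mid-way can be restarted without re-issuing reads that already completed. A toy model of that behaviour; the structure and the slow_read callback are assumptions for illustration:

#include <stdint.h>
#include <string.h>

#define CACHE_SIZE 1024

struct read_cache {
	uint8_t data[CACHE_SIZE];
	unsigned long pos;	/* consumer cursor, reset per attempt */
	unsigned long end;	/* bytes already fetched, reset at writeback */
};

static int cached_read(struct read_cache *mc,
		       int (*slow_read)(uint64_t addr, void *dst, unsigned n),
		       uint64_t addr, void *dest, unsigned size)
{
	if (mc->pos + size > mc->end) {
		int rc = slow_read(addr, mc->data + mc->end, size);
		if (rc)
			return rc;	/* e.g. must exit for MMIO */
		mc->end += size;
	}
	memcpy(dest, mc->data + mc->pos, size);
	mc->pos += size;
	return 0;
}

On a restart, pos begins again at 0 while end still covers the bytes fetched before the exit, so the cached copy is replayed instead of the read being performed twice.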
@@ -3051,7 +3132,7 @@ twobyte_insn:
3051 c->dst.type = OP_NONE; 3132 c->dst.type = OP_NONE;
3052 break; 3133 break;
3053 case 5: /* not defined */ 3134 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3135 emulate_ud(ctxt);
3055 goto done; 3136 goto done;
3056 case 7: /* invlpg*/ 3137 case 7: /* invlpg*/
3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3138 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
3063 } 3144 }
3064 break; 3145 break;
3065 case 0x05: /* syscall */ 3146 case 0x05: /* syscall */
3066 rc = emulate_syscall(ctxt); 3147 rc = emulate_syscall(ctxt, ops);
3067 if (rc != X86EMUL_CONTINUE) 3148 if (rc != X86EMUL_CONTINUE)
3068 goto done; 3149 goto done;
3069 else 3150 else
@@ -3073,8 +3154,11 @@ twobyte_insn:
3073 emulate_clts(ctxt->vcpu); 3154 emulate_clts(ctxt->vcpu);
3074 c->dst.type = OP_NONE; 3155 c->dst.type = OP_NONE;
3075 break; 3156 break;
3076 case 0x08: /* invd */
3077 case 0x09: /* wbinvd */ 3157 case 0x09: /* wbinvd */
3158 kvm_emulate_wbinvd(ctxt->vcpu);
3159 c->dst.type = OP_NONE;
3160 break;
3161 case 0x08: /* invd */
3078 case 0x0d: /* GrpP (prefetch) */ 3162 case 0x0d: /* GrpP (prefetch) */
3079 case 0x18: /* Grp16 (prefetch/nop) */ 3163 case 0x18: /* Grp16 (prefetch/nop) */
3080 c->dst.type = OP_NONE; 3164 c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
3084 case 1: 3168 case 1:
3085 case 5 ... 7: 3169 case 5 ... 7:
3086 case 9 ... 15: 3170 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3171 emulate_ud(ctxt);
3088 goto done; 3172 goto done;
3089 } 3173 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3174 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
3093 case 0x21: /* mov from dr to reg */ 3177 case 0x21: /* mov from dr to reg */
3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3178 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3179 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3180 emulate_ud(ctxt);
3097 goto done; 3181 goto done;
3098 } 3182 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3183 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3100 c->dst.type = OP_NONE; /* no writeback */ 3184 c->dst.type = OP_NONE; /* no writeback */
3101 break; 3185 break;
3102 case 0x22: /* mov reg, cr */ 3186 case 0x22: /* mov reg, cr */
3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3187 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
3188 emulate_gp(ctxt, 0);
3189 goto done;
3190 }
3104 c->dst.type = OP_NONE; 3191 c->dst.type = OP_NONE;
3105 break; 3192 break;
3106 case 0x23: /* mov from reg to dr */ 3193 case 0x23: /* mov from reg to dr */
3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3194 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3195 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3196 emulate_ud(ctxt);
3197 goto done;
3198 }
3199
3200 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3201 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3202 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3203 /* #UD condition is already handled by the code above */
3204 emulate_gp(ctxt, 0);
3110 goto done; 3205 goto done;
3111 } 3206 }
3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3207
3113 c->dst.type = OP_NONE; /* no writeback */ 3208 c->dst.type = OP_NONE; /* no writeback */
3114 break; 3209 break;
3115 case 0x30: 3210 case 0x30:
3116 /* wrmsr */ 3211 /* wrmsr */
3117 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3212 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3213 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3214 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3120 kvm_inject_gp(ctxt->vcpu, 0); 3215 emulate_gp(ctxt, 0);
3121 goto done; 3216 goto done;
3122 } 3217 }
3123 rc = X86EMUL_CONTINUE; 3218 rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
3125 break; 3220 break;
3126 case 0x32: 3221 case 0x32:
3127 /* rdmsr */ 3222 /* rdmsr */
3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3223 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3129 kvm_inject_gp(ctxt->vcpu, 0); 3224 emulate_gp(ctxt, 0);
3130 goto done; 3225 goto done;
3131 } else { 3226 } else {
3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3227 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
3136 c->dst.type = OP_NONE; 3231 c->dst.type = OP_NONE;
3137 break; 3232 break;
3138 case 0x34: /* sysenter */ 3233 case 0x34: /* sysenter */
3139 rc = emulate_sysenter(ctxt); 3234 rc = emulate_sysenter(ctxt, ops);
3140 if (rc != X86EMUL_CONTINUE) 3235 if (rc != X86EMUL_CONTINUE)
3141 goto done; 3236 goto done;
3142 else 3237 else
3143 goto writeback; 3238 goto writeback;
3144 break; 3239 break;
3145 case 0x35: /* sysexit */ 3240 case 0x35: /* sysexit */
3146 rc = emulate_sysexit(ctxt); 3241 rc = emulate_sysexit(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE) 3242 if (rc != X86EMUL_CONTINUE)
3148 goto done; 3243 goto done;
3149 else 3244 else
@@ -3160,7 +3255,7 @@ twobyte_insn:
3160 c->dst.type = OP_NONE; 3255 c->dst.type = OP_NONE;
3161 break; 3256 break;
3162 case 0xa0: /* push fs */ 3257 case 0xa0: /* push fs */
3163 emulate_push_sreg(ctxt, VCPU_SREG_FS); 3258 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3164 break; 3259 break;
3165 case 0xa1: /* pop fs */ 3260 case 0xa1: /* pop fs */
3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3261 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3274 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3180 break; 3275 break;
3181 case 0xa8: /* push gs */ 3276 case 0xa8: /* push gs */
3182 emulate_push_sreg(ctxt, VCPU_SREG_GS); 3277 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3183 break; 3278 break;
3184 case 0xa9: /* pop gs */ 3279 case 0xa9: /* pop gs */
3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3280 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
33 34
34#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
36 38
37#include "irq.h" 39#include "irq.h"
38#include "i8254.h" 40#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
243{ 245{
244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
245 irq_ack_notifier); 247 irq_ack_notifier);
246 raw_spin_lock(&ps->inject_lock); 248 int value;
247 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 249
250 spin_lock(&ps->inject_lock);
251 value = atomic_dec_return(&ps->pit_timer.pending);
252 if (value < 0)
253 /* spurious acks can be generated if, for example, the
254 * PIC is being reset. Handle it gracefully here
255 */
248 atomic_inc(&ps->pit_timer.pending); 256 atomic_inc(&ps->pit_timer.pending);
257 else if (value > 0)
258 /* in this case, we had multiple outstanding pit interrupts
259 * that we needed to inject. Reinject
260 */
261 queue_work(ps->pit->wq, &ps->pit->expired);
249 ps->irq_ack = 1; 262 ps->irq_ack = 1;
250 raw_spin_unlock(&ps->inject_lock); 263 spin_unlock(&ps->inject_lock);
251} 264}
252 265
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 266void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 276 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 277}
265 278
266static void destroy_pit_timer(struct kvm_timer *pt) 279static void destroy_pit_timer(struct kvm_pit *pit)
267{ 280{
268 pr_debug("execute del timer!\n"); 281 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
269 hrtimer_cancel(&pt->timer); 282 cancel_work_sync(&pit->expired);
270} 283}
271 284
272static bool kpit_is_periodic(struct kvm_timer *ktimer) 285static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
280 .is_periodic = kpit_is_periodic, 293 .is_periodic = kpit_is_periodic,
281}; 294};
282 295
296static void pit_do_work(struct work_struct *work)
297{
298 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
299 struct kvm *kvm = pit->kvm;
300 struct kvm_vcpu *vcpu;
301 int i;
302 struct kvm_kpit_state *ps = &pit->pit_state;
303 int inject = 0;
304
305 /* Try to inject pending interrupts when
306 * last one has been acked.
307 */
308 spin_lock(&ps->inject_lock);
309 if (ps->irq_ack) {
310 ps->irq_ack = 0;
311 inject = 1;
312 }
313 spin_unlock(&ps->inject_lock);
314 if (inject) {
315 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
316 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
317
318 /*
319 * Provides NMI watchdog support via Virtual Wire mode.
320 * The route is: PIT -> PIC -> LVT0 in NMI mode.
321 *
322 * Note: Our Virtual Wire implementation is simplified, only
323 * propagating PIT interrupts to all VCPUs when they have set
324 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
325 * VCPU0, and only if its LVT0 is in EXTINT mode.
326 */
327 if (kvm->arch.vapics_in_nmi_mode > 0)
328 kvm_for_each_vcpu(i, vcpu, kvm)
329 kvm_apic_nmi_wd_deliver(vcpu);
330 }
331}
332
333static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
334{
335 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
336 struct kvm_pit *pt = ktimer->kvm->arch.vpit;
337
338 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
339 atomic_inc(&ktimer->pending);
340 queue_work(pt->wq, &pt->expired);
341 }
342
343 if (ktimer->t_ops->is_periodic(ktimer)) {
344 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
345 return HRTIMER_RESTART;
346 } else
347 return HRTIMER_NORESTART;
348}
349
283static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 350static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284{ 351{
285 struct kvm_timer *pt = &ps->pit_timer; 352 struct kvm_timer *pt = &ps->pit_timer;
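The PIT interrupt path is restructured here: the hrtimer callback (pit_timer_fn) only bumps the pending count and queues pit->expired, and pit_do_work performs the actual kvm_set_irq() calls from a dedicated single-threaded workqueue. Moving the injection out of hard-irq context is also why inject_lock can be relaxed from raw_spinlock_t to spinlock_t below. A kernel-style sketch of the handoff pattern, not a complete module:

#include <linux/hrtimer.h>
#include <linux/workqueue.h>
#include <linux/atomic.h>

struct toy_pit {
	struct hrtimer timer;
	atomic_t pending;
	struct workqueue_struct *wq;
	struct work_struct expired;
	s64 period_ns;
};

static void toy_do_work(struct work_struct *work)
{
	/* Process context: safe to take sleeping locks and raise
	 * the guest interrupt, as pit_do_work does above. */
}

static enum hrtimer_restart toy_timer_fn(struct hrtimer *t)
{
	struct toy_pit *pit = container_of(t, struct toy_pit, timer);

	atomic_inc(&pit->pending);
	queue_work(pit->wq, &pit->expired);	/* defer the heavy work */

	hrtimer_add_expires_ns(&pit->timer, pit->period_ns);
	return HRTIMER_RESTART;
}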
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
291 358
292 /* TODO The new value only affected after the retriggered */ 359 /* TODO The new value only affected after the retriggered */
293 hrtimer_cancel(&pt->timer); 360 hrtimer_cancel(&pt->timer);
361 cancel_work_sync(&ps->pit->expired);
294 pt->period = interval; 362 pt->period = interval;
295 ps->is_periodic = is_period; 363 ps->is_periodic = is_period;
296 364
297 pt->timer.function = kvm_timer_fn; 365 pt->timer.function = pit_timer_fn;
298 pt->t_ops = &kpit_ops; 366 pt->t_ops = &kpit_ops;
299 pt->kvm = ps->pit->kvm; 367 pt->kvm = ps->pit->kvm;
300 pt->vcpu = pt->kvm->bsp_vcpu;
301 368
302 atomic_set(&pt->pending, 0); 369 atomic_set(&pt->pending, 0);
303 ps->irq_ack = 1; 370 ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
346 } 413 }
347 break; 414 break;
348 default: 415 default:
349 destroy_pit_timer(&ps->pit_timer); 416 destroy_pit_timer(kvm->arch.vpit);
350 } 417 }
351} 418}
352 419
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
625 692
626 mutex_init(&pit->pit_state.lock); 693 mutex_init(&pit->pit_state.lock);
627 mutex_lock(&pit->pit_state.lock); 694 mutex_lock(&pit->pit_state.lock);
628 raw_spin_lock_init(&pit->pit_state.inject_lock); 695 spin_lock_init(&pit->pit_state.inject_lock);
696
697 pit->wq = create_singlethread_workqueue("kvm-pit-wq");
698 if (!pit->wq) {
699 mutex_unlock(&pit->pit_state.lock);
700 kfree(pit);
701 return NULL;
702 }
703 INIT_WORK(&pit->expired, pit_do_work);
629 704
630 kvm->arch.vpit = pit; 705 kvm->arch.vpit = pit;
631 pit->kvm = kvm; 706 pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
677 struct hrtimer *timer; 752 struct hrtimer *timer;
678 753
679 if (kvm->arch.vpit) { 754 if (kvm->arch.vpit) {
755 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
756 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
757 &kvm->arch.vpit->speaker_dev);
680 kvm_unregister_irq_mask_notifier(kvm, 0, 758 kvm_unregister_irq_mask_notifier(kvm, 0,
681 &kvm->arch.vpit->mask_notifier); 759 &kvm->arch.vpit->mask_notifier);
682 kvm_unregister_irq_ack_notifier(kvm, 760 kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 762 mutex_lock(&kvm->arch.vpit->pit_state.lock);
685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 763 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
686 hrtimer_cancel(timer); 764 hrtimer_cancel(timer);
765 cancel_work_sync(&kvm->arch.vpit->expired);
687 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
688 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 767 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
768 destroy_workqueue(kvm->arch.vpit->wq);
689 kfree(kvm->arch.vpit); 769 kfree(kvm->arch.vpit);
690 } 770 }
691} 771}
692
693static void __inject_pit_timer_intr(struct kvm *kvm)
694{
695 struct kvm_vcpu *vcpu;
696 int i;
697
698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
700
701 /*
702 * Provides NMI watchdog support via Virtual Wire mode.
703 * The route is: PIT -> PIC -> LVT0 in NMI mode.
704 *
705 * Note: Our Virtual Wire implementation is simplified, only
706 * propagating PIT interrupts to all VCPUs when they have set
707 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
708 * VCPU0, and only if its LVT0 is in EXTINT mode.
709 */
710 if (kvm->arch.vapics_in_nmi_mode > 0)
711 kvm_for_each_vcpu(i, vcpu, kvm)
712 kvm_apic_nmi_wd_deliver(vcpu);
713}
714
715void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
716{
717 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
718 struct kvm *kvm = vcpu->kvm;
719 struct kvm_kpit_state *ps;
720
721 if (pit) {
722 int inject = 0;
723 ps = &pit->pit_state;
724
725 /* Try to inject pending interrupts when
726 * last one has been acked.
727 */
728 raw_spin_lock(&ps->inject_lock);
729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
730 ps->irq_ack = 0;
731 inject = 1;
732 }
733 raw_spin_unlock(&ps->inject_lock);
734 if (inject)
735 __inject_pit_timer_intr(kvm);
736 }
737}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 raw_spinlock_t inject_lock; 30 spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
@@ -40,6 +40,8 @@ struct kvm_pit {
40 struct kvm_kpit_state pit_state; 40 struct kvm_kpit_state pit_state;
41 int irq_source_id; 41 int irq_source_id;
42 struct kvm_irq_mask_notifier mask_notifier; 42 struct kvm_irq_mask_notifier mask_notifier;
43 struct workqueue_struct *wq;
44 struct work_struct expired;
43}; 45};
44 46
45#define KVM_PIT_BASE_ADDRESS 0x40 47#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
6 * 7 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
33#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
34#include "trace.h" 35#include "trace.h"
35 36
37static void pic_irq_request(struct kvm *kvm, int level);
38
36static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock) 40 __acquires(&s->lock)
38{ 41{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock) 46 __releases(&s->lock)
44{ 47{
45 bool wakeup = s->wakeup_needed; 48 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu; 49 struct kvm_vcpu *vcpu, *found = NULL;
50 int i;
47 51
48 s->wakeup_needed = false; 52 s->wakeup_needed = false;
49 53
50 raw_spin_unlock(&s->lock); 54 raw_spin_unlock(&s->lock);
51 55
52 if (wakeup) { 56 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu; 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
54 if (vcpu) 58 if (kvm_apic_accept_pic_intr(vcpu)) {
55 kvm_vcpu_kick(vcpu); 59 found = vcpu;
60 break;
61 }
62 }
63
64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 kvm_vcpu_kick(found);
56 } 68 }
57} 69}
58 70
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
173 pic_set_irq1(&s->pics[0], 2, 0); 185 pic_set_irq1(&s->pics[0], 2, 0);
174 } 186 }
175 irq = pic_get_irq(&s->pics[0]); 187 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) 188 pic_irq_request(s->kvm, irq >= 0);
177 s->irq_request(s->irq_request_opaque, 1);
178 else
179 s->irq_request(s->irq_request_opaque, 0);
180} 189}
181 190
182void kvm_pic_update_irq(struct kvm_pic *s) 191void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
261void kvm_pic_reset(struct kvm_kpic_state *s) 270void kvm_pic_reset(struct kvm_kpic_state *s)
262{ 271{
263 int irq; 272 int irq;
264 struct kvm *kvm = s->pics_state->irq_request_opaque; 273 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
265 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
266 u8 irr = s->irr, isr = s->imr; 274 u8 irr = s->irr, isr = s->imr;
267 275
268 s->last_irr = 0; 276 s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
301 /* 309 /*
302 * deassert a pending interrupt 310 * deassert a pending interrupt
303 */ 311 */
304 s->pics_state->irq_request(s->pics_state-> 312 pic_irq_request(s->pics_state->kvm, 0);
305 irq_request_opaque, 0);
306 s->init_state = 1; 313 s->init_state = 1;
307 s->init4 = val & 1; 314 s->init4 = val & 1;
308 if (val & 0x02) 315 if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
356 } 363 }
357 } else 364 } else
358 switch (s->init_state) { 365 switch (s->init_state) {
359 case 0: /* normal mode */ 366 case 0: { /* normal mode */
367 u8 imr_diff = s->imr ^ val,
368 off = (s == &s->pics_state->pics[0]) ? 0 : 8;
360 s->imr = val; 369 s->imr = val;
370 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
371 if (imr_diff & (1 << irq))
372 kvm_fire_mask_notifiers(
373 s->pics_state->kvm,
374 SELECT_PIC(irq + off),
375 irq + off,
376 !!(s->imr & (1 << irq)));
361 pic_update_irq(s->pics_state); 377 pic_update_irq(s->pics_state);
362 break; 378 break;
379 }
363 case 1: 380 case 1:
364 s->irq_base = val & 0xf8; 381 s->irq_base = val & 0xf8;
365 s->init_state = 2; 382 s->init_state = 2;
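Rewriting the IMR in normal mode now diffs the old and new masks and fires kvm_fire_mask_notifiers() once per pin whose state changed; this is the PIC half of the "Call mask notifiers from pic" change in this pull, and it lets mask notifiers (such as the PIT's) observe masking done at the PIC rather than only at the I/O APIC. The diff loop in isolation, with the notifier stubbed out:

#include <stdint.h>
#include <stdio.h>

static void fire_mask_notifier(int irq, int masked)
{
	printf("irq %d is now %s\n", irq, masked ? "masked" : "unmasked");
}

int main(void)
{
	uint8_t imr = 0xff;	/* everything masked */
	uint8_t val = 0xfe;	/* guest unmasks IRQ0 */
	uint8_t imr_diff = imr ^ val;

	for (int irq = 0; irq < 8; irq++)
		if (imr_diff & (1 << irq))
			fire_mask_notifier(irq, !!(val & (1 << irq)));
	/* prints: irq 0 is now unmasked */
	return 0;
}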
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
518/* 535/*
519 * callback when PIC0 irq status changed 536 * callback when PIC0 irq status changed
520 */ 537 */
521static void pic_irq_request(void *opaque, int level) 538static void pic_irq_request(struct kvm *kvm, int level)
522{ 539{
523 struct kvm *kvm = opaque;
524 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 540 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
525 struct kvm_pic *s = pic_irqchip(kvm); 541 struct kvm_pic *s = pic_irqchip(kvm);
526 int irq = pic_get_irq(&s->pics[0]); 542 int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
549 s->kvm = kvm; 565 s->kvm = kvm;
550 s->pics[0].elcr_mask = 0xf8; 566 s->pics[0].elcr_mask = 0xf8;
551 s->pics[1].elcr_mask = 0xde; 567 s->pics[1].elcr_mask = 0xde;
552 s->irq_request = pic_irq_request;
553 s->irq_request_opaque = kvm;
554 s->pics[0].pics_state = s; 568 s->pics[0].pics_state = s;
555 s->pics[1].pics_state = s; 569 s->pics[1].pics_state = s;
556 570
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
89void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 90void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
90{ 91{
91 kvm_inject_apic_timer_irqs(vcpu); 92 kvm_inject_apic_timer_irqs(vcpu);
92 kvm_inject_pit_timer_irqs(vcpu);
93 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
94} 94}
95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
38struct kvm; 38struct kvm;
39struct kvm_vcpu; 39struct kvm_vcpu;
40 40
41typedef void irq_request_func(void *opaque, int level);
42
43struct kvm_kpic_state { 41struct kvm_kpic_state {
44 u8 last_irr; /* edge detection */ 42 u8 last_irr; /* edge detection */
45 u8 irr; /* interrupt request register */ 43 u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
67 unsigned pending_acks; 65 unsigned pending_acks;
68 struct kvm *kvm; 66 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
70 irq_request_func *irq_request;
71 void *irq_request_opaque;
72 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
73 struct kvm_io_device dev; 69 struct kvm_io_device dev;
74 void (*ack_notifier)(void *opaque, int irq); 70 void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
36 36
37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38{ 38{
39 might_sleep(); /* on svm */
40
39 if (!test_bit(VCPU_EXREG_PDPTR, 41 if (!test_bit(VCPU_EXREG_PDPTR,
40 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
41 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
69 return kvm_read_cr4_bits(vcpu, ~0UL); 71 return kvm_read_cr4_bits(vcpu, ~0UL);
70} 72}
71 73
74static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
75{
76 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78}
79
72#endif 80#endif
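kvm_read_edx_eax() assembles the EDX:EAX pair consumed by WRMSR-style instructions; the "& -1u" masks each register read down to its low 32 bits so stale upper halves of the 64-bit register file cannot leak in. The composition, checked standalone:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t rax = 0xdeadbeef11223344ULL;	/* upper half is stale */
	uint64_t rdx = 0xcafebabe55667788ULL;

	uint64_t v = (rax & 0xffffffffULL)
		   | ((rdx & 0xffffffffULL) << 32);

	assert(v == 0x5566778811223344ULL);
	return 0;
}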
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
328 "dest_mode 0x%x, short_hand 0x%x\n", 329 "dest_mode 0x%x, short_hand 0x%x\n",
329 target, source, dest, dest_mode, short_hand); 330 target, source, dest, dest_mode, short_hand);
330 331
331 ASSERT(!target); 332 ASSERT(target);
332 switch (short_hand) { 333 switch (short_hand) {
333 case APIC_DEST_NOSHORT: 334 case APIC_DEST_NOSHORT:
334 if (dest_mode == 0) 335 if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
533 struct kvm_vcpu *vcpu = apic->vcpu; 534 struct kvm_vcpu *vcpu = apic->vcpu;
534 struct kvm_run *run = vcpu->run; 535 struct kvm_run *run = vcpu->run;
535 536
536 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 537 kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
537 run->tpr_access.rip = kvm_rip_read(vcpu); 538 run->tpr_access.rip = kvm_rip_read(vcpu);
538 run->tpr_access.is_write = write; 539 run->tpr_access.is_write = write;
539} 540}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1107 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1107 int r = 0; 1108 int r = 0;
1108 1109
1109 if (kvm_vcpu_is_bsp(vcpu)) { 1110 if (!apic_hw_enabled(vcpu->arch.apic))
1110 if (!apic_hw_enabled(vcpu->arch.apic)) 1111 r = 1;
1111 r = 1; 1112 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1112 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1114 r = 1;
1114 r = 1;
1115 }
1116 return r; 1115 return r;
1117} 1116}
1118 1117
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..0dcc95e09876 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
32#include <linux/compiler.h> 33#include <linux/compiler.h>
33#include <linux/srcu.h> 34#include <linux/srcu.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/uaccess.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/cmpxchg.h> 39#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
90#define PT_FIRST_AVAIL_BITS_SHIFT 9 92#define PT_FIRST_AVAIL_BITS_SHIFT 9
91#define PT64_SECOND_AVAIL_BITS_SHIFT 52 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
92 94
93#define VALID_PAGE(x) ((x) != INVALID_PAGE)
94
95#define PT64_LEVEL_BITS 9 95#define PT64_LEVEL_BITS 9
96 96
97#define PT64_LEVEL_SHIFT(level) \ 97#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
173 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
174 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
175 175
176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177 177
178static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -288,6 +288,35 @@ static void __set_spte(u64 *sptep, u64 spte)
288#endif 288#endif
289} 289}
290 290
291static u64 __xchg_spte(u64 *sptep, u64 new_spte)
292{
293#ifdef CONFIG_X86_64
294 return xchg(sptep, new_spte);
295#else
296 u64 old_spte;
297
298 do {
299 old_spte = *sptep;
300 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
301
302 return old_spte;
303#endif
304}
305
306static void update_spte(u64 *sptep, u64 new_spte)
307{
308 u64 old_spte;
309
310 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
311 !is_rmap_spte(*sptep))
312 __set_spte(sptep, new_spte);
313 else {
314 old_spte = __xchg_spte(sptep, new_spte);
315 if (old_spte & shadow_accessed_mask)
316 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
317 }
318}
319
291static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 320static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292 struct kmem_cache *base_cache, int min) 321 struct kmem_cache *base_cache, int min)
293{ 322{
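__xchg_spte() needs an atomic 64-bit exchange: x86-64 has xchg, but on 32-bit hosts the same effect is built from a cmpxchg64 retry loop, and update_spte() relies on the returned old value so an accessed bit set concurrently by hardware is never lost. The same loop written with C11 atomics as a userspace sketch:

#include <stdatomic.h>
#include <stdint.h>

/* Atomically install 'new' and return the displaced value,
 * emulating xchg with compare-and-swap as the i386 path does. */
static uint64_t xchg_via_cas(_Atomic uint64_t *p, uint64_t new)
{
	uint64_t old = atomic_load(p);

	/* On failure, compare_exchange_weak reloads 'old' with the
	 * current contents, so each retry starts from fresh state. */
	while (!atomic_compare_exchange_weak(p, &old, new))
		;
	return old;
}

Getting the old spte back atomically is the whole point: update_spte() checks it against shadow_accessed_mask and marks the backing page accessed before the bit disappears.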
@@ -304,10 +333,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
304 return 0; 333 return 0;
305} 334}
306 335
307static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 336static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
337 struct kmem_cache *cache)
308{ 338{
309 while (mc->nobjs) 339 while (mc->nobjs)
310 kfree(mc->objects[--mc->nobjs]); 340 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
311} 341}
312 342
313static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 343static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +385,11 @@ out:
355 385
356static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 386static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
357{ 387{
358 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 388 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
359 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 389 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 390 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
361 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 391 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
392 mmu_page_header_cache);
362} 393}
363 394
364static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 395static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +410,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 410
380static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 411static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
381{ 412{
382 kfree(pc); 413 kmem_cache_free(pte_chain_cache, pc);
383} 414}
384 415
385static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 416static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +421,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 421
391static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 422static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
392{ 423{
393 kfree(rd); 424 kmem_cache_free(rmap_desc_cache, rd);
425}
426
427static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
428{
429 if (!sp->role.direct)
430 return sp->gfns[index];
431
432 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
433}
434
435static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
436{
437 if (sp->role.direct)
438 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
439 else
440 sp->gfns[index] = gfn;
394} 441}
395 442
396/* 443/*
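Direct-mapped shadow pages no longer carry a gfns[] array; kvm_mmu_page_get_gfn() derives the gfn from the page's base gfn, the entry index, and the level, since each entry at level L spans 2^((L-1)*9) gfns with PT64_LEVEL_BITS = 9. A quick numeric check of that formula:

#include <assert.h>
#include <stdint.h>

#define PT64_LEVEL_BITS 9

static uint64_t direct_gfn(uint64_t base_gfn, int level, int index)
{
	return base_gfn + ((uint64_t)index << ((level - 1) * PT64_LEVEL_BITS));
}

int main(void)
{
	assert(direct_gfn(0x1000, 1, 5) == 0x1005);	      /* 4K ptes */
	assert(direct_gfn(0x1000, 2, 5) == 0x1000 + 5 * 512); /* 2M pdes */
	return 0;
}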
@@ -403,8 +450,8 @@ static int *slot_largepage_idx(gfn_t gfn,
403{ 450{
404 unsigned long idx; 451 unsigned long idx;
405 452
406 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 453 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
407 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 454 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
408 return &slot->lpage_info[level - 2][idx].write_count; 455 return &slot->lpage_info[level - 2][idx].write_count;
409} 456}
410 457
@@ -414,9 +461,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
414 int *write_count; 461 int *write_count;
415 int i; 462 int i;
416 463
417 gfn = unalias_gfn(kvm, gfn); 464 slot = gfn_to_memslot(kvm, gfn);
418
419 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 for (i = PT_DIRECTORY_LEVEL; 465 for (i = PT_DIRECTORY_LEVEL;
421 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 466 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
422 write_count = slot_largepage_idx(gfn, slot, i); 467 write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +475,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
430 int *write_count; 475 int *write_count;
431 int i; 476 int i;
432 477
433 gfn = unalias_gfn(kvm, gfn); 478 slot = gfn_to_memslot(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
435 for (i = PT_DIRECTORY_LEVEL; 479 for (i = PT_DIRECTORY_LEVEL;
436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 480 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
437 write_count = slot_largepage_idx(gfn, slot, i); 481 write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +491,7 @@ static int has_wrprotected_page(struct kvm *kvm,
447 struct kvm_memory_slot *slot; 491 struct kvm_memory_slot *slot;
448 int *largepage_idx; 492 int *largepage_idx;
449 493
450 gfn = unalias_gfn(kvm, gfn); 494 slot = gfn_to_memslot(kvm, gfn);
451 slot = gfn_to_memslot_unaliased(kvm, gfn);
452 if (slot) { 495 if (slot) {
453 largepage_idx = slot_largepage_idx(gfn, slot, level); 496 largepage_idx = slot_largepage_idx(gfn, slot, level);
454 return *largepage_idx; 497 return *largepage_idx;
@@ -501,7 +544,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
501 544
502/* 545/*
503 * Take gfn and return the reverse mapping to it. 546 * Take gfn and return the reverse mapping to it.
504 * Note: gfn must be unaliased before this function get called
505 */ 547 */
506 548
507static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 549static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +555,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 555 if (likely(level == PT_PAGE_TABLE_LEVEL))
514 return &slot->rmap[gfn - slot->base_gfn]; 556 return &slot->rmap[gfn - slot->base_gfn];
515 557
516 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 558 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
517 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 559 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
518 560
519 return &slot->lpage_info[level - 2][idx].rmap_pde; 561 return &slot->lpage_info[level - 2][idx].rmap_pde;
520} 562}
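The large-page index math switches from dividing by KVM_PAGES_PER_HPAGE(level) to shifting by KVM_HPAGE_GFN_SHIFT(level); the two are equivalent because huge-page spans are powers of two (512 gfns, shift 9, for x86 2M pages), and the shift avoids a 64-bit division on 32-bit hosts. The equivalence, spelled out:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t gfn = 0x12345, base_gfn = 0x12000;
	unsigned shift = 9;		/* 2M page: 512 gfns */
	uint64_t pages = 1ULL << shift;

	assert((gfn >> shift) - (base_gfn >> shift)
	       == gfn / pages - base_gfn / pages);
	return 0;
}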
@@ -541,9 +583,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
541 583
542 if (!is_rmap_spte(*spte)) 584 if (!is_rmap_spte(*spte))
543 return count; 585 return count;
544 gfn = unalias_gfn(vcpu->kvm, gfn);
545 sp = page_header(__pa(spte)); 586 sp = page_header(__pa(spte));
546 sp->gfns[spte - sp->spt] = gfn; 587 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
547 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 588 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
548 if (!*rmapp) { 589 if (!*rmapp) {
549 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 590 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +641,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
600 struct kvm_rmap_desc *desc; 641 struct kvm_rmap_desc *desc;
601 struct kvm_rmap_desc *prev_desc; 642 struct kvm_rmap_desc *prev_desc;
602 struct kvm_mmu_page *sp; 643 struct kvm_mmu_page *sp;
603 pfn_t pfn; 644 gfn_t gfn;
604 unsigned long *rmapp; 645 unsigned long *rmapp;
605 int i; 646 int i;
606 647
607 if (!is_rmap_spte(*spte))
608 return;
609 sp = page_header(__pa(spte)); 648 sp = page_header(__pa(spte));
610 pfn = spte_to_pfn(*spte); 649 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
611 if (*spte & shadow_accessed_mask) 650 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
612 kvm_set_pfn_accessed(pfn);
613 if (is_writable_pte(*spte))
614 kvm_set_pfn_dirty(pfn);
615 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
616 if (!*rmapp) { 651 if (!*rmapp) {
617 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 652 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
618 BUG(); 653 BUG();
@@ -644,6 +679,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
644 } 679 }
645} 680}
646 681
682static void set_spte_track_bits(u64 *sptep, u64 new_spte)
683{
684 pfn_t pfn;
685 u64 old_spte = *sptep;
686
687 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
688 old_spte & shadow_accessed_mask) {
689 __set_spte(sptep, new_spte);
690 } else
691 old_spte = __xchg_spte(sptep, new_spte);
692
693 if (!is_rmap_spte(old_spte))
694 return;
695 pfn = spte_to_pfn(old_spte);
696 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
697 kvm_set_pfn_accessed(pfn);
698 if (is_writable_pte(old_spte))
699 kvm_set_pfn_dirty(pfn);
700}
701
702static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
703{
704 set_spte_track_bits(sptep, new_spte);
705 rmap_remove(kvm, sptep);
706}
707
647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 708static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
648{ 709{
649 struct kvm_rmap_desc *desc; 710 struct kvm_rmap_desc *desc;
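
The new set_spte_track_bits()/drop_spte() pair above replaces the open-coded
rmap_remove() + __set_spte() sequence used throughout the rest of this patch.
The motivation: once hardware can set the accessed bit in an spte behind the
MMU's back, reading the old spte and then storing over it can lose that
update, so the helper exchanges the spte atomically with __xchg_spte()
whenever the accessed bit might still be clear, and only afterwards
propagates the old value's accessed/dirty state to the backing page. A
minimal sketch of the before/after calling pattern (illustrative only, not
part of the diff):

	/* old idiom: two plain stores, can miss a concurrent A/D update */
	rmap_remove(kvm, sptep);
	__set_spte(sptep, shadow_trap_nonpresent_pte);

	/* new idiom: atomic exchange plus A/D propagation, then unlink */
	drop_spte(kvm, sptep, shadow_trap_nonpresent_pte);
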
@@ -676,7 +737,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	u64 *spte;
 	int i, write_protected = 0;
 
-	gfn = unalias_gfn(kvm, gfn);
 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
 	spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +745,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +769,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 		if (is_writable_pte(*spte)) {
-			rmap_remove(kvm, spte);
+			drop_spte(kvm, spte,
+				  shadow_trap_nonpresent_pte);
 			--kvm->stat.lpages;
-			__set_spte(spte, shadow_trap_nonpresent_pte);
 			spte = NULL;
 			write_protected = 1;
 		}
@@ -731,8 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		rmap_remove(kvm, spte);
-		__set_spte(spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -754,8 +813,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			rmap_remove(kvm, spte);
-			__set_spte(spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +821,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
-			if (is_writable_pte(*spte))
-				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			__set_spte(spte, new_spte);
+			new_spte &= ~shadow_accessed_mask;
+			set_spte_track_bits(spte, new_spte);
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -799,8 +856,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 		ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
 		for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-			int idx = gfn_offset;
-			idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+			unsigned long idx;
+			int sh;
+
+			sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+			idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+				(memslot->base_gfn >> sh);
 			ret |= handler(kvm,
 				       &memslot->lpage_info[j][idx].rmap_pde,
 				       data);
@@ -863,7 +924,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	sp = page_header(__pa(spte));
 
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +954,12 @@ static int is_empty_shadow_page(u64 *spt)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
+	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
-	__free_page(virt_to_page(sp->gfns));
-	kfree(sp);
+	if (!sp->role.direct)
+		__free_page(virt_to_page(sp->gfns));
+	kmem_cache_free(mmu_page_header_cache, sp);
 	++kvm->arch.n_free_mmu_pages;
 }
 
@@ -907,13 +969,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-					       u64 *parent_pte)
+					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
 
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	if (!direct)
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+						  PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1062,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
-
 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 	struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1071,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(parent_sp);
-		mmu_parent_walk(parent_sp, fn);
+		fn(parent_sp, sp->parent_pte);
 		return;
 	}
+
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
+			u64 *spte = pte_chain->parent_ptes[i];
+
+			if (!spte)
 				break;
-			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(parent_sp);
-			mmu_parent_walk(parent_sp, fn);
+			parent_sp = page_header(__pa(spte));
+			fn(parent_sp, spte);
 		}
 }
 
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	unsigned int index;
-	struct kvm_mmu_page *sp = page_header(__pa(spte));
-
-	index = spte - sp->spt;
-	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
-		sp->unsync_children++;
-	WARN_ON(!sp->unsync_children);
+	mmu_parent_walk(sp, mark_unsync);
 }
 
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
+	unsigned int index;
 
-	if (!sp->parent_pte)
+	index = spte - sp->spt;
+	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
 		return;
-
-	if (!sp->multimapped) {
-		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+	if (sp->unsync_children++)
 		return;
-	}
-
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
-		}
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
-	kvm_mmu_update_parents_unsync(sp);
-	return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
-	mmu_parent_walk(sp, unsync_walk_fn);
-	kvm_mmu_update_parents_unsync(sp);
+	kvm_mmu_mark_parents_unsync(sp);
 }
 
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1114,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-			       struct kvm_mmu_page *sp)
+			       struct kvm_mmu_page *sp, bool clear_unsync)
 {
 	return 1;
 }
@@ -1123,35 +1160,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 	int i, ret, nr_unsync_leaf = 0;
 
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 
-		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
-			struct kvm_mmu_page *child;
-			child = page_header(ent & PT64_BASE_ADDR_MASK);
-
-			if (child->unsync_children) {
-				if (mmu_pages_add(pvec, child, i))
-					return -ENOSPC;
-
-				ret = __mmu_unsync_walk(child, pvec);
-				if (!ret)
-					__clear_bit(i, sp->unsync_child_bitmap);
-				else if (ret > 0)
-					nr_unsync_leaf += ret;
-				else
-					return ret;
-			}
+		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+			goto clear_child_bitmap;
+
+		child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+		if (child->unsync_children) {
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+
+			ret = __mmu_unsync_walk(child, pvec);
+			if (!ret)
+				goto clear_child_bitmap;
+			else if (ret > 0)
+				nr_unsync_leaf += ret;
+			else
+				return ret;
+		} else if (child->unsync) {
+			nr_unsync_leaf++;
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+		} else
+			goto clear_child_bitmap;
 
-		if (child->unsync) {
-			nr_unsync_leaf++;
-			if (mmu_pages_add(pvec, child, i))
-				return -ENOSPC;
-		}
-	}
+		continue;
+
+clear_child_bitmap:
+		__clear_bit(i, sp->unsync_child_bitmap);
+		sp->unsync_children--;
+		WARN_ON((int)sp->unsync_children < 0);
 	}
 
-	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
-		sp->unsync_children = 0;
 
 	return nr_unsync_leaf;
 }
@@ -1166,26 +1208,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	return __mmu_unsync_walk(sp, pvec);
 }
 
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: found role %x\n",
-				 __func__, sp->role.word);
-			return sp;
-		}
-	return NULL;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
@@ -1194,20 +1216,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	--kvm->stat.mmu_unsync;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
+  hlist_for_each_entry(sp, pos,						\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+	if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
+  hlist_for_each_entry(sp, pos,						\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+	if ((sp)->gfn != (gfn) || (sp)->role.direct ||			\
+		(sp)->role.invalid) {} else
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			   struct list_head *invalid_list, bool clear_unsync)
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
 
-	if (rmap_write_protect(vcpu->kvm, sp->gfn))
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	kvm_unlink_unsync_page(vcpu->kvm, sp);
-	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+	if (clear_unsync)
+		kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
 
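
The for_each_gfn_sp()/for_each_gfn_indirect_valid_sp() macros introduced
above replace the open-coded hash-bucket walks that previously needed
index/bucket locals at every call site. The `if (filtered-out) {} else` tail
is what makes them usable like a for loop: the caller's next statement binds
to the `else`, so non-matching shadow pages fall into the empty branch while
matching ones execute the body, and the empty then-arm avoids the
dangling-else hazard a bare `if (match)` would create. A hedged usage sketch
(handle_page() is a hypothetical callback, not a KVM function):

	struct kvm_mmu_page *s;
	struct hlist_node *pos;

	for_each_gfn_indirect_valid_sp(kvm, s, gfn, pos)
		handle_page(s);	/* runs only for valid indirect sps of gfn */
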
@@ -1215,6 +1253,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return 0;
 }
 
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+				   struct kvm_mmu_page *sp)
+{
+	LIST_HEAD(invalid_list);
+	int ret;
+
+	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+	if (ret)
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+	return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			 struct list_head *invalid_list)
+{
+	return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
+	bool flush = false;
+
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (!s->unsync)
+			continue;
+
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+			continue;
+		}
+		kvm_unlink_unsync_page(vcpu->kvm, s);
+		flush = true;
+	}
+
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+	if (flush)
+		kvm_mmu_flush_tlb(vcpu);
+}
+
 struct mmu_page_path {
 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
 	unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1365,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *sp;
 	struct mmu_page_path parents;
 	struct kvm_mmu_pages pages;
+	LIST_HEAD(invalid_list);
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1378,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			kvm_flush_remote_tlbs(vcpu->kvm);
 
 		for_each_sp(pages, sp, parents, i) {
-			kvm_sync_page(vcpu, sp);
+			kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 		}
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_pages_init(parent, &parents, &pages);
 	}
@@ -1310,11 +1396,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     u64 *parent_pte)
 {
 	union kvm_mmu_page_role role;
-	unsigned index;
 	unsigned quadrant;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
+	bool need_sync = false;
 
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
@@ -1322,40 +1407,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 		role.cr4_pae = 0;
 	role.access = access;
-	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
-		if (sp->gfn == gfn) {
-			if (sp->unsync)
-				if (kvm_sync_page(vcpu, sp))
-					continue;
+	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+		if (!need_sync && sp->unsync)
+			need_sync = true;
 
-			if (sp->role.word != role.word)
-				continue;
+		if (sp->role.word != role.word)
+			continue;
 
-			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-			if (sp->unsync_children) {
-				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(sp);
-			}
-			trace_kvm_mmu_get_page(sp, false);
-			return sp;
-		}
+		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+			break;
+
+		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+		if (sp->unsync_children) {
+			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+			kvm_mmu_mark_parents_unsync(sp);
+		} else if (sp->unsync)
+			kvm_mmu_mark_parents_unsync(sp);
+
+		trace_kvm_mmu_get_page(sp, false);
+		return sp;
+	}
 	++vcpu->kvm->stat.mmu_cache_miss;
-	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
 	if (!sp)
 		return sp;
 	sp->gfn = gfn;
 	sp->role = role;
-	hlist_add_head(&sp->hash_link, bucket);
+	hlist_add_head(&sp->hash_link,
+		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
+		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+			kvm_sync_pages(vcpu, gfn);
+
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1492,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	--iterator->level;
 }
 
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+	u64 spte;
+
+	spte = __pa(sp->spt)
+		| PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	__set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	if (is_large_pte(*sptep)) {
+		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+				 unsigned direct_access)
+{
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+		struct kvm_mmu_page *child;
+
+		/*
+		 * For the direct sp, if the guest pte's dirty bit
+		 * changed from clean to dirty, it will corrupt the
+		 * sp's access: allow writable in the read-only sp,
+		 * so we should update the spte at this point to get
+		 * a new sp with the correct access.
+		 */
+		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+		if (child->role.access == direct_access)
+			return;
+
+		mmu_page_remove_parent_pte(child, sptep);
+		__set_spte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
@@ -1422,7 +1553,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 			} else {
 				if (is_large_pte(ent))
 					--kvm->stat.lpages;
-				rmap_remove(kvm, &pt[i]);
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
 			}
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1596,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
-				   struct kvm_mmu_page *parent)
+				   struct kvm_mmu_page *parent,
+				   struct list_head *invalid_list)
 {
 	int i, zapped = 0;
 	struct mmu_page_path parents;
@@ -1478,7 +1611,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 		struct kvm_mmu_page *sp;
 
 		for_each_sp(pages, sp, parents, i) {
-			kvm_mmu_zap_page(kvm, sp);
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 			mmu_pages_clear_parents(&parents);
 			zapped++;
 		}
@@ -1488,32 +1621,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	return zapped;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list)
 {
 	int ret;
 
-	trace_kvm_mmu_zap_page(sp);
+	trace_kvm_mmu_prepare_zap_page(sp);
 	++kvm->stat.mmu_shadow_zapped;
-	ret = mmu_zap_unsync_children(kvm, sp);
+	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
-	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
-		hlist_del(&sp->hash_link);
-		kvm_mmu_free_page(kvm, sp);
+		/* Count self */
+		ret++;
+		list_move(&sp->link, invalid_list);
 	} else {
-		sp->role.invalid = 1;
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
 	}
+
+	sp->role.invalid = 1;
 	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 }
 
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	if (list_empty(invalid_list))
+		return;
+
+	kvm_flush_remote_tlbs(kvm);
+
+	do {
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		WARN_ON(!sp->role.invalid || sp->root_count);
+		kvm_mmu_free_page(kvm, sp);
+	} while (!list_empty(invalid_list));
+
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
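
kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() split the old
kvm_mmu_zap_page() into two phases: prepare unlinks the shadow page and parks
it on a caller-supplied invalid_list, and commit performs a single
kvm_flush_remote_tlbs() before freeing everything on the list. Batching the
flush is the point; the old code flushed remote TLBs once per zapped page.
The caller pattern, sketched under the assumption that mmu_lock is held
(for_each_victim() is a stand-in for whatever loop selects pages, not a real
KVM helper):

	LIST_HEAD(invalid_list);

	spin_lock(&kvm->mmu_lock);
	for_each_victim(kvm, sp)	/* hypothetical selection loop */
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	/* one remote TLB flush for the whole batch, then the frees */
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
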
@@ -1521,6 +1674,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
 	int used_pages;
+	LIST_HEAD(invalid_list);
 
 	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
 	used_pages = max(0, used_pages);
@@ -1538,9 +1692,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_zap_page(kvm, page);
-			used_pages--;
+			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+							       &invalid_list);
 		}
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		kvm_nr_mmu_pages = used_pages;
 		kvm->arch.n_free_mmu_pages = 0;
 	}
@@ -1553,47 +1708,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 	int r;
 
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	r = 0;
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct) {
-			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
-				 sp->role.word);
-			r = 1;
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+			 sp->role.word);
+		r = 1;
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	return r;
 }
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *nn;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: zap %lx %x\n",
-				 __func__, gfn, sp->role.word);
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+		pgprintk("%s: zap %lx %x\n",
+			 __func__, gfn, sp->role.word);
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1867,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *s;
-	struct hlist_node *node, *n;
-
-	index = kvm_page_table_hashfn(sp->gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	/* don't unsync if pagetable is shadowed with multiple roles */
-	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != sp->gfn || s->role.direct)
-			continue;
-		if (s->role.word != sp->role.word)
-			return 1;
-	}
 	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
 	kvm_mmu_mark_parents_unsync(sp);
-
 	mmu_convert_notrap(sp);
-	return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (s->unsync)
+			continue;
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		__kvm_unsync_page(vcpu, s);
+	}
 }
 
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
-	struct kvm_mmu_page *shadow;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+	bool need_unsync = false;
 
-	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-	if (shadow) {
-		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (!can_unsync)
 			return 1;
-		if (shadow->unsync)
-			return 0;
-		if (can_unsync && oos_shadow)
-			return kvm_unsync_page(vcpu, shadow);
-		return 1;
+
+		if (s->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+
+		if (!need_unsync && !s->unsync) {
+			if (!oos_shadow)
+				return 1;
+			need_unsync = true;
+		}
 	}
+	if (need_unsync)
+		kvm_unsync_pages(vcpu, gfn);
 	return 0;
 }
 
@@ -1804,13 +1952,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+		&& !user_fault)) {
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			spte = shadow_trap_nonpresent_pte;
-			goto set_pte;
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			goto done;
 		}
 
 		spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1990,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	__set_spte(sptep, spte);
+	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+		kvm_set_pfn_dirty(pfn);
+	update_spte(sptep, spte);
+done:
 	return ret;
 }
 
@@ -1853,7 +2005,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 bool reset_host_protection)
 {
 	int was_rmapped = 0;
-	int was_writable = is_writable_pte(*sptep);
 	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2029,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
@@ -1890,7 +2040,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		      reset_host_protection)) {
 		if (write_fault)
 			*ptwrite = 1;
-		kvm_x86_ops->tlb_flush(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
 	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2054,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, sptep, gfn);
-		kvm_release_pfn_clean(pfn);
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, sptep, gfn);
-	} else {
-		if (was_writable)
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
 	}
+	kvm_release_pfn_clean(pfn);
 	if (speculative) {
 		vcpu->arch.last_pte_updated = sptep;
 		vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2086,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		}
 
 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
-			pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+			u64 base_addr = iterator.addr;
+
+			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+			pseudo_gfn = base_addr >> PAGE_SHIFT;
 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2108,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+	char buf[1];
+	void __user *hva;
+	int r;
+
+	/* Touch the page, so send SIGBUS */
+	hva = (void __user *)gfn_to_hva(kvm, gfn);
+	r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+	kvm_release_pfn_clean(pfn);
+	if (is_hwpoison_pfn(pfn)) {
+		kvm_send_hwpoison_signal(kvm, gfn);
+		return 0;
+	} else if (is_fault_pfn(pfn))
+		return -EFAULT;
+
+	return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
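
kvm_handle_bad_page() gives the fault paths a single place to classify an
error pfn: for a hardware-poisoned page it touches the poisoned hva, which
should make the host memory-failure code deliver SIGBUS to the faulting task,
and the fault counts as handled; an ordinary fault pfn becomes -EFAULT to
userspace; anything else keeps the old "return 1" behaviour. A sketch of the
contract as the callers below use it (illustrative, mirroring
nonpaging_map() and tdp_page_fault()):

	pfn = gfn_to_pfn(vcpu->kvm, gfn);
	if (is_error_pfn(pfn))
		/* 0: hwpoison reported via SIGBUS, fault handled;
		 * -EFAULT: propagated to userspace;
		 * 1: fall back to the old error behaviour */
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
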
@@ -1983,10 +2154,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2178,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_mmu_page *sp;
+	LIST_HEAD(invalid_list);
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
@@ -2018,8 +2188,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 		sp = page_header(root);
 		--sp->root_count;
-		if (!sp->root_count && sp->role.invalid)
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+		if (!sp->root_count && sp->role.invalid) {
+			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+		}
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
@@ -2032,10 +2204,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 			sp = page_header(root);
 			--sp->root_count;
 			if (!sp->root_count && sp->role.invalid)
-				kvm_mmu_zap_page(vcpu->kvm, sp);
+				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+							 &invalid_list);
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
@@ -2045,7 +2219,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	int ret = 0;
 
 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		ret = 1;
 	}
 
@@ -2073,6 +2247,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2103,6 +2278,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = i << 30;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2198,10 +2374,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
@@ -2243,7 +2417,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu);
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2631,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		/* mmu.free() should set root_hpa = INVALID_PAGE */
 		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2650,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
-	spin_lock(&vcpu->kvm->mmu_lock);
-	kvm_mmu_free_some_pages(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
@@ -2508,7 +2678,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
-			rmap_remove(vcpu->kvm, spte);
+			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2699,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
 	}
 
+	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+		return;
+
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2722,15 @@ static bool need_remote_flush(u64 old, u64 new)
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+				    bool remote_flush, bool local_flush)
 {
-	if (need_remote_flush(old, new))
+	if (zap_page)
+		return;
+
+	if (remote_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
-	else
+	else if (local_flush)
 		kvm_mmu_flush_tlb(vcpu);
 }
 
@@ -2603,10 +2780,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       bool guest_initiated)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	union kvm_mmu_page_role mask = { .word = 0 };
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
-	struct hlist_head *bucket;
-	unsigned index;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 	u64 entry, gentry;
 	u64 *spte;
 	unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2796,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int npte;
 	int r;
 	int invlpg_counter;
+	bool remote_flush, local_flush, zap_page;
+
+	zap_page = remote_flush = local_flush = false;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -2674,13 +2854,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			vcpu->arch.last_pte_updated = NULL;
 		}
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 
-restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
-			continue;
+	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
@@ -2697,8 +2873,8 @@ restart:
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			if (kvm_mmu_zap_page(vcpu->kvm, sp))
-				goto restart;
+			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2722,16 +2898,22 @@ restart:
 			if (quadrant != sp->role.quadrant)
 				continue;
 		}
+		local_flush = true;
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (gentry)
+			if (gentry &&
+			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+			      & mask.word))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+			if (!remote_flush && need_remote_flush(entry, *spte))
+				remote_flush = true;
 			++spte;
 		}
 	}
+	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2941,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+	int free_pages;
+	LIST_HEAD(invalid_list);
+
+	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+	while (free_pages < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						       &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2983,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		return 1;
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
-		return 0;
+		/* fall through */
 	case EMULATE_FAIL:
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-		vcpu->run->internal.ndata = 0;
 		return 0;
 	default:
 		BUG();
@@ -2896,7 +3081,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		pt = sp->spt;
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
-			if (pt[i] & PT_WRITABLE_MASK)
+			if (is_writable_pte(pt[i]))
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
 	kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3090,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
 
 	spin_lock(&kvm->mmu_lock);
restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		if (kvm_mmu_zap_page(kvm, sp))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
-
-	kvm_flush_remote_tlbs(kvm);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+					       struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_zap_page(kvm, page) + 1;
+	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3122,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int npages, idx, freed_pages;
+		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3130,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 			  kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+							  &invalid_list);
 			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
 
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 	}
@@ -3074,7 +3263,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	kvm_set_cr3(vcpu, vcpu->arch.cr3);
+	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
 	return 1;
 }
 
@@ -3331,9 +3520,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-	if (*sptep & PT_WRITABLE_MASK) {
+	if (is_writable_pte(*sptep)) {
 		rev_sp = page_header(__pa(sptep));
-		gfn = rev_sp->gfns[sptep - rev_sp->spt];
+		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
 		if (!gfn_to_memslot(kvm, gfn)) {
 			if (!printk_ratelimit())
@@ -3347,8 +3536,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 			return;
 		}
 
-		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    rev_sp->role.level);
+		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 				return;
@@ -3381,7 +3569,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 
 			if (!(ent & PT_PRESENT_MASK))
 				continue;
-			if (!(ent & PT_WRITABLE_MASK))
+			if (!is_writable_pte(ent))
 				continue;
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
@@ -3409,13 +3597,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->unsync)
 			continue;
 
-		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
-		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		while (spte) {
-			if (*spte & PT_WRITABLE_MASK)
+			if (is_writable_pte(*spte))
 				printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %lx role %x\n",
 				__func__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
190 TP_ARGS(sp) 190 TP_ARGS(sp)
191); 191);
192 192
193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 TP_PROTO(struct kvm_mmu_page *sp), 194 TP_PROTO(struct kvm_mmu_page *sp),
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
118{ 119{
119 pt_element_t pte; 120 pt_element_t pte;
120 gfn_t table_gfn; 121 gfn_t table_gfn;
121 unsigned index, pt_access, pte_access; 122 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 123 gpa_t pte_gpa;
123 int rsvd_fault = 0; 124 bool eperm, present, rsvd_fault;
124 125
125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault); 127 fetch_fault);
127walk: 128walk:
129 present = true;
130 eperm = rsvd_fault = false;
128 walker->level = vcpu->arch.mmu.root_level; 131 walker->level = vcpu->arch.mmu.root_level;
129 pte = vcpu->arch.cr3; 132 pte = vcpu->arch.cr3;
130#if PTTYPE == 64 133#if PTTYPE == 64
131 if (!is_long_mode(vcpu)) { 134 if (!is_long_mode(vcpu)) {
132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
133 trace_kvm_mmu_paging_element(pte, walker->level); 136 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte)) 137 if (!is_present_gpte(pte)) {
135 goto not_present; 138 present = false;
139 goto error;
140 }
136 --walker->level; 141 --walker->level;
137 } 142 }
138#endif 143#endif
@@ -150,37 +155,42 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 155 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 156 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 157
153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
154 goto not_present; 159 present = false;
160 break;
161 }
155 162
156 trace_kvm_mmu_paging_element(pte, walker->level); 163 trace_kvm_mmu_paging_element(pte, walker->level);
157 164
158 if (!is_present_gpte(pte)) 165 if (!is_present_gpte(pte)) {
159 goto not_present; 166 present = false;
167 break;
168 }
160 169
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
162 if (rsvd_fault) 171 rsvd_fault = true;
163 goto access_error; 172 break;
173 }
164 174
165 if (write_fault && !is_writable_pte(pte)) 175 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 176 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 177 eperm = true;
168 178
169 if (user_fault && !(pte & PT_USER_MASK)) 179 if (user_fault && !(pte & PT_USER_MASK))
170 goto access_error; 180 eperm = true;
171 181
172#if PTTYPE == 64 182#if PTTYPE == 64
173 if (fetch_fault && (pte & PT64_NX_MASK)) 183 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 184 eperm = true;
175#endif 185#endif
176 186
177 if (!(pte & PT_ACCESSED_MASK)) { 187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
178 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 188 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
179 sizeof(pte)); 189 sizeof(pte));
180 mark_page_dirty(vcpu->kvm, table_gfn);
181 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
182 index, pte, pte|PT_ACCESSED_MASK)) 191 index, pte, pte|PT_ACCESSED_MASK))
183 goto walk; 192 goto walk;
193 mark_page_dirty(vcpu->kvm, table_gfn);
184 pte |= PT_ACCESSED_MASK; 194 pte |= PT_ACCESSED_MASK;
185 } 195 }
186 196
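
Note the reordering inside the accessed-bit update: mark_page_dirty() now runs only after FNAME(cmpxchg_gpte) has actually installed the new gpte, so a lost race, which restarts the walk, no longer logs a write that never happened. A hedged standalone sketch of the retry loop; gpte_changed_on_update() is an illustrative stand-in for FNAME(cmpxchg_gpte), keeping the patch's convention that a nonzero result means "restart":

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_ACCESSED (1ULL << 5)	/* x86 PTE accessed bit */

static uint64_t guest_pte = 0x1;	/* stand-in gpte, present bit set */
static bool dirty_logged;

/* Returns true when the gpte changed under us (caller re-walks). */
static bool gpte_changed_on_update(uint64_t old, uint64_t new)
{
	if (guest_pte != old)
		return true;
	guest_pte = new;
	return false;
}

static void mark_page_dirty(void) { dirty_logged = true; }

int main(void)
{
	for (;;) {
		uint64_t pte = guest_pte;

		if (gpte_changed_on_update(pte, pte | PTE_ACCESSED))
			continue;	/* raced with the guest: re-walk */
		mark_page_dirty();	/* log only after the update stuck */
		break;
	}
	printf("pte=%#llx dirty_logged=%d\n",
	       (unsigned long long)guest_pte, dirty_logged);
	return 0;
}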
@@ -213,15 +223,18 @@ walk:
213 --walker->level; 223 --walker->level;
214 } 224 }
215 225
226 if (!present || eperm || rsvd_fault)
227 goto error;
228
216 if (write_fault && !is_dirty_gpte(pte)) { 229 if (write_fault && !is_dirty_gpte(pte)) {
217 bool ret; 230 bool ret;
218 231
219 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
220 mark_page_dirty(vcpu->kvm, table_gfn);
221 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
222 pte|PT_DIRTY_MASK); 234 pte|PT_DIRTY_MASK);
223 if (ret) 235 if (ret)
224 goto walk; 236 goto walk;
237 mark_page_dirty(vcpu->kvm, table_gfn);
225 pte |= PT_DIRTY_MASK; 238 pte |= PT_DIRTY_MASK;
226 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
227 } 240 }
@@ -229,22 +242,18 @@ walk:
229 walker->pt_access = pt_access; 242 walker->pt_access = pt_access;
230 walker->pte_access = pte_access; 243 walker->pte_access = pte_access;
231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 244 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
232 __func__, (u64)pte, pt_access, pte_access); 245 __func__, (u64)pte, pte_access, pt_access);
233 return 1; 246 return 1;
234 247
235not_present: 248error:
236 walker->error_code = 0; 249 walker->error_code = 0;
237 goto err; 250 if (present)
238 251 walker->error_code |= PFERR_PRESENT_MASK;
239access_error:
240 walker->error_code = PFERR_PRESENT_MASK;
241
242err:
243 if (write_fault) 252 if (write_fault)
244 walker->error_code |= PFERR_WRITE_MASK; 253 walker->error_code |= PFERR_WRITE_MASK;
245 if (user_fault) 254 if (user_fault)
246 walker->error_code |= PFERR_USER_MASK; 255 walker->error_code |= PFERR_USER_MASK;
247 if (fetch_fault) 256 if (fetch_fault && is_nx(vcpu))
248 walker->error_code |= PFERR_FETCH_MASK; 257 walker->error_code |= PFERR_FETCH_MASK;
249 if (rsvd_fault) 258 if (rsvd_fault)
250 walker->error_code |= PFERR_RSVD_MASK; 259 walker->error_code |= PFERR_RSVD_MASK;
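
The rewritten walker collects present/eperm/rsvd_fault flags as it descends instead of jumping out at the first failed check, so a single error label can compose the full page-fault error code. A standalone sketch of that composition; the PFERR bit positions match the x86 page-fault error-code layout, everything else is illustrative:

#include <stdbool.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)
#define PFERR_RSVD_MASK    (1U << 3)
#define PFERR_FETCH_MASK   (1U << 4)

/* Build the error code from flags gathered during the walk. */
static unsigned error_code(bool present, bool rsvd_fault,
			   bool write_fault, bool user_fault,
			   bool fetch_fault)
{
	unsigned ec = 0;

	if (present)		/* fault on present entries => permission */
		ec |= PFERR_PRESENT_MASK;
	if (write_fault)
		ec |= PFERR_WRITE_MASK;
	if (user_fault)
		ec |= PFERR_USER_MASK;
	if (fetch_fault)	/* the kernel also gates this on is_nx() */
		ec |= PFERR_FETCH_MASK;
	if (rsvd_fault)
		ec |= PFERR_RSVD_MASK;
	return ec;
}

int main(void)
{
	/* A user-mode write that hit a permission (eperm) fault. */
	printf("error_code=%#x\n",
	       error_code(true, false, true, true, false));
	return 0;
}

Deferring the exit also lets the accessed-bit update at the bottom of the loop be skipped when a fault is already pending, which is what the !eperm && !rsvd_fault guard above does.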
@@ -252,7 +261,7 @@ err:
252 return 0; 261 return 0;
253} 262}
254 263
255static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
256 u64 *spte, const void *pte) 265 u64 *spte, const void *pte)
257{ 266{
258 pt_element_t gpte; 267 pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 gpte = *(const pt_element_t *)pte; 272 gpte = *(const pt_element_t *)pte;
264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
265 if (!is_present_gpte(gpte)) { 274 if (!is_present_gpte(gpte)) {
266 if (page->unsync) 275 if (sp->unsync)
267 new_spte = shadow_trap_nonpresent_pte; 276 new_spte = shadow_trap_nonpresent_pte;
268 else 277 else
269 new_spte = shadow_notrap_nonpresent_pte; 278 new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272 return; 281 return;
273 } 282 }
274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
275 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
277 return; 286 return;
278 pfn = vcpu->arch.update_pte.pfn; 287 pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
285 * we call mmu_set_spte() with reset_host_protection = true because that 294 * we call mmu_set_spte() with reset_host_protection = true because that
286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
287 */ 296 */
288 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
289 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 298 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
290 gpte_to_gfn(gpte), pfn, true, true); 299 gpte_to_gfn(gpte), pfn, true, true);
291} 300}
292 301
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level)
304{
305 int r;
306 pt_element_t curr_pte;
307
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
309 &curr_pte, sizeof(curr_pte));
310 return r || curr_pte != gw->ptes[level - 1];
311}
312
293/* 313/*
294 * Fetch a shadow pte for a specific level in the paging hierarchy. 314 * Fetch a shadow pte for a specific level in the paging hierarchy.
295 */ 315 */
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
299 int *ptwrite, pfn_t pfn) 319 int *ptwrite, pfn_t pfn)
300{ 320{
301 unsigned access = gw->pt_access; 321 unsigned access = gw->pt_access;
302 struct kvm_mmu_page *shadow_page; 322 struct kvm_mmu_page *sp = NULL;
303 u64 spte, *sptep = NULL; 323 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
304 int direct; 324 int top_level;
305 gfn_t table_gfn; 325 unsigned direct_access;
306 int r; 326 struct kvm_shadow_walk_iterator it;
307 int level;
308 pt_element_t curr_pte;
309 struct kvm_shadow_walk_iterator iterator;
310 327
311 if (!is_present_gpte(gw->ptes[gw->level - 1])) 328 if (!is_present_gpte(gw->ptes[gw->level - 1]))
312 return NULL; 329 return NULL;
313 330
314 for_each_shadow_entry(vcpu, addr, iterator) { 331 direct_access = gw->pt_access & gw->pte_access;
315 level = iterator.level; 332 if (!dirty)
316 sptep = iterator.sptep; 333 direct_access &= ~ACC_WRITE_MASK;
317 if (iterator.level == hlevel) {
318 mmu_set_spte(vcpu, sptep, access,
319 gw->pte_access & access,
320 user_fault, write_fault,
321 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
322 ptwrite, level,
323 gw->gfn, pfn, false, true);
324 break;
325 }
326 334
327 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 335 top_level = vcpu->arch.mmu.root_level;
328 continue; 336 if (top_level == PT32E_ROOT_LEVEL)
337 top_level = PT32_ROOT_LEVEL;
338 /*
339 * Verify that the top-level gpte is still there. Since the page
340 * is a root page, it is either write protected (and cannot be
341 * changed from now on) or it is invalid (in which case, we don't
342 * really care if it changes underneath us after this point).
343 */
344 if (FNAME(gpte_changed)(vcpu, gw, top_level))
345 goto out_gpte_changed;
329 346
330 if (is_large_pte(*sptep)) { 347 for (shadow_walk_init(&it, vcpu, addr);
331 rmap_remove(vcpu->kvm, sptep); 348 shadow_walk_okay(&it) && it.level > gw->level;
332 __set_spte(sptep, shadow_trap_nonpresent_pte); 349 shadow_walk_next(&it)) {
333 kvm_flush_remote_tlbs(vcpu->kvm); 350 gfn_t table_gfn;
334 }
335 351
336 if (level <= gw->level) { 352 drop_large_spte(vcpu, it.sptep);
337 int delta = level - gw->level + 1; 353
338 direct = 1; 354 sp = NULL;
339 if (!is_dirty_gpte(gw->ptes[level - delta])) 355 if (!is_shadow_present_pte(*it.sptep)) {
340 access &= ~ACC_WRITE_MASK; 356 table_gfn = gw->table_gfn[it.level - 2];
341 table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 357 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
342 /* advance table_gfn when emulating 1gb pages with 4k */ 358 false, access, it.sptep);
343 if (delta == 0)
344 table_gfn += PT_INDEX(addr, level);
345 access &= gw->pte_access;
346 } else {
347 direct = 0;
348 table_gfn = gw->table_gfn[level - 2];
349 }
350 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
351 direct, access, sptep);
352 if (!direct) {
353 r = kvm_read_guest_atomic(vcpu->kvm,
354 gw->pte_gpa[level - 2],
355 &curr_pte, sizeof(curr_pte));
356 if (r || curr_pte != gw->ptes[level - 2]) {
357 kvm_mmu_put_page(shadow_page, sptep);
358 kvm_release_pfn_clean(pfn);
359 sptep = NULL;
360 break;
361 }
362 } 359 }
363 360
364 spte = __pa(shadow_page->spt) 361 /*
365 | PT_PRESENT_MASK | PT_ACCESSED_MASK 362 * Verify that the gpte in the page we've just write
366 | PT_WRITABLE_MASK | PT_USER_MASK; 363 * protected is still there.
367 *sptep = spte; 364 */
365 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
366 goto out_gpte_changed;
367
368 if (sp)
369 link_shadow_page(it.sptep, sp);
368 } 370 }
369 371
370 return sptep; 372 for (;
373 shadow_walk_okay(&it) && it.level > hlevel;
374 shadow_walk_next(&it)) {
375 gfn_t direct_gfn;
376
377 validate_direct_spte(vcpu, it.sptep, direct_access);
378
379 drop_large_spte(vcpu, it.sptep);
380
381 if (is_shadow_present_pte(*it.sptep))
382 continue;
383
384 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
385
386 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
387 true, direct_access, it.sptep);
388 link_shadow_page(it.sptep, sp);
389 }
390
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true);
394
395 return it.sptep;
396
397out_gpte_changed:
398 if (sp)
399 kvm_mmu_put_page(sp, it.sptep);
400 kvm_release_pfn_clean(pfn);
401 return NULL;
371} 402}
372 403
373/* 404/*
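
The fetch() rewrite splits the shadow walk in two: the first loop handles indirect levels above gw->level, re-reading each guest table that was just write protected via FNAME(gpte_changed) and bailing out if it changed; the second loop installs direct shadow pages for the levels below, keyed by the base gfn of the region a large guest mapping covers. That base-gfn masking is easy to show standalone; the PAGES_PER_LEVEL macro below is a simplification of KVM_PAGES_PER_HPAGE, assuming 9 index bits per level as on x86-64:

#include <stdio.h>

typedef unsigned long long gfn_t;

/* 4 KiB pages, 9 bits of index per paging level. */
#define PT64_LEVEL_BITS 9
#define PAGES_PER_LEVEL(level) \
	(1ULL << (((level) - 1) * PT64_LEVEL_BITS))

/* Base gfn of the large-page region that covers @gfn at @level. */
static gfn_t direct_gfn(gfn_t gfn, int level)
{
	return gfn & ~(PAGES_PER_LEVEL(level) - 1);
}

int main(void)
{
	gfn_t gfn = 0x12345;

	/* Level 2 entries span 512 pages, level 3 entries 512*512. */
	printf("level 2: %#llx\n", direct_gfn(gfn, 2)); /* 0x12200 */
	printf("level 3: %#llx\n", direct_gfn(gfn, 3)); /* 0x0 */
	return 0;
}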
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
432 463
433 /* mmio */ 464 /* mmio */
434 if (is_error_pfn(pfn)) { 465 if (is_error_pfn(pfn))
435 pgprintk("gfn %lx is mmio\n", walker.gfn); 466 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
436 kvm_release_pfn_clean(pfn);
437 return 1;
438 }
439 467
440 spin_lock(&vcpu->kvm->mmu_lock); 468 spin_lock(&vcpu->kvm->mmu_lock);
441 if (mmu_notifier_retry(vcpu, mmu_seq)) 469 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
443 kvm_mmu_free_some_pages(vcpu); 471 kvm_mmu_free_some_pages(vcpu);
444 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
445 level, &write_pt, pfn); 473 level, &write_pt, pfn);
474 (void)sptep;
446 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
447 sptep, *sptep, write_pt); 476 sptep, *sptep, write_pt);
448 477
@@ -464,6 +493,7 @@ out_unlock:
464static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 493static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
465{ 494{
466 struct kvm_shadow_walk_iterator iterator; 495 struct kvm_shadow_walk_iterator iterator;
496 struct kvm_mmu_page *sp;
467 gpa_t pte_gpa = -1; 497 gpa_t pte_gpa = -1;
468 int level; 498 int level;
469 u64 *sptep; 499 u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
475 level = iterator.level; 505 level = iterator.level;
476 sptep = iterator.sptep; 506 sptep = iterator.sptep;
477 507
508 sp = page_header(__pa(sptep));
478 if (is_last_spte(*sptep, level)) { 509 if (is_last_spte(*sptep, level)) {
479 struct kvm_mmu_page *sp = page_header(__pa(sptep));
480 int offset, shift; 510 int offset, shift;
481 511
512 if (!sp->unsync)
513 break;
514
482 shift = PAGE_SHIFT - 515 shift = PAGE_SHIFT -
483 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; 516 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
484 offset = sp->role.quadrant << shift; 517 offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 520 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
488 521
489 if (is_shadow_present_pte(*sptep)) { 522 if (is_shadow_present_pte(*sptep)) {
490 rmap_remove(vcpu->kvm, sptep);
491 if (is_large_pte(*sptep)) 523 if (is_large_pte(*sptep))
492 --vcpu->kvm->stat.lpages; 524 --vcpu->kvm->stat.lpages;
525 drop_spte(vcpu->kvm, sptep,
526 shadow_trap_nonpresent_pte);
493 need_flush = 1; 527 need_flush = 1;
494 } 528 } else
495 __set_spte(sptep, shadow_trap_nonpresent_pte); 529 __set_spte(sptep, shadow_trap_nonpresent_pte);
496 break; 530 break;
497 } 531 }
498 532
499 if (!is_shadow_present_pte(*sptep)) 533 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
500 break; 534 break;
501 } 535 }
502 536
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
570 * Using the cached information from sp->gfns is safe because: 604 * Using the cached information from sp->gfns is safe because:
571 * - The spte has a reference to the struct page, so the pfn for a given gfn 605 * - The spte has a reference to the struct page, so the pfn for a given gfn
572 * can't change unless all sptes pointing to it are nuked first. 606 * can't change unless all sptes pointing to it are nuked first.
573 * - Alias changes zap the entire shadow cache.
574 */ 607 */
575static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
609 bool clear_unsync)
576{ 610{
577 int i, offset, nr_present; 611 int i, offset, nr_present;
578 bool reset_host_protection; 612 bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 614
581 offset = nr_present = 0; 615 offset = nr_present = 0;
582 616
617 /* direct kvm_mmu_page cannot be unsync. */
618 BUG_ON(sp->role.direct);
619
583 if (PTTYPE == 32) 620 if (PTTYPE == 32)
584 offset = sp->role.quadrant << PT64_LEVEL_BITS; 621 offset = sp->role.quadrant << PT64_LEVEL_BITS;
585 622
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
589 unsigned pte_access; 626 unsigned pte_access;
590 pt_element_t gpte; 627 pt_element_t gpte;
591 gpa_t pte_gpa; 628 gpa_t pte_gpa;
592 gfn_t gfn = sp->gfns[i]; 629 gfn_t gfn;
593 630
594 if (!is_shadow_present_pte(sp->spt[i])) 631 if (!is_shadow_present_pte(sp->spt[i]))
595 continue; 632 continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
600 sizeof(pt_element_t))) 637 sizeof(pt_element_t)))
601 return -EINVAL; 638 return -EINVAL;
602 639
603 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 640 gfn = gpte_to_gfn(gpte);
604 !(gpte & PT_ACCESSED_MASK)) { 641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
605 u64 nonpresent; 644 u64 nonpresent;
606 645
607 rmap_remove(vcpu->kvm, &sp->spt[i]); 646 if (is_present_gpte(gpte) || !clear_unsync)
608 if (is_present_gpte(gpte))
609 nonpresent = shadow_trap_nonpresent_pte; 647 nonpresent = shadow_trap_nonpresent_pte;
610 else 648 else
611 nonpresent = shadow_notrap_nonpresent_pte; 649 nonpresent = shadow_notrap_nonpresent_pte;
612 __set_spte(&sp->spt[i], nonpresent); 650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
613 continue; 651 continue;
614 } 652 }
615 653
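
sync_page now re-derives the gfn from the freshly read gpte and adds a reserved-bits check, dropping the spte whenever the cached translation can no longer be trusted; whether the slot gets the trapping or the non-trapping nonpresent spte then depends on is_present_gpte() and clear_unsync. A sketch of the staleness predicate with an illustrative bit layout; the fake PTE_RSVD_MASK stands in for is_rsvd_bits_set(), which consults the CPU's real reserved-bit masks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative 64-bit gpte layout: bit 0 present, bit 5 accessed,
 * bits 12+ the frame number; "reserved" bits faked as 62-61. */
#define PTE_PRESENT   (1ULL << 0)
#define PTE_ACCESSED  (1ULL << 5)
#define PTE_RSVD_MASK (3ULL << 61)

static uint64_t gpte_to_gfn(uint64_t gpte) { return gpte >> 12; }

/* True when the cached spte for @cached_gfn can no longer stand. */
static bool spte_is_stale(uint64_t gpte, uint64_t cached_gfn)
{
	return (gpte & PTE_RSVD_MASK) ||
	       gpte_to_gfn(gpte) != cached_gfn ||
	       !(gpte & PTE_PRESENT) ||
	       !(gpte & PTE_ACCESSED);
}

int main(void)
{
	uint64_t gpte = (0x1234ULL << 12) | PTE_PRESENT | PTE_ACCESSED;

	printf("same gfn:  stale=%d\n", spte_is_stale(gpte, 0x1234)); /* 0 */
	printf("moved gfn: stale=%d\n", spte_is_stale(gpte, 0x9999)); /* 1 */
	return 0;
}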
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..56c9b6bd7655 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7 * 8 *
8 * Authors: 9 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
285 286
286static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
287{ 288{
289 vcpu->arch.efer = efer;
288 if (!npt_enabled && !(efer & EFER_LMA)) 290 if (!npt_enabled && !(efer & EFER_LMA))
289 efer &= ~EFER_LME; 291 efer &= ~EFER_LME;
290 292
291 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
292 vcpu->arch.efer = efer;
293} 294}
294 295
295static int is_external_interrupt(u32 info) 296static int is_external_interrupt(u32 info)
@@ -640,7 +641,7 @@ static __init int svm_hardware_setup(void)
640 641
641 if (nested) { 642 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 643 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME); 644 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
644 } 645 }
645 646
646 for_each_possible_cpu(cpu) { 647 for_each_possible_cpu(cpu) {
@@ -806,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm)
806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 807 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
807 */ 808 */
808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 809 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
809 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 810 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
810 811
811 save->cr4 = X86_CR4_PAE; 812 save->cr4 = X86_CR4_PAE;
812 /* rdx = ?? */ 813 /* rdx = ?? */
@@ -903,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
903 svm->asid_generation = 0; 904 svm->asid_generation = 0;
904 init_vmcb(svm); 905 init_vmcb(svm);
905 906
906 fx_init(&svm->vcpu); 907 err = fx_init(&svm->vcpu);
908 if (err)
909 goto free_page4;
910
907 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 911 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
908 if (kvm_vcpu_is_bsp(&svm->vcpu)) 912 if (kvm_vcpu_is_bsp(&svm->vcpu))
909 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 913 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
910 914
911 return &svm->vcpu; 915 return &svm->vcpu;
912 916
917free_page4:
918 __free_page(hsave_page);
913free_page3: 919free_page3:
914 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 920 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
915free_page2: 921free_page2:
@@ -1488,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
1488 */ 1494 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1495 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490 1496
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1497 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1492 1498
1493 return; 1499 return;
1494 } 1500 }
@@ -1535,7 +1541,7 @@ static int io_interception(struct vcpu_svm *svm)
1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1541 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1542 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in) 1543 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1544 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
1539 1545
1540 port = io_info >> 16; 1546 port = io_info >> 16;
1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1547 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1963 svm->vmcb->save.cr3 = hsave->save.cr3;
1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1964 svm->vcpu.arch.cr3 = hsave->save.cr3;
1959 } else { 1965 } else {
1960 kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1966 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1961 } 1967 }
1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1968 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 1969 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2086 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2087 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2082 } else 2088 } else
2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2089 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2084 2090
2085 /* Guest paging mode is active - reset mmu */ 2091 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu); 2092 kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2392,12 @@ static int iret_interception(struct vcpu_svm *svm)
2386 2392
2387static int invlpg_interception(struct vcpu_svm *svm) 2393static int invlpg_interception(struct vcpu_svm *svm)
2388{ 2394{
2389 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2395 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2390 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2391 return 1;
2392} 2396}
2393 2397
2394static int emulate_on_interception(struct vcpu_svm *svm) 2398static int emulate_on_interception(struct vcpu_svm *svm)
2395{ 2399{
2396 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2400 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2397 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2398 return 1;
2399} 2401}
2400 2402
2401static int cr8_write_interception(struct vcpu_svm *svm) 2403static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2726,6 +2728,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2726 [SVM_EXIT_NPF] = pf_interception, 2728 [SVM_EXIT_NPF] = pf_interception,
2727}; 2729};
2728 2730
2731void dump_vmcb(struct kvm_vcpu *vcpu)
2732{
2733 struct vcpu_svm *svm = to_svm(vcpu);
2734 struct vmcb_control_area *control = &svm->vmcb->control;
2735 struct vmcb_save_area *save = &svm->vmcb->save;
2736
2737 pr_err("VMCB Control Area:\n");
2738 pr_err("cr_read: %04x\n", control->intercept_cr_read);
2739 pr_err("cr_write: %04x\n", control->intercept_cr_write);
2740 pr_err("dr_read: %04x\n", control->intercept_dr_read);
2741 pr_err("dr_write: %04x\n", control->intercept_dr_write);
2742 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2743 pr_err("intercepts: %016llx\n", control->intercept);
2744 pr_err("pause filter count: %d\n", control->pause_filter_count);
2745 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
2746 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
2747 pr_err("tsc_offset: %016llx\n", control->tsc_offset);
2748 pr_err("asid: %d\n", control->asid);
2749 pr_err("tlb_ctl: %d\n", control->tlb_ctl);
2750 pr_err("int_ctl: %08x\n", control->int_ctl);
2751 pr_err("int_vector: %08x\n", control->int_vector);
2752 pr_err("int_state: %08x\n", control->int_state);
2753 pr_err("exit_code: %08x\n", control->exit_code);
2754 pr_err("exit_info1: %016llx\n", control->exit_info_1);
2755 pr_err("exit_info2: %016llx\n", control->exit_info_2);
2756 pr_err("exit_int_info: %08x\n", control->exit_int_info);
2757 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
2758 pr_err("nested_ctl: %lld\n", control->nested_ctl);
2759 pr_err("nested_cr3: %016llx\n", control->nested_cr3);
2760 pr_err("event_inj: %08x\n", control->event_inj);
2761 pr_err("event_inj_err: %08x\n", control->event_inj_err);
2762 pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
2763 pr_err("next_rip: %016llx\n", control->next_rip);
2764 pr_err("VMCB State Save Area:\n");
2765 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
2766 save->es.selector, save->es.attrib,
2767 save->es.limit, save->es.base);
2768 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
2769 save->cs.selector, save->cs.attrib,
2770 save->cs.limit, save->cs.base);
2771 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
2772 save->ss.selector, save->ss.attrib,
2773 save->ss.limit, save->ss.base);
2774 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
2775 save->ds.selector, save->ds.attrib,
2776 save->ds.limit, save->ds.base);
2777 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
2778 save->fs.selector, save->fs.attrib,
2779 save->fs.limit, save->fs.base);
2780 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
2781 save->gs.selector, save->gs.attrib,
2782 save->gs.limit, save->gs.base);
2783 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
2784 save->gdtr.selector, save->gdtr.attrib,
2785 save->gdtr.limit, save->gdtr.base);
2786 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
2787 save->ldtr.selector, save->ldtr.attrib,
2788 save->ldtr.limit, save->ldtr.base);
2789 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
2790 save->idtr.selector, save->idtr.attrib,
2791 save->idtr.limit, save->idtr.base);
2792 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
2793 save->tr.selector, save->tr.attrib,
2794 save->tr.limit, save->tr.base);
2795 pr_err("cpl: %d efer: %016llx\n",
2796 save->cpl, save->efer);
2797 pr_err("cr0: %016llx cr2: %016llx\n",
2798 save->cr0, save->cr2);
2799 pr_err("cr3: %016llx cr4: %016llx\n",
2800 save->cr3, save->cr4);
2801 pr_err("dr6: %016llx dr7: %016llx\n",
2802 save->dr6, save->dr7);
2803 pr_err("rip: %016llx rflags: %016llx\n",
2804 save->rip, save->rflags);
2805 pr_err("rsp: %016llx rax: %016llx\n",
2806 save->rsp, save->rax);
2807 pr_err("star: %016llx lstar: %016llx\n",
2808 save->star, save->lstar);
2809 pr_err("cstar: %016llx sfmask: %016llx\n",
2810 save->cstar, save->sfmask);
2811 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
2812 save->kernel_gs_base, save->sysenter_cs);
2813 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
2814 save->sysenter_esp, save->sysenter_eip);
2815 pr_err("gpat: %016llx dbgctl: %016llx\n",
2816 save->g_pat, save->dbgctl);
2817 pr_err("br_from: %016llx br_to: %016llx\n",
2818 save->br_from, save->br_to);
2819 pr_err("excp_from: %016llx excp_to: %016llx\n",
2820 save->last_excp_from, save->last_excp_to);
2821
2822}
2823
2729static int handle_exit(struct kvm_vcpu *vcpu) 2824static int handle_exit(struct kvm_vcpu *vcpu)
2730{ 2825{
2731 struct vcpu_svm *svm = to_svm(vcpu); 2826 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2865,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2865 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2771 kvm_run->fail_entry.hardware_entry_failure_reason 2866 kvm_run->fail_entry.hardware_entry_failure_reason
2772 = svm->vmcb->control.exit_code; 2867 = svm->vmcb->control.exit_code;
2868 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
2869 dump_vmcb(vcpu);
2773 return 0; 2870 return 0;
2774 } 2871 }
2775 2872
@@ -2826,9 +2923,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2826{ 2923{
2827 struct vmcb_control_area *control; 2924 struct vmcb_control_area *control;
2828 2925
2829 trace_kvm_inj_virq(irq);
2830
2831 ++svm->vcpu.stat.irq_injections;
2832 control = &svm->vmcb->control; 2926 control = &svm->vmcb->control;
2833 control->int_vector = irq; 2927 control->int_vector = irq;
2834 control->int_ctl &= ~V_INTR_PRIO_MASK; 2928 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2936,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
2842 2936
2843 BUG_ON(!(gif_set(svm))); 2937 BUG_ON(!(gif_set(svm)));
2844 2938
2939 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
2940 ++vcpu->stat.irq_injections;
2941
2845 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2942 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2846 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 2943 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2847} 2944}
@@ -3327,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
3327 return false; 3424 return false;
3328} 3425}
3329 3426
3427static bool svm_has_wbinvd_exit(void)
3428{
3429 return true;
3430}
3431
3330static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3432static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3331{ 3433{
3332 struct vcpu_svm *svm = to_svm(vcpu); 3434 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
3411 .rdtscp_supported = svm_rdtscp_supported, 3513 .rdtscp_supported = svm_rdtscp_supported,
3412 3514
3413 .set_supported_cpuid = svm_set_supported_cpuid, 3515 .set_supported_cpuid = svm_set_supported_cpuid,
3516
3517 .has_wbinvd_exit = svm_has_wbinvd_exit,
3414}; 3518};
3415 3519
3416static int __init svm_init(void) 3520static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
1#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
2#include <linux/kvm.h> 16#include <linux/kvm.h>
3#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 32 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
19 atomic_inc(&ktimer->pending); 33 atomic_inc(&ktimer->pending);
20 /* FIXME: this code should not know anything about vcpus */ 34 /* FIXME: this code should not know anything about vcpus */
21 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 35 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
22 } 36 }
23 37
24 if (waitqueue_active(q)) 38 if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..27a0222c2946 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
36#include <asm/vmx.h> 37#include <asm/vmx.h>
37#include <asm/virtext.h> 38#include <asm/virtext.h>
38#include <asm/mce.h> 39#include <asm/mce.h>
40#include <asm/i387.h>
41#include <asm/xcr.h>
39 42
40#include "trace.h" 43#include "trace.h"
41 44
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
63static int __read_mostly emulate_invalid_guest_state = 0; 66static int __read_mostly emulate_invalid_guest_state = 0;
64module_param(emulate_invalid_guest_state, bool, S_IRUGO); 67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
65 68
69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO);
71
66#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
67 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
68#define KVM_GUEST_CR0_MASK \ 74#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
173 179
174static int init_rmode(struct kvm *kvm); 180static int init_rmode(struct kvm *kvm);
175static u64 construct_eptp(unsigned long root_hpa); 181static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void);
176 184
177static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
178static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
179static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
180 189
181static unsigned long *vmx_io_bitmap_a; 190static unsigned long *vmx_io_bitmap_a;
182static unsigned long *vmx_io_bitmap_b; 191static unsigned long *vmx_io_bitmap_b;
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 343 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
335} 344}
336 345
346static inline bool cpu_has_vmx_ept_4levels(void)
347{
348 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
349}
350
337static inline bool cpu_has_vmx_invept_individual_addr(void) 351static inline bool cpu_has_vmx_invept_individual_addr(void)
338{ 352{
339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 353 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 363 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
350} 364}
351 365
366static inline bool cpu_has_vmx_invvpid_single(void)
367{
368 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
369}
370
371static inline bool cpu_has_vmx_invvpid_global(void)
372{
373 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
374}
375
352static inline bool cpu_has_vmx_ept(void) 376static inline bool cpu_has_vmx_ept(void)
353{ 377{
354 return vmcs_config.cpu_based_2nd_exec_ctrl & 378 return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 413 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
390} 414}
391 415
416static inline bool cpu_has_vmx_wbinvd_exit(void)
417{
418 return vmcs_config.cpu_based_2nd_exec_ctrl &
419 SECONDARY_EXEC_WBINVD_EXITING;
420}
421
392static inline bool report_flexpriority(void) 422static inline bool report_flexpriority(void)
393{ 423{
394 return flexpriority_enabled; 424 return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
453 vmcs, phys_addr); 483 vmcs, phys_addr);
454} 484}
455 485
486static void vmcs_load(struct vmcs *vmcs)
487{
488 u64 phys_addr = __pa(vmcs);
489 u8 error;
490
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory");
494 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
496 vmcs, phys_addr);
497}
498
456static void __vcpu_clear(void *arg) 499static void __vcpu_clear(void *arg)
457{ 500{
458 struct vcpu_vmx *vmx = arg; 501 struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 518 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
476} 519}
477 520
478static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 521static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
479{ 522{
480 if (vmx->vpid == 0) 523 if (vmx->vpid == 0)
481 return; 524 return;
482 525
483 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 526 if (cpu_has_vmx_invvpid_single())
527 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
528}
529
530static inline void vpid_sync_vcpu_global(void)
531{
532 if (cpu_has_vmx_invvpid_global())
533 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
534}
535
536static inline void vpid_sync_context(struct vcpu_vmx *vmx)
537{
538 if (cpu_has_vmx_invvpid_single())
539 vpid_sync_vcpu_single(vmx);
540 else
541 vpid_sync_vcpu_global();
484} 542}
485 543
486static inline void ept_sync_global(void) 544static inline void ept_sync_global(void)
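
vpid_sync_context() above prefers the cheaper single-context INVVPID and falls back to flushing all contexts on hardware that only advertises the global extent. A standalone sketch of that capability-gated dispatch; the capability flags and the invvpid stubs are stand-ins for the VMX capability MSR bits and the __invvpid() asm:

#include <stdbool.h>
#include <stdio.h>

/* Pretend capability bits; the real ones come from the VPID/EPT MSR. */
static bool has_invvpid_single;
static bool has_invvpid_global = true;

static void invvpid_single(int vpid)
{
	printf("invvpid single context, vpid=%d\n", vpid);
}

static void invvpid_global(void)
{
	printf("invvpid all contexts\n");
}

/* Flush translations for one vcpu's vpid, as cheaply as the CPU allows. */
static void vpid_sync_context(int vpid)
{
	if (vpid == 0)		/* vpid 0 is the host; nothing tagged */
		return;
	if (has_invvpid_single)
		invvpid_single(vpid);
	else
		invvpid_global();
}

int main(void)
{
	vpid_sync_context(3);	/* falls back to the global flush here */
	has_invvpid_single = true;
	vpid_sync_context(3);	/* now uses the single-context flush */
	return 0;
}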
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 870 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
813 } 871 }
814#endif 872#endif
873 if (current_thread_info()->status & TS_USEDFPU)
874 clts();
875 load_gdt(&__get_cpu_var(host_gdt));
815} 876}
816 877
817static void vmx_load_host_state(struct vcpu_vmx *vmx) 878static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
828static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
829{ 890{
830 struct vcpu_vmx *vmx = to_vmx(vcpu); 891 struct vcpu_vmx *vmx = to_vmx(vcpu);
831 u64 phys_addr = __pa(vmx->vmcs);
832 u64 tsc_this, delta, new_offset; 892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
833 894
834 if (vcpu->cpu != cpu) { 895 if (!vmm_exclusive)
896 kvm_cpu_vmxon(phys_addr);
897 else if (vcpu->cpu != cpu)
835 vcpu_clear(vmx); 898 vcpu_clear(vmx);
836 kvm_migrate_timers(vcpu);
837 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
838 local_irq_disable();
839 list_add(&vmx->local_vcpus_link,
840 &per_cpu(vcpus_on_cpu, cpu));
841 local_irq_enable();
842 }
843 899
844 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 900 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
845 u8 error;
846
847 per_cpu(current_vmcs, cpu) = vmx->vmcs; 901 per_cpu(current_vmcs, cpu) = vmx->vmcs;
848 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 902 vmcs_load(vmx->vmcs);
849 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
850 : "cc");
851 if (error)
852 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
853 vmx->vmcs, phys_addr);
854 } 903 }
855 904
856 if (vcpu->cpu != cpu) { 905 if (vcpu->cpu != cpu) {
857 struct desc_ptr dt; 906 struct desc_ptr dt;
858 unsigned long sysenter_esp; 907 unsigned long sysenter_esp;
859 908
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable();
912 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable();
915
860 vcpu->cpu = cpu; 916 vcpu->cpu = cpu;
861 /* 917 /*
862 * Linux uses per-cpu TSS and GDT, so set these when switching 918 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 940static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
885{ 941{
886 __vmx_load_host_state(to_vmx(vcpu)); 942 __vmx_load_host_state(to_vmx(vcpu));
943 if (!vmm_exclusive) {
944 __vcpu_clear(to_vmx(vcpu));
945 kvm_cpu_vmxoff();
946 }
887} 947}
888 948
889static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 949static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
1286 /* locked but not enabled */ 1346 /* locked but not enabled */
1287} 1347}
1288 1348
1349static void kvm_cpu_vmxon(u64 addr)
1350{
1351 asm volatile (ASM_VMX_VMXON_RAX
1352 : : "a"(&addr), "m"(addr)
1353 : "memory", "cc");
1354}
1355
1289static int hardware_enable(void *garbage) 1356static int hardware_enable(void *garbage)
1290{ 1357{
1291 int cpu = raw_smp_processor_id(); 1358 int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1375 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1309 } 1376 }
1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1377 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1311 asm volatile (ASM_VMX_VMXON_RAX
1312 : : "a"(&phys_addr), "m"(phys_addr)
1313 : "memory", "cc");
1314 1378
1315 ept_sync_global(); 1379 if (vmm_exclusive) {
1380 kvm_cpu_vmxon(phys_addr);
1381 ept_sync_global();
1382 }
1383
1384 store_gdt(&__get_cpu_var(host_gdt));
1316 1385
1317 return 0; 1386 return 0;
1318} 1387}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
1334static void kvm_cpu_vmxoff(void) 1403static void kvm_cpu_vmxoff(void)
1335{ 1404{
1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1405 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1337 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1338} 1406}
1339 1407
1340static void hardware_disable(void *garbage) 1408static void hardware_disable(void *garbage)
1341{ 1409{
1342 vmclear_local_vcpus(); 1410 if (vmm_exclusive) {
1343 kvm_cpu_vmxoff(); 1411 vmclear_local_vcpus();
1412 kvm_cpu_vmxoff();
1413 }
1414 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1344} 1415}
1345 1416
1346static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1417static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
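
With the new vmm_exclusive=0 mode, VMX root operation is entered around each vcpu load and left again on put, instead of being held from hardware_enable() to hardware_disable(); only the CR4.VMXE clear stays unconditional on disable. A sketch of that ownership pattern, with printf stubs standing in for kvm_cpu_vmxon()/kvm_cpu_vmxoff():

#include <stdbool.h>
#include <stdio.h>

static bool vmm_exclusive = true;	/* mirrors the module parameter */

static void vmxon(void)  { printf("VMXON\n");  }
static void vmxoff(void) { printf("VMXOFF\n"); }

/* Non-exclusive mode holds VMX root only while a vcpu is loaded. */
static void vcpu_load(void)
{
	if (!vmm_exclusive)
		vmxon();
	printf("vcpu loaded\n");
}

static void vcpu_put(void)
{
	printf("vcpu put\n");
	if (!vmm_exclusive)
		vmxoff();
}

int main(void)
{
	vmm_exclusive = false;
	vcpu_load();	/* VMXON happens here, not at hardware enable */
	vcpu_put();	/* and VMXOFF here, freeing VMX for other VMMs */
	return 0;
}

The trade-off is extra VMXON/VMXOFF transitions per scheduling round in exchange for coexisting with other users of the VMX extensions.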
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
1539 if (!cpu_has_vmx_vpid()) 1610 if (!cpu_has_vmx_vpid())
1540 enable_vpid = 0; 1611 enable_vpid = 0;
1541 1612
1542 if (!cpu_has_vmx_ept()) { 1613 if (!cpu_has_vmx_ept() ||
1614 !cpu_has_vmx_ept_4levels()) {
1543 enable_ept = 0; 1615 enable_ept = 0;
1544 enable_unrestricted_guest = 0; 1616 enable_unrestricted_guest = 0;
1545 } 1617 }
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1628 gfn_t base_gfn; 1700 gfn_t base_gfn;
1629 1701
1630 slots = kvm_memslots(kvm); 1702 slots = kvm_memslots(kvm);
1631 base_gfn = kvm->memslots->memslots[0].base_gfn + 1703 base_gfn = slots->memslots[0].base_gfn +
1632 kvm->memslots->memslots[0].npages - 3; 1704 kvm->memslots->memslots[0].npages - 3;
1633 return base_gfn << PAGE_SHIFT; 1705 return base_gfn << PAGE_SHIFT;
1634 } 1706 }
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1759 1831
1760static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1832static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1761{ 1833{
1762 vpid_sync_vcpu_all(to_vmx(vcpu)); 1834 vpid_sync_context(to_vmx(vcpu));
1763 if (enable_ept) 1835 if (enable_ept) {
1836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1837 return;
1764 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1838 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1839 }
1765} 1840}
1766 1841
1767static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1842static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2582 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2508 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2583 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2509 2584
2510 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2585 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2511 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2586 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2512 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2587 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2513 2588
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2599 2674
2600static int init_rmode(struct kvm *kvm) 2675static int init_rmode(struct kvm *kvm)
2601{ 2676{
2677 int idx, ret = 0;
2678
2679 idx = srcu_read_lock(&kvm->srcu);
2602 if (!init_rmode_tss(kvm)) 2680 if (!init_rmode_tss(kvm))
2603 return 0; 2681 goto exit;
2604 if (!init_rmode_identity_map(kvm)) 2682 if (!init_rmode_identity_map(kvm))
2605 return 0; 2683 goto exit;
2606 return 1; 2684
2685 ret = 1;
2686exit:
2687 srcu_read_unlock(&kvm->srcu, idx);
2688 return ret;
2607} 2689}
2608 2690
2609static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2691static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2610{ 2692{
2611 struct vcpu_vmx *vmx = to_vmx(vcpu); 2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2612 u64 msr; 2694 u64 msr;
2613 int ret, idx; 2695 int ret;
2614 2696
2615 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2697 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2616 idx = srcu_read_lock(&vcpu->kvm->srcu);
2617 if (!init_rmode(vmx->vcpu.kvm)) { 2698 if (!init_rmode(vmx->vcpu.kvm)) {
2618 ret = -ENOMEM; 2699 ret = -ENOMEM;
2619 goto out; 2700 goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2630 msr |= MSR_IA32_APICBASE_BSP; 2711 msr |= MSR_IA32_APICBASE_BSP;
2631 kvm_set_apic_base(&vmx->vcpu, msr); 2712 kvm_set_apic_base(&vmx->vcpu, msr);
2632 2713
2633 fx_init(&vmx->vcpu); 2714 ret = fx_init(&vmx->vcpu);
2715 if (ret != 0)
2716 goto out;
2634 2717
2635 seg_setup(VCPU_SREG_CS); 2718 seg_setup(VCPU_SREG_CS);
2636 /* 2719 /*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2713 vmx_fpu_activate(&vmx->vcpu); 2796 vmx_fpu_activate(&vmx->vcpu);
2714 update_exception_bitmap(&vmx->vcpu); 2797 update_exception_bitmap(&vmx->vcpu);
2715 2798
2716 vpid_sync_vcpu_all(vmx); 2799 vpid_sync_context(vmx);
2717 2800
2718 ret = 0; 2801 ret = 0;
2719 2802
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2721 vmx->emulation_required = 0; 2804 vmx->emulation_required = 0;
2722 2805
2723out: 2806out:
2724 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2725 return ret; 2807 return ret;
2726} 2808}
2727 2809
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2826{ 2908{
2827 if (!cpu_has_virtual_nmis()) 2909 if (!cpu_has_virtual_nmis())
2828 return to_vmx(vcpu)->soft_vnmi_blocked; 2910 return to_vmx(vcpu)->soft_vnmi_blocked;
2829 else 2911 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2830 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2831 GUEST_INTR_STATE_NMI);
2832} 2912}
2833 2913
2834static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 2914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3070 ++vcpu->stat.io_exits; 3150 ++vcpu->stat.io_exits;
3071 3151
3072 if (string || in) 3152 if (string || in)
3073 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 3153 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3074 3154
3075 port = exit_qualification >> 16; 3155 port = exit_qualification >> 16;
3076 size = (exit_qualification & 7) + 1; 3156 size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3090 hypercall[2] = 0xc1; 3170 hypercall[2] = 0xc1;
3091} 3171}
3092 3172
3173static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3174{
3175 if (err)
3176 kvm_inject_gp(vcpu, 0);
3177 else
3178 skip_emulated_instruction(vcpu);
3179}
3180
3093static int handle_cr(struct kvm_vcpu *vcpu) 3181static int handle_cr(struct kvm_vcpu *vcpu)
3094{ 3182{
3095 unsigned long exit_qualification, val; 3183 unsigned long exit_qualification, val;
3096 int cr; 3184 int cr;
3097 int reg; 3185 int reg;
3186 int err;
3098 3187
3099 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3188 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3100 cr = exit_qualification & 15; 3189 cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3105 trace_kvm_cr_write(cr, val); 3194 trace_kvm_cr_write(cr, val);
3106 switch (cr) { 3195 switch (cr) {
3107 case 0: 3196 case 0:
3108 kvm_set_cr0(vcpu, val); 3197 err = kvm_set_cr0(vcpu, val);
3109 skip_emulated_instruction(vcpu); 3198 complete_insn_gp(vcpu, err);
3110 return 1; 3199 return 1;
3111 case 3: 3200 case 3:
3112 kvm_set_cr3(vcpu, val); 3201 err = kvm_set_cr3(vcpu, val);
3113 skip_emulated_instruction(vcpu); 3202 complete_insn_gp(vcpu, err);
3114 return 1; 3203 return 1;
3115 case 4: 3204 case 4:
3116 kvm_set_cr4(vcpu, val); 3205 err = kvm_set_cr4(vcpu, val);
3117 skip_emulated_instruction(vcpu); 3206 complete_insn_gp(vcpu, err);
3118 return 1; 3207 return 1;
3119 case 8: { 3208 case 8: {
3120 u8 cr8_prev = kvm_get_cr8(vcpu); 3209 u8 cr8_prev = kvm_get_cr8(vcpu);
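
handle_cr() now propagates the return value of kvm_set_cr0/3/4 through complete_insn_gp(), which either injects #GP(0) or advances past the instruction, matching the architectural rule that a faulting MOV-to-CR must not retire. A sketch of the helper's contract; set_cr3 below is a toy validity check, not KVM's real one:

#include <stdio.h>

static void inject_gp(void) { printf("inject #GP(0)\n"); }
static void skip_insn(void) { printf("advance guest RIP\n"); }

/* Shared tail for emulated instructions that can fault. */
static void complete_insn_gp(int err)
{
	if (err)
		inject_gp();
	else
		skip_insn();
}

/* Illustrative stand-in for kvm_set_cr3(): reject a value our toy
 * check considers malformed. */
static int set_cr3(unsigned long val)
{
	return (val & 0xfffUL) ? 1 : 0;
}

int main(void)
{
	complete_insn_gp(set_cr3(0x1000));	/* ok: skip the insn */
	complete_insn_gp(set_cr3(0x1001));	/* bad: #GP, RIP unchanged */
	return 0;
}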
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
3321static int handle_wbinvd(struct kvm_vcpu *vcpu) 3410static int handle_wbinvd(struct kvm_vcpu *vcpu)
3322{ 3411{
3323 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3324 /* TODO: Add support for VT-d/pass-through device */ 3413 kvm_emulate_wbinvd(vcpu);
3325 return 1; 3414 return 1;
3326} 3415}
3327 3416
3328static int handle_apic_access(struct kvm_vcpu *vcpu) 3417static int handle_xsetbv(struct kvm_vcpu *vcpu)
3329{ 3418{
3330 unsigned long exit_qualification; 3419 u64 new_bv = kvm_read_edx_eax(vcpu);
3331 enum emulation_result er; 3420 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3332 unsigned long offset;
3333 3421
3334 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3422 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3335 offset = exit_qualification & 0xffful; 3423 skip_emulated_instruction(vcpu);
3336
3337 er = emulate_instruction(vcpu, 0, 0, 0);
3338
3339 if (er != EMULATE_DONE) {
3340 printk(KERN_ERR
3341 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3342 offset);
3343 return -ENOEXEC;
3344 }
3345 return 1; 3424 return 1;
3346} 3425}
3347 3426
3427static int handle_apic_access(struct kvm_vcpu *vcpu)
3428{
3429 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3430}
3431
3348static int handle_task_switch(struct kvm_vcpu *vcpu) 3432static int handle_task_switch(struct kvm_vcpu *vcpu)
3349{ 3433{
3350 struct vcpu_vmx *vmx = to_vmx(vcpu); 3434 struct vcpu_vmx *vmx = to_vmx(vcpu);
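
handle_xsetbv() above reassembles the 64-bit XCR value from EDX:EAX, takes the XCR index from ECX, and only skips the instruction when kvm_set_xcr() accepts the write; on rejection RIP stays put so the fault path can run. The operand packing is trivial to show standalone:

#include <stdint.h>
#include <stdio.h>

/* XSETBV operands: ECX selects the XCR, EDX:EAX hold the value. */
static uint64_t read_edx_eax(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	uint32_t eax = 0x00000007;	/* x87 | SSE | AVX state bits */
	uint32_t edx = 0x00000000;
	uint32_t ecx = 0;		/* XCR0 */

	printf("xsetbv xcr%u <- %#llx\n", (unsigned)ecx,
	       (unsigned long long)read_edx_eax(edx, eax));
	return 0;
}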
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3554 goto out; 3638 goto out;
3555 } 3639 }
3556 3640
3557 if (err != EMULATE_DONE) { 3641 if (err != EMULATE_DONE)
3558 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3642 return 0;
3559 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3560 vcpu->run->internal.ndata = 0;
3561 ret = 0;
3562 goto out;
3563 }
3564 3643
3565 if (signal_pending(current)) 3644 if (signal_pending(current))
3566 goto out; 3645 goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3623 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3702 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3624 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3703 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3625 [EXIT_REASON_WBINVD] = handle_wbinvd, 3704 [EXIT_REASON_WBINVD] = handle_wbinvd,
3705 [EXIT_REASON_XSETBV] = handle_xsetbv,
3626 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3627 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3628 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3708 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3656 if (enable_ept && is_paging(vcpu)) 3736 if (enable_ept && is_paging(vcpu))
3657 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3737 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3658 3738
3739 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3740 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3741 vcpu->run->fail_entry.hardware_entry_failure_reason
3742 = exit_reason;
3743 return 0;
3744 }
3745
3659 if (unlikely(vmx->fail)) { 3746 if (unlikely(vmx->fail)) {
3660 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3747 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3661 vcpu->run->fail_entry.hardware_entry_failure_reason 3748 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
-	/*
-	 * Loading guest fpu may have cleared host cr0.ts
-	 */
-	vmcs_writel(HOST_CR0, read_cr0());
-
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(phys_addr);
+
+	vmcs_clear(vmcs);
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
+}
+
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx->vmcs)
 		goto free_msrs;
 
-	vmcs_clear(vmx->vmcs);
+	vmcs_init(vmx->vmcs);
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.rdtscp_supported = vmx_rdtscp_supported,
 
 	.set_supported_cpuid = vmx_set_supported_cpuid,
+
+	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..97aab036dabf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -41,17 +42,19 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 #include <asm/debugreg.h>
-#include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+			  | X86_CR4_OSXSAVE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+	return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		prev_nr = vcpu->arch.exception.nr;
 		if (prev_nr == DF_VECTOR) {
 			/* triple fault -> shutdown */
-			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 			return;
 		}
 		class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
 	return changed;
 }
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+	unsigned long old_cr0 = kvm_read_cr0(vcpu);
+	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+				    X86_CR0_CD | X86_CR0_NW;
+
 	cr0 |= X86_CR0_ET;
 
 #ifdef CONFIG_X86_64
-	if (cr0 & 0xffffffff00000000UL) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr0 & 0xffffffff00000000UL)
+		return 1;
 #endif
 
 	cr0 &= ~CR0_RESERVED_BITS;
 
-	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+		return 1;
 
-	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+		return 1;
 
 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
 		if ((vcpu->arch.efer & EFER_LME)) {
 			int cs_db, cs_l;
 
-			if (!is_pae(vcpu)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (!is_pae(vcpu))
+				return 1;
 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-			if (cs_l) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-
-			}
+			if (cs_l)
+				return 1;
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
-
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+			return 1;
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
-	kvm_mmu_reset_context(vcpu);
-	return;
+	if ((cr0 ^ old_cr0) & update_bits)
+		kvm_mmu_reset_context(vcpu);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+	u64 xcr0;
 
-	if (cr4 & CR4_RESERVED_BITS) {
+	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+	if (index != XCR_XFEATURE_ENABLED_MASK)
+		return 1;
+	xcr0 = xcr;
+	if (kvm_x86_ops->get_cpl(vcpu) != 0)
+		return 1;
+	if (!(xcr0 & XSTATE_FP))
+		return 1;
+	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+		return 1;
+	if (xcr0 & ~host_xcr0)
+		return 1;
+	vcpu->arch.xcr0 = xcr0;
+	vcpu->guest_xcr0_loaded = 0;
+	return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	if (__kvm_set_xcr(vcpu, index, xcr)) {
 		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
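__kvm_set_xcr() encodes the architectural XCR0 rules: only XCR0 itself (index 0) is writable, writes must come from CPL 0, the x87 bit must remain set, AVX state is only valid together with SSE state, and the guest may not enable anything the host has not enabled. The same checks restated as a standalone predicate (a sketch; the XSTATE_* values follow the SDM bit layout):

    #include <stdbool.h>
    #include <stdint.h>

    #define XSTATE_FP   (1ULL << 0)  /* x87 state: architecturally always set */
    #define XSTATE_SSE  (1ULL << 1)
    #define XSTATE_YMM  (1ULL << 2)  /* AVX state: valid only with SSE */

    static bool xcr0_value_valid(uint64_t xcr0, uint64_t host_xcr0)
    {
            if (!(xcr0 & XSTATE_FP))
                    return false;
            if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
                    return false;
            return (xcr0 & ~host_xcr0) == 0;  /* no feature beyond the host's */
    }
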
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
 		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+	if (cr4 & CR4_RESERVED_BITS)
+		return 1;
+
+	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+		return 1;
 
 	if (is_long_mode(vcpu)) {
-		if (!(cr4 & X86_CR4_PAE)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!(cr4 & X86_CR4_PAE))
+			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		return 1;
+
+	if (cr4 & X86_CR4_VMXE)
+		return 1;
 
-	if (cr4 & X86_CR4_VMXE) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
-	vcpu->arch.cr4 = cr4;
-	kvm_mmu_reset_context(vcpu);
+
+	if ((cr4 ^ old_cr4) & pdptr_bits)
+		kvm_mmu_reset_context(vcpu);
+
+	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+		update_cpuid(vcpu);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
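update_cpuid() mirrors the guest's CR4.OSXSAVE into the CPUID.1:ECX.OSXSAVE bit it reports, matching bare-metal behaviour. From inside the guest the standard feature probe then works unchanged; an illustrative userspace-style snippet, not kernel code:

    #include <cpuid.h>      /* GCC/Clang __get_cpuid() wrapper */
    #include <stdbool.h>

    static bool os_has_enabled_xsave(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return false;
            return ecx & (1u << 27);    /* CPUID.1:ECX bit 27 = OSXSAVE */
    }
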
 
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
-		return;
+		return 0;
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (cr3 & CR3_L_MODE_RESERVED_BITS)
+			return 1;
 	} else {
 		if (is_pae(vcpu)) {
-			if (cr3 & CR3_PAE_RESERVED_BITS) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (cr3 & CR3_PAE_RESERVED_BITS)
+				return 1;
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+				return 1;
 		}
 		/*
 		 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * to debug) behavior on the guest side.
 	 */
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		kvm_inject_gp(vcpu, 0);
-	else {
-		vcpu->arch.cr3 = cr3;
-		vcpu->arch.mmu.new_cr3(vcpu);
-	}
+		return 1;
+	vcpu->arch.cr3 = cr3;
+	vcpu->arch.mmu.new_cr3(vcpu);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-	if (cr8 & CR8_RESERVED_BITS) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr8 & CR8_RESERVED_BITS)
+		return 1;
 	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 		vcpu->arch.cr8 = cr8;
+	return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (__kvm_set_cr8(vcpu, cr8))
+		kvm_inject_gp(vcpu, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
 	switch (dr) {
 	case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 		vcpu->arch.eff_db[dr] = val;
 		break;
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 	case 6:
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 		break;
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 	default: /* 7 */
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
 	return 0;
 }
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	int res;
+
+	res = __kvm_set_dr(vcpu, dr, val);
+	if (res > 0)
+		kvm_queue_exception(vcpu, UD_VECTOR);
+	else if (res < 0)
+		kvm_inject_gp(vcpu, 0);
+
+	return res;
+}
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
 	switch (dr) {
 	case 0 ... 3:
 		*val = vcpu->arch.db[dr];
 		break;
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
-		}
 		/* fall through */
 	case 6:
 		*val = vcpu->arch.dr6;
 		break;
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
-		}
 		/* fall through */
 	default: /* 7 */
 		*val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
-	return 1 << (bitno & 31);
+	if (_kvm_get_dr(vcpu, dr, val)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+	return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_get_dr);
 
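The split into raw __kvm_set_dr()/_kvm_get_dr() helpers plus thin exported wrappers separates mechanism from policy: the inner functions only report the outcome and the wrappers inject, which lets the instruction emulator call the raw variants and decide about exception delivery itself. The convention, stated once as a sketch (names illustrative):

    /* Return-code convention used by the raw DR helpers above. */
    enum dr_access_result {
            DR_ACCESS_OK = 0,       /* value accepted */
            DR_ACCESS_UD = 1,       /* DR4/DR5 with CR4.DE set -> #UD */
            DR_ACCESS_GP = -1,      /* reserved high bits set  -> #GP(0) */
    };
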
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
+	MSR_IA32_MCG_STATUS,
+	MSR_IA32_MCG_CTL,
 };
 
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+	u64 old_efer = vcpu->arch.efer;
+
 	if (efer & efer_reserved_bits)
 		return 1;
 
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	kvm_x86_ops->set_efer(vcpu, efer);
 
-	vcpu->arch.efer = efer;
-
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 
+	/* Update reserved bits */
+	if ((efer ^ old_efer) & EFER_NX)
+		kvm_mmu_reset_context(vcpu);
+
 	return 0;
 }
 
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
 	if (!vcpu->time_page)
 		return 0;
-	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
 	return 1;
 }
 
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 {
 	int i, idx;
 
-	vcpu_load(vcpu);
-
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
-	vcpu_put(vcpu);
-
 	return i;
 }
 
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
+	case KVM_CAP_XSAVE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
+	case KVM_CAP_XCRS:
+		r = cpu_has_xsave;
+		break;
 	default:
 		r = 0;
 		break;
@@ -1717,8 +1785,28 @@ out:
 	return r;
 }
 
+static void wbinvd_ipi(void *garbage)
+{
+	wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	/* Address WBINVD may be executed by guest */
+	if (need_emulate_wbinvd(vcpu)) {
+		if (kvm_x86_ops->has_wbinvd_exit())
+			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+			smp_call_function_single(vcpu->cpu,
+					wbinvd_ipi, NULL, 1);
+	}
+
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
 		unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvm_put_guest_fpu(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
+	kvm_put_guest_fpu(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
-	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
+	update_cpuid(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
-	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
+	update_cpuid(vcpu);
 	return 0;
 
 out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 	int r;
 
-	vcpu_load(vcpu);
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
-	vcpu_put(vcpu);
 	return r;
 }
 
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, XSAVE, OSXSAVE */;
+		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	switch (function) {
 	case 0:
-		entry->eax = min(entry->eax, (u32)0xb);
+		entry->eax = min(entry->eax, (u32)0xd);
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 0xd: {
+		int i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (i = 1; *nent < maxnent; ++i) {
+			if (entry[i - 1].eax == 0 && i != 2)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
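CPUID leaf 0xD is index-significant: subleaf 0 describes the supported feature mask and save-area sizes, and each subleaf i >= 2 describes the save area for XCR0 bit i (subleaf 1 is reserved at this point, which is why the loop does not stop at i == 2 just because subleaf 1 reported empty). A guest-side walk of the same leaf might look like this (illustrative; relies on GCC's cpuid.h macros):

    #include <cpuid.h>
    #include <stdio.h>

    static void dump_xsave_leaves(void)
    {
            unsigned int eax, ebx, ecx, edx, i;

            for (i = 2; i < 63; i++) {
                    __cpuid_count(0xd, i, eax, ebx, ecx, edx);
                    if (!eax)
                            continue;   /* state component i absent */
                    printf("xstate %u: %u bytes at offset %u\n", i, eax, ebx);
            }
    }
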
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	vcpu_load(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	vcpu_load(vcpu);
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
 	kvm_apic_post_state_restore(vcpu);
 	update_cr8_intercept(vcpu);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
-	vcpu_load(vcpu);
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
-	vcpu_load(vcpu);
 	kvm_inject_nmi(vcpu);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
-	vcpu_load(vcpu);
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
-	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 			printk(KERN_DEBUG "kvm: set_mce: "
 			       "injects mce exception while "
 			       "previous one is in progress!\n");
-			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 			return 0;
 		}
 		if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
-	vcpu_load(vcpu);
-
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
-
-	vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
-	vcpu_load(vcpu);
-
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 					     struct kvm_debugregs *dbgregs)
 {
-	vcpu_load(vcpu);
-
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr7 = vcpu->arch.dr7;
 	dbgregs->flags = 0;
-
-	vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	if (dbgregs->flags)
 		return -EINVAL;
 
-	vcpu_load(vcpu);
-
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = dbgregs->dr6;
 	vcpu->arch.dr7 = dbgregs->dr7;
 
-	vcpu_put(vcpu);
+	return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					 struct kvm_xsave *guest_xsave)
+{
+	if (cpu_has_xsave)
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->xsave,
+			sizeof(struct xsave_struct));
+	else {
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->fxsave,
+			sizeof(struct i387_fxsave_struct));
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+			XSTATE_FPSSE;
+	}
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+					struct kvm_xsave *guest_xsave)
+{
+	u64 xstate_bv =
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
 
+	if (cpu_has_xsave)
+		memcpy(&vcpu->arch.guest_fpu.state->xsave,
+			guest_xsave->region, sizeof(struct xsave_struct));
+	else {
+		if (xstate_bv & ~XSTATE_FPSSE)
+			return -EINVAL;
+		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+	}
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+					struct kvm_xcrs *guest_xcrs)
+{
+	if (!cpu_has_xsave) {
+		guest_xcrs->nr_xcrs = 0;
+		return;
+	}
+
+	guest_xcrs->nr_xcrs = 1;
+	guest_xcrs->flags = 0;
+	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+				       struct kvm_xcrs *guest_xcrs)
+{
+	int i, r = 0;
+
+	if (!cpu_has_xsave)
+		return -EINVAL;
+
+	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+		return -EINVAL;
+
+	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+		/* Only support XCR0 currently */
+		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+				guest_xcrs->xcrs[0].value);
+			break;
+		}
+	if (r)
+		r = -EINVAL;
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r;
-	struct kvm_lapic_state *lapic = NULL;
+	union {
+		struct kvm_lapic_state *lapic;
+		struct kvm_xsave *xsave;
+		struct kvm_xcrs *xcrs;
+		void *buffer;
+	} u;
 
+	u.buffer = NULL;
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 		r = 0;
 		break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
 		r = -EFAULT;
-		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
 		r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
-		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 		break;
 	}
+	case KVM_GET_XSAVE: {
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xsave)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XSAVE: {
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xsave)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+		break;
+	}
+	case KVM_GET_XCRS: {
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xcrs)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, u.xcrs,
+				 sizeof(struct kvm_xcrs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XCRS: {
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xcrs)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(u.xcrs, argp,
+				   sizeof(struct kvm_xcrs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
 out:
-	kfree(lapic);
+	kfree(u.buffer);
 	return r;
 }
 
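The four new ioctls expose the extended state for save/restore and migration; struct kvm_xsave is a fixed 4 KiB region whatever the host supports, so the pair can be copied blindly between source and destination vcpus. Typical VMM usage (a sketch, error handling trimmed):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int migrate_xstate(int src_vcpu_fd, int dst_vcpu_fd)
    {
            struct kvm_xsave xsave;
            struct kvm_xcrs xcrs;

            if (ioctl(src_vcpu_fd, KVM_GET_XSAVE, &xsave) < 0 ||
                ioctl(src_vcpu_fd, KVM_GET_XCRS, &xcrs) < 0)
                    return -1;
            if (ioctl(dst_vcpu_fd, KVM_SET_XCRS, &xcrs) < 0 ||
                ioctl(dst_vcpu_fd, KVM_SET_XSAVE, &xsave) < 0)
                    return -1;
            return 0;
    }
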
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 	return kvm->arch.n_alloc_mmu_pages;
 }
 
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (alias->flags & KVM_ALIAS_INVALID)
-			continue;
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-					 struct kvm_memory_alias *alias)
-{
-	int r, n;
-	struct kvm_mem_alias *p;
-	struct kvm_mem_aliases *aliases, *old_aliases;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (alias->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->slot >= KVM_ALIAS_SLOTS)
-		goto out;
-	if (alias->guest_phys_addr + alias->memory_size
-	    < alias->guest_phys_addr)
-		goto out;
-	if (alias->target_phys_addr + alias->memory_size
-	    < alias->target_phys_addr)
-		goto out;
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out;
-
-	mutex_lock(&kvm->slots_lock);
-
-	/* invalidate any gfn reference in case of deletion/shrinking */
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kvm_mmu_zap_all(kvm);
-	kfree(old_aliases);
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out_unlock;
-
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
-	p = &aliases->aliases[alias->slot];
-	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-	p->npages = alias->memory_size >> PAGE_SHIFT;
-	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-	p->flags &= ~(KVM_ALIAS_INVALID);
-
-	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-		if (aliases->aliases[n - 1].npages)
-			break;
-	aliases->naliases = n;
-
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kfree(old_aliases);
-	r = 0;
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-out:
-	return r;
-}
-
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
 	int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	unsigned long n;
 	unsigned long is_dirty = 0;
-	unsigned long *dirty_bitmap = NULL;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 
-	r = -ENOMEM;
-	dirty_bitmap = vmalloc(n);
-	if (!dirty_bitmap)
-		goto out;
-	memset(dirty_bitmap, 0, n);
-
 	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
 		is_dirty = memslot->dirty_bitmap[i];
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 		struct kvm_memslots *slots, *old_slots;
+		unsigned long *dirty_bitmap;
 
 		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
 		spin_unlock(&kvm->mmu_lock);
 
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-		if (!slots)
-			goto out_free;
+		r = -ENOMEM;
+		dirty_bitmap = vmalloc(n);
+		if (!dirty_bitmap)
+			goto out;
+		memset(dirty_bitmap, 0, n);
 
+		r = -ENOMEM;
+		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		if (!slots) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		synchronize_srcu_expedited(&kvm->srcu);
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
+
+		r = -EFAULT;
+		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
+		vfree(dirty_bitmap);
+	} else {
+		r = -EFAULT;
+		if (clear_user(log->dirty_bitmap, n))
+			goto out;
 	}
 
 	r = 0;
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-		r = -EFAULT;
-out_free:
-	vfree(dirty_bitmap);
 out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
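The rework swaps in a freshly zeroed bitmap through an RCU-style memslots update instead of copying and clearing the live one, so the old bitmap can be handed to userspace without racing against vcpus that are still logging dirty pages; the nothing-dirty case degenerates to a clear_user() of the caller's buffer. Consumption from userspace is unchanged (sketch):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* Fetch-and-reset the dirty bitmap for one memory slot; `bitmap'
     * must hold at least one bit per page in the slot. */
    static int poll_dirty_log(int vm_fd, int slot, void *bitmap)
    {
            struct kvm_dirty_log log = { 0 };

            log.slot = slot;
            log.dirty_bitmap = bitmap;
            return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
    }
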
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	union {
 		struct kvm_pit_state ps;
 		struct kvm_pit_state2 ps2;
-		struct kvm_memory_alias alias;
 		struct kvm_pit_config pit_config;
 	} u;
 
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
-	case KVM_SET_MEMORY_REGION: {
-		struct kvm_memory_region kvm_mem;
-		struct kvm_userspace_memory_region kvm_userspace_mem;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-			goto out;
-		kvm_userspace_mem.slot = kvm_mem.slot;
-		kvm_userspace_mem.flags = kvm_mem.flags;
-		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
-		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
-		if (r)
-			goto out;
-		break;
-	}
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
-	case KVM_SET_MEMORY_ALIAS:
-		r = -EFAULT;
-		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
-		if (r)
-			goto out;
-		break;
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_pic *vpic;
 
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 		}
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 		}
 
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 		}
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 		}
 
@@ -3330,10 +3407,10 @@ out:
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
+				  unsigned int *error_code,
 				  struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
-	u32 error_code;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 0;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
 
-	return X86EMUL_UNHANDLEABLE;
+	return X86EMUL_IO_NEEDED;
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
+					   unsigned int *error_code,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
-	u32 error_code;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+	memcpy(vcpu->run->mmio.data, val, bytes);
 
 	return X86EMUL_CONTINUE;
 }
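Writing the exit information straight into vcpu->run at emulation time means the vcpu can return to userspace with KVM_EXIT_MMIO already populated instead of copying it later. On the VMM side the exit is completed like this (a sketch; device_read() and device_write() are hypothetical helpers):

    /* Inside the KVM_RUN loop, after the ioctl returns: */
    if (run->exit_reason == KVM_EXIT_MMIO) {
            if (run->mmio.is_write)
                    device_write(run->mmio.phys_addr,
                                 run->mmio.data, run->mmio.len);
            else    /* fill run->mmio.data before the next KVM_RUN */
                    device_read(run->mmio.phys_addr,
                                run->mmio.data, run->mmio.len);
    }
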
@@ -3431,6 +3506,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
 			    const void *val,
 			    unsigned int bytes,
+			    unsigned int *error_code,
 			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+						     vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+	return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+					       vcpu);
 }
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
 	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
+				     unsigned int *error_code,
 				     struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		goto emul_write;
 
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		goto emul_write;
+	}
 
 	kaddr = kmap_atomic(page, KM_USER0);
 	kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
-	return emulator_write_emulated(addr, new, bytes, vcpu);
+	return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
 }
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 	return X86EMUL_CONTINUE;
 }
 
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
-	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-	kvm_x86_ops->fpu_activate(vcpu);
+	if (!need_emulate_wbinvd(vcpu))
+		return X86EMUL_CONTINUE;
+
+	if (kvm_x86_ops->has_wbinvd_exit()) {
+		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+				wbinvd_ipi, NULL, 1);
+		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+	}
+	wbinvd();
 	return X86EMUL_CONTINUE;
 }
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
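WBINVD needs real emulation only when a passthrough device without coherent DMA is attached (need_emulate_wbinvd() above); wbinvd_dirty_mask remembers every physical CPU the vcpu has run on since the last flush, so the flush IPIs stay bounded to where the guest's cached data may actually live. A toy model of that bookkeeping (illustrative, not kernel code):

    #include <stdint.h>

    /* One bit per physical CPU the vCPU has touched since the last flush. */
    struct wbinvd_mask { uint64_t bits; };

    static void mark_ran_on(struct wbinvd_mask *m, unsigned int cpu)
    {
            m->bits |= 1ULL << cpu;
    }

    static void flush_and_clear(struct wbinvd_mask *m,
                                void (*flush)(unsigned int cpu))
    {
            unsigned int cpu;

            for (cpu = 0; cpu < 64; cpu++)
                    if (m->bits & (1ULL << cpu))
                            flush(cpu);  /* stands in for the wbinvd IPI */
            m->bits = 0;
    }
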
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
 {
-	return kvm_get_dr(ctxt->vcpu, dr, dest);
+	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+	kvm_x86_ops->fpu_activate(vcpu);
+	return X86EMUL_CONTINUE;
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
 {
-	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
-	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+	return _kvm_get_dr(vcpu, dr, dest);
 }
 
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 {
-	u8 opcodes[4];
-	unsigned long rip = kvm_rip_read(vcpu);
-	unsigned long rip_linear;
-
-	if (!printk_ratelimit())
-		return;
 
-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+	return __kvm_set_dr(vcpu, dr, value);
 }
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 {
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
 	return value;
 }
 
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 {
+	int res = 0;
+
 	switch (cr) {
 	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
 		break;
 	case 3:
-		kvm_set_cr3(vcpu, val);
+		res = kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 		break;
 	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
+		res = __kvm_set_cr8(vcpu, val & 0xfUL);
 		break;
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		res = -1;
 	}
+
+	return res;
 }
 
 static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3707 kvm_x86_ops->get_gdt(vcpu, dt); 3790 kvm_x86_ops->get_gdt(vcpu, dt);
3708} 3791}
3709 3792
3793static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu)
3795{
3796 return get_segment_base(vcpu, seg);
3797}
3798
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu) 3800 struct kvm_vcpu *vcpu)
3712{ 3801{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3868 kvm_set_segment(vcpu, &kvm_seg, seg);
3780} 3869}
3781 3870
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = { 3871static struct x86_emulate_ops emulate_ops = {
3788 .read_std = kvm_read_guest_virt_system, 3872 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector, 3882 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base,
3800 .get_gdt = emulator_get_gdt, 3885 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr, 3886 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr, 3887 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl, 3888 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags, 3889 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr,
3892 .get_msr = kvm_get_msr,
3805}; 3893};
3806 3894
3807static void cache_all_regs(struct kvm_vcpu *vcpu) 3895static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
3812 vcpu->arch.regs_dirty = ~0; 3900 vcpu->arch.regs_dirty = ~0;
3813} 3901}
3814 3902
3903static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3904{
3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
3906 /*
 3907	 * an sti; sti; sequence only disables interrupts for the first
 3908	 * instruction. So, if the last instruction, be it emulated or
 3909	 * not, left the system with the INT_STI flag enabled, it
 3910	 * means that the last instruction was an sti. We should not
 3911	 * leave the flag on in this case. The same goes for mov ss.
3912 */
3913 if (!(int_shadow & mask))
3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
3915}
3916
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
3922 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else
3925 kvm_queue_exception(vcpu, ctxt->exception);
3926}
3927
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{
3930 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3934 vcpu->run->internal.ndata = 0;
3935 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL;
3937}
3938
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3940{
3941 gpa_t gpa;
3942
3943 if (tdp_enabled)
3944 return false;
3945
3946 /*
 3947	 * If emulation was due to an access to a shadowed page table
 3948	 * and it failed, try to unshadow the page and re-enter the
 3949	 * guest to let the CPU execute the instruction.
3950 */
3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
3952 return true;
3953
3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
3955
3956 if (gpa == UNMAPPED_GVA)
3957 return true; /* let cpu generate fault */
3958
3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
3960 return true;
3961
3962 return false;
3963}
3964
3815int emulate_instruction(struct kvm_vcpu *vcpu, 3965int emulate_instruction(struct kvm_vcpu *vcpu,
3816 unsigned long cr2, 3966 unsigned long cr2,
3817 u16 error_code, 3967 u16 error_code,
3818 int emulation_type) 3968 int emulation_type)
3819{ 3969{
3820 int r, shadow_mask; 3970 int r;
3821 struct decode_cache *c; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
3822 struct kvm_run *run = vcpu->run;
3823 3972
3824 kvm_clear_exception_queue(vcpu); 3973 kvm_clear_exception_queue(vcpu);
3825 vcpu->arch.mmio_fault_cr2 = cr2; 3974 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3831 */ 3980 */
3832 cache_all_regs(vcpu); 3981 cache_all_regs(vcpu);
3833 3982
3834 vcpu->mmio_is_write = 0;
3835
3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837 int cs_db, cs_l; 3984 int cs_db, cs_l;
3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3846 ? X86EMUL_MODE_VM86 : cs_l 3993 ? X86EMUL_MODE_VM86 : cs_l
3847 ? X86EMUL_MODE_PROT64 : cs_db 3994 ? X86EMUL_MODE_PROT64 : cs_db
3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1;
3849 4000
3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu); 4002 trace_kvm_emulate_insn_start(vcpu);
3852 4003
3853 /* Only allow emulation of specific instructions on #UD 4004 /* Only allow emulation of specific instructions on #UD
 3854	 * (namely VMMCALL, sysenter, sysexit, syscall) */ 4005	 * (namely VMMCALL, sysenter, sysexit, syscall) */
3855 c = &vcpu->arch.emulate_ctxt.decode;
3856 if (emulation_type & EMULTYPE_TRAP_UD) { 4006 if (emulation_type & EMULTYPE_TRAP_UD) {
3857 if (!c->twobyte) 4007 if (!c->twobyte)
3858 return EMULATE_FAIL; 4008 return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3880 4030
3881 ++vcpu->stat.insn_emulation; 4031 ++vcpu->stat.insn_emulation;
3882 if (r) { 4032 if (r) {
3883 ++vcpu->stat.insn_emulation_fail; 4033 if (reexecute_instruction(vcpu, cr2))
3884 trace_kvm_emulate_insn_failed(vcpu);
3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886 return EMULATE_DONE; 4034 return EMULATE_DONE;
3887 return EMULATE_FAIL; 4035 if (emulation_type & EMULTYPE_SKIP)
4036 return EMULATE_FAIL;
4037 return handle_emulation_failure(vcpu);
3888 } 4038 }
3889 } 4039 }
3890 4040
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3893 return EMULATE_DONE; 4043 return EMULATE_DONE;
3894 } 4044 }
3895 4045
 4046	/* this is needed for the vmware backdoor interface to work since it
 4047	   changes register values during IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049
3896restart: 4050restart:
3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899 4052
3900 if (r == 0) 4053 if (r) { /* emulation failed */
3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 4054 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE;
3902 4056
3903 if (vcpu->arch.pio.count) { 4057 return handle_emulation_failure(vcpu);
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3906 return EMULATE_DO_MMIO;
3907 } 4058 }
3908 4059
3909 if (r || vcpu->mmio_is_write) { 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
3910 run->exit_reason = KVM_EXIT_MMIO; 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3911 run->mmio.phys_addr = vcpu->mmio_phys_addr; 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
3912 memcpy(run->mmio.data, vcpu->mmio_data, 8); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
3913 run->mmio.len = vcpu->mmio_size; 4064
3914 run->mmio.is_write = vcpu->mmio_is_write; 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE;
3915 } 4068 }
3916 4069
3917 if (r) { 4070 if (vcpu->arch.pio.count) {
3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4071 if (!vcpu->arch.pio.in)
3919 goto done; 4072 vcpu->arch.pio.count = 0;
3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3923 kvm_report_emulation_failure(vcpu, "mmio");
3924 return EMULATE_FAIL;
3925 }
3926 return EMULATE_DO_MMIO; 4073 return EMULATE_DO_MMIO;
3927 } 4074 }
3928 4075
3929 if (vcpu->mmio_is_write) { 4076 if (vcpu->mmio_needed) {
3930 vcpu->mmio_needed = 0; 4077 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0;
3931 return EMULATE_DO_MMIO; 4079 return EMULATE_DO_MMIO;
3932 } 4080 }
3933 4081
3934done:
3935 if (vcpu->arch.exception.pending)
3936 vcpu->arch.emulate_ctxt.restart = false;
3937
3938 if (vcpu->arch.emulate_ctxt.restart) 4082 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart; 4083 goto restart;
3940 4084
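
The rewritten emulate_instruction() above now reports unrecoverable failures to userspace via handle_emulation_failure() instead of the removed kvm_report_emulation_failure() printk. A minimal sketch of how a userspace VMM's run loop would observe that exit; 'vcpu_fd' and the mmap'ed 'run' structure are assumed to have been set up by the usual KVM_CREATE_VCPU/mmap sequence and are not shown here:

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch only: 'vcpu_fd' and 'run' come from VM setup elsewhere. */
	static int handle_one_exit(int vcpu_fd, struct kvm_run *run)
	{
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;
		if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
		    run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
			/* handle_emulation_failure() sets ndata = 0 */
			fprintf(stderr, "kvm: instruction emulation failed\n");
			return -1;
		}
		return 0;
	}
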
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
4108 4252
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110 4254
4255 if (cpu_has_xsave)
4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4257
4111 return 0; 4258 return 0;
4112 4259
4113out: 4260out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4270 4417
4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272 4419
4273 return emulator_write_emulated(rip, instruction, 3, vcpu); 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
4274} 4421}
4275 4422
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4506 } 4653 }
4507} 4654}
4508 4655
4656static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
4657{
4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
4659 !vcpu->guest_xcr0_loaded) {
4660 /* kvm_set_xcr() also depends on this */
4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
4662 vcpu->guest_xcr0_loaded = 1;
4663 }
4664}
4665
4666static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4667{
4668 if (vcpu->guest_xcr0_loaded) {
4669 if (vcpu->arch.xcr0 != host_xcr0)
4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
4671 vcpu->guest_xcr0_loaded = 0;
4672 }
4673}
4674
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{ 4676{
4511 int r; 4677 int r;
4512 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513 vcpu->run->request_interrupt_window; 4679 vcpu->run->request_interrupt_window;
4514 4680
4515 if (vcpu->requests)
4516 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517 kvm_mmu_unload(vcpu);
4518
4519 r = kvm_mmu_reload(vcpu);
4520 if (unlikely(r))
4521 goto out;
4522
4523 if (vcpu->requests) { 4681 if (vcpu->requests) {
4524 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
4683 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4525 __kvm_migrate_timers(vcpu); 4685 __kvm_migrate_timers(vcpu);
4526 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
4527 kvm_write_guest_time(vcpu); 4687 kvm_write_guest_time(vcpu);
4528 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4529 kvm_mmu_sync_roots(vcpu); 4689 kvm_mmu_sync_roots(vcpu);
4530 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
4531 kvm_x86_ops->tlb_flush(vcpu); 4691 kvm_x86_ops->tlb_flush(vcpu);
4532 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
4533 &vcpu->requests)) {
4534 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535 r = 0; 4694 r = 0;
4536 goto out; 4695 goto out;
4537 } 4696 }
4538 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
4539 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540 r = 0; 4699 r = 0;
4541 goto out; 4700 goto out;
4542 } 4701 }
4543 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
4544 vcpu->fpu_active = 0; 4703 vcpu->fpu_active = 0;
4545 kvm_x86_ops->fpu_deactivate(vcpu); 4704 kvm_x86_ops->fpu_deactivate(vcpu);
4546 } 4705 }
4547 } 4706 }
4548 4707
4708 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r))
4710 goto out;
4711
4549 preempt_disable(); 4712 preempt_disable();
4550 4713
4551 kvm_x86_ops->prepare_guest_switch(vcpu); 4714 kvm_x86_ops->prepare_guest_switch(vcpu);
4552 if (vcpu->fpu_active) 4715 if (vcpu->fpu_active)
4553 kvm_load_guest_fpu(vcpu); 4716 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu);
4554 4718
4555 local_irq_disable(); 4719 atomic_set(&vcpu->guest_mode, 1);
4720 smp_wmb();
4556 4721
4557 clear_bit(KVM_REQ_KICK, &vcpu->requests); 4722 local_irq_disable();
4558 smp_mb__after_clear_bit();
4559 4723
4560 if (vcpu->requests || need_resched() || signal_pending(current)) { 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
4561 set_bit(KVM_REQ_KICK, &vcpu->requests); 4725 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0);
4727 smp_wmb();
4562 local_irq_enable(); 4728 local_irq_enable();
4563 preempt_enable(); 4729 preempt_enable();
4564 r = 1; 4730 r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4603 if (hw_breakpoint_active()) 4769 if (hw_breakpoint_active())
4604 hw_breakpoint_restore(); 4770 hw_breakpoint_restore();
4605 4771
4606 set_bit(KVM_REQ_KICK, &vcpu->requests); 4772 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb();
4607 local_irq_enable(); 4774 local_irq_enable();
4608 4775
4609 ++vcpu->stat.exits; 4776 ++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666 kvm_vcpu_block(vcpu); 4833 kvm_vcpu_block(vcpu);
4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
4669 { 4836 {
4670 switch(vcpu->arch.mp_state) { 4837 switch(vcpu->arch.mp_state) {
4671 case KVM_MP_STATE_HALTED: 4838 case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4717 int r; 4884 int r;
4718 sigset_t sigsaved; 4885 sigset_t sigsaved;
4719 4886
4720 vcpu_load(vcpu);
4721
4722 if (vcpu->sigset_active) 4887 if (vcpu->sigset_active)
4723 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724 4889
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746 if (r == EMULATE_DO_MMIO) { 4911 if (r != EMULATE_DONE) {
4747 r = 0; 4912 r = 0;
4748 goto out; 4913 goto out;
4749 } 4914 }
@@ -4759,14 +4924,11 @@ out:
4759 if (vcpu->sigset_active) 4924 if (vcpu->sigset_active)
4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761 4926
4762 vcpu_put(vcpu);
4763 return r; 4927 return r;
4764} 4928}
4765 4929
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{ 4931{
4768 vcpu_load(vcpu);
4769
4770 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4789 regs->rip = kvm_rip_read(vcpu); 4951 regs->rip = kvm_rip_read(vcpu);
4790 regs->rflags = kvm_get_rflags(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu);
4791 4953
4792 vcpu_put(vcpu);
4793
4794 return 0; 4954 return 0;
4795} 4955}
4796 4956
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{ 4958{
4799 vcpu_load(vcpu);
4800
4801 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4822 4980
4823 vcpu->arch.exception.pending = false; 4981 vcpu->arch.exception.pending = false;
4824 4982
4825 vcpu_put(vcpu);
4826
4827 return 0; 4983 return 0;
4828} 4984}
4829 4985
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4842{ 4998{
4843 struct desc_ptr dt; 4999 struct desc_ptr dt;
4844 5000
4845 vcpu_load(vcpu);
4846
4847 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4875 set_bit(vcpu->arch.interrupt.nr, 5029 set_bit(vcpu->arch.interrupt.nr,
4876 (unsigned long *)sregs->interrupt_bitmap); 5030 (unsigned long *)sregs->interrupt_bitmap);
4877 5031
4878 vcpu_put(vcpu);
4879
4880 return 0; 5032 return 0;
4881} 5033}
4882 5034
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5035int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884 struct kvm_mp_state *mp_state) 5036 struct kvm_mp_state *mp_state)
4885{ 5037{
4886 vcpu_load(vcpu);
4887 mp_state->mp_state = vcpu->arch.mp_state; 5038 mp_state->mp_state = vcpu->arch.mp_state;
4888 vcpu_put(vcpu);
4889 return 0; 5039 return 0;
4890} 5040}
4891 5041
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5042int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893 struct kvm_mp_state *mp_state) 5043 struct kvm_mp_state *mp_state)
4894{ 5044{
4895 vcpu_load(vcpu);
4896 vcpu->arch.mp_state = mp_state->mp_state; 5045 vcpu->arch.mp_state = mp_state->mp_state;
4897 vcpu_put(vcpu);
4898 return 0; 5046 return 0;
4899} 5047}
4900 5048
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5049int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902 bool has_error_code, u32 error_code) 5050 bool has_error_code, u32 error_code)
4903{ 5051{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4904 int cs_db, cs_l, ret; 5053 int cs_db, cs_l, ret;
4905 cache_all_regs(vcpu); 5054 cache_all_regs(vcpu);
4906 5055
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4915 ? X86EMUL_MODE_VM86 : cs_l 5064 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db 5065 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4918 5069
4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920 tss_selector, reason, has_error_code, 5071 tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4923 if (ret) 5074 if (ret)
4924 return EMULATE_FAIL; 5075 return EMULATE_FAIL;
4925 5076
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4927 return EMULATE_DONE; 5080 return EMULATE_DONE;
4928} 5081}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4935 int pending_vec, max_bits; 5088 int pending_vec, max_bits;
4936 struct desc_ptr dt; 5089 struct desc_ptr dt;
4937 5090
4938 vcpu_load(vcpu);
4939
4940 dt.size = sregs->idt.limit; 5091 dt.size = sregs->idt.limit;
4941 dt.address = sregs->idt.base; 5092 dt.address = sregs->idt.base;
4942 kvm_x86_ops->set_idt(vcpu, &dt); 5093 kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4996 !is_protmode(vcpu)) 5147 !is_protmode(vcpu))
4997 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4998 5149
4999 vcpu_put(vcpu);
5000
5001 return 0; 5150 return 0;
5002} 5151}
5003 5152
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5007 unsigned long rflags; 5156 unsigned long rflags;
5008 int i, r; 5157 int i, r;
5009 5158
5010 vcpu_load(vcpu);
5011
5012 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5159 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
5013 r = -EBUSY; 5160 r = -EBUSY;
5014 if (vcpu->arch.exception.pending) 5161 if (vcpu->arch.exception.pending)
5015 goto unlock_out; 5162 goto out;
5016 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5163 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5017 kvm_queue_exception(vcpu, DB_VECTOR); 5164 kvm_queue_exception(vcpu, DB_VECTOR);
5018 else 5165 else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5054 5201
5055 r = 0; 5202 r = 0;
5056 5203
5057unlock_out: 5204out:
5058 vcpu_put(vcpu);
5059 5205
5060 return r; 5206 return r;
5061} 5207}
5062 5208
5063/* 5209/*
5064 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
5065 * we have asm/x86/processor.h
5066 */
5067struct fxsave {
5068 u16 cwd;
5069 u16 swd;
5070 u16 twd;
5071 u16 fop;
5072 u64 rip;
5073 u64 rdp;
5074 u32 mxcsr;
5075 u32 mxcsr_mask;
5076 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
5077#ifdef CONFIG_X86_64
5078 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
5079#else
5080 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
5081#endif
5082};
5083
5084/*
5085 * Translate a guest virtual address to a guest physical address. 5210 * Translate a guest virtual address to a guest physical address.
5086 */ 5211 */
5087int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5212int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5091 gpa_t gpa; 5216 gpa_t gpa;
5092 int idx; 5217 int idx;
5093 5218
5094 vcpu_load(vcpu);
5095 idx = srcu_read_lock(&vcpu->kvm->srcu); 5219 idx = srcu_read_lock(&vcpu->kvm->srcu);
5096 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5220 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
5097 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5221 srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5099 tr->valid = gpa != UNMAPPED_GVA; 5223 tr->valid = gpa != UNMAPPED_GVA;
5100 tr->writeable = 1; 5224 tr->writeable = 1;
5101 tr->usermode = 0; 5225 tr->usermode = 0;
5102 vcpu_put(vcpu);
5103 5226
5104 return 0; 5227 return 0;
5105} 5228}
5106 5229
5107int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5230int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5108{ 5231{
5109 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5232 struct i387_fxsave_struct *fxsave =
5110 5233 &vcpu->arch.guest_fpu.state->fxsave;
5111 vcpu_load(vcpu);
5112 5234
5113 memcpy(fpu->fpr, fxsave->st_space, 128); 5235 memcpy(fpu->fpr, fxsave->st_space, 128);
5114 fpu->fcw = fxsave->cwd; 5236 fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5119 fpu->last_dp = fxsave->rdp; 5241 fpu->last_dp = fxsave->rdp;
5120 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5242 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
5121 5243
5122 vcpu_put(vcpu);
5123
5124 return 0; 5244 return 0;
5125} 5245}
5126 5246
5127int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5247int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5128{ 5248{
5129 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5249 struct i387_fxsave_struct *fxsave =
5130 5250 &vcpu->arch.guest_fpu.state->fxsave;
5131 vcpu_load(vcpu);
5132 5251
5133 memcpy(fxsave->st_space, fpu->fpr, 128); 5252 memcpy(fxsave->st_space, fpu->fpr, 128);
5134 fxsave->cwd = fpu->fcw; 5253 fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5139 fxsave->rdp = fpu->last_dp; 5258 fxsave->rdp = fpu->last_dp;
5140 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5259 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
5141 5260
5142 vcpu_put(vcpu);
5143
5144 return 0; 5261 return 0;
5145} 5262}
5146 5263
5147void fx_init(struct kvm_vcpu *vcpu) 5264int fx_init(struct kvm_vcpu *vcpu)
5148{ 5265{
5149 unsigned after_mxcsr_mask; 5266 int err;
5267
5268 err = fpu_alloc(&vcpu->arch.guest_fpu);
5269 if (err)
5270 return err;
5271
5272 fpu_finit(&vcpu->arch.guest_fpu);
5150 5273
5151 /* 5274 /*
5152 * Touch the fpu the first time in non atomic context as if 5275 * Ensure guest xcr0 is valid for loading
5153 * this is the first fpu instruction the exception handler
5154 * will fire before the instruction returns and it'll have to
5155 * allocate ram with GFP_KERNEL.
5156 */ 5276 */
5157 if (!used_math()) 5277 vcpu->arch.xcr0 = XSTATE_FP;
5158 kvm_fx_save(&vcpu->arch.host_fx_image);
5159
5160 /* Initialize guest FPU by resetting ours and saving into guest's */
5161 preempt_disable();
5162 kvm_fx_save(&vcpu->arch.host_fx_image);
5163 kvm_fx_finit();
5164 kvm_fx_save(&vcpu->arch.guest_fx_image);
5165 kvm_fx_restore(&vcpu->arch.host_fx_image);
5166 preempt_enable();
5167 5278
5168 vcpu->arch.cr0 |= X86_CR0_ET; 5279 vcpu->arch.cr0 |= X86_CR0_ET;
5169 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 5280
5170 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 5281 return 0;
5171 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
5172 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
5173} 5282}
5174EXPORT_SYMBOL_GPL(fx_init); 5283EXPORT_SYMBOL_GPL(fx_init);
5175 5284
5285static void fx_free(struct kvm_vcpu *vcpu)
5286{
5287 fpu_free(&vcpu->arch.guest_fpu);
5288}
5289
5176void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5290void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
5177{ 5291{
5178 if (vcpu->guest_fpu_loaded) 5292 if (vcpu->guest_fpu_loaded)
5179 return; 5293 return;
5180 5294
5295 /*
5296 * Restore all possible states in the guest,
 5297	 * and assume the host would use all available bits.
 5298	 * Guest xcr0 will be loaded later.
5299 */
5300 kvm_put_guest_xcr0(vcpu);
5181 vcpu->guest_fpu_loaded = 1; 5301 vcpu->guest_fpu_loaded = 1;
5182 kvm_fx_save(&vcpu->arch.host_fx_image); 5302 unlazy_fpu(current);
5183 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5303 fpu_restore_checking(&vcpu->arch.guest_fpu);
5184 trace_kvm_fpu(1); 5304 trace_kvm_fpu(1);
5185} 5305}
5186 5306
5187void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5307void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5188{ 5308{
5309 kvm_put_guest_xcr0(vcpu);
5310
5189 if (!vcpu->guest_fpu_loaded) 5311 if (!vcpu->guest_fpu_loaded)
5190 return; 5312 return;
5191 5313
5192 vcpu->guest_fpu_loaded = 0; 5314 vcpu->guest_fpu_loaded = 0;
5193 kvm_fx_save(&vcpu->arch.guest_fx_image); 5315 fpu_save_init(&vcpu->arch.guest_fpu);
5194 kvm_fx_restore(&vcpu->arch.host_fx_image);
5195 ++vcpu->stat.fpu_reload; 5316 ++vcpu->stat.fpu_reload;
5196 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); 5317 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
5197 trace_kvm_fpu(0); 5318 trace_kvm_fpu(0);
5198} 5319}
5199 5320
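
The fx_init()/kvm_load_guest_fpu()/kvm_put_guest_fpu() changes above drop the private kvm_fx_* image in favor of the core i387 fpu API. A compressed, hedged sketch of the resulting lifecycle, using only the calls the patch itself introduces (illustrative kernel-side pseudocode, not a complete function):

	#include <linux/kvm_host.h>
	#include <asm/i387.h>

	static int guest_fpu_lifecycle(struct kvm_vcpu *vcpu)
	{
		int err = fpu_alloc(&vcpu->arch.guest_fpu);	/* fx_init() */
		if (err)
			return err;				/* now fallible */
		fpu_finit(&vcpu->arch.guest_fpu);

		unlazy_fpu(current);				/* load path: flush host state */
		fpu_restore_checking(&vcpu->arch.guest_fpu);	/* then load guest state */

		fpu_save_init(&vcpu->arch.guest_fpu);		/* put path: save guest state */

		fpu_free(&vcpu->arch.guest_fpu);		/* fx_free() on teardown */
		return 0;
	}

Since the fxsave area is now dynamically allocated (and aligned by the allocator), the statically embedded image is gone, and the 16-byte alignment BUG_ON in kvm_arch_vcpu_setup() goes with it.
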
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5204 vcpu->arch.time_page = NULL; 5325 vcpu->arch.time_page = NULL;
5205 } 5326 }
5206 5327
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu);
5207 kvm_x86_ops->vcpu_free(vcpu); 5330 kvm_x86_ops->vcpu_free(vcpu);
5208} 5331}
5209 5332
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
5217{ 5340{
5218 int r; 5341 int r;
5219 5342
5220 /* We do fxsave: this must be aligned. */
5221 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
5222
5223 vcpu->arch.mtrr_state.have_fixed = 1; 5343 vcpu->arch.mtrr_state.have_fixed = 1;
5224 vcpu_load(vcpu); 5344 vcpu_load(vcpu);
5225 r = kvm_arch_vcpu_reset(vcpu); 5345 r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5241 kvm_mmu_unload(vcpu); 5361 kvm_mmu_unload(vcpu);
5242 vcpu_put(vcpu); 5362 vcpu_put(vcpu);
5243 5363
5364 fx_free(vcpu);
5244 kvm_x86_ops->vcpu_free(vcpu); 5365 kvm_x86_ops->vcpu_free(vcpu);
5245} 5366}
5246 5367
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5334 } 5455 }
5335 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5456 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5336 5457
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks;
5460
5337 return 0; 5461 return 0;
5462fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks);
5338fail_free_lapic: 5464fail_free_lapic:
5339 kvm_free_lapic(vcpu); 5465 kvm_free_lapic(vcpu);
5340fail_mmu_destroy: 5466fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
5364 if (!kvm) 5490 if (!kvm)
5365 return ERR_PTR(-ENOMEM); 5491 return ERR_PTR(-ENOMEM);
5366 5492
5367 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5368 if (!kvm->arch.aliases) {
5369 kfree(kvm);
5370 return ERR_PTR(-ENOMEM);
5371 }
5372
5373 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5374 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5375 5495
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
5412void kvm_arch_sync_events(struct kvm *kvm) 5532void kvm_arch_sync_events(struct kvm *kvm)
5413{ 5533{
5414 kvm_free_all_assigned_devices(kvm); 5534 kvm_free_all_assigned_devices(kvm);
5535 kvm_free_pit(kvm);
5415} 5536}
5416 5537
5417void kvm_arch_destroy_vm(struct kvm *kvm) 5538void kvm_arch_destroy_vm(struct kvm *kvm)
5418{ 5539{
5419 kvm_iommu_unmap_guest(kvm); 5540 kvm_iommu_unmap_guest(kvm);
5420 kvm_free_pit(kvm);
5421 kfree(kvm->arch.vpic); 5541 kfree(kvm->arch.vpic);
5422 kfree(kvm->arch.vioapic); 5542 kfree(kvm->arch.vioapic);
5423 kvm_free_vcpus(kvm); 5543 kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5427 if (kvm->arch.ept_identity_pagetable) 5547 if (kvm->arch.ept_identity_pagetable)
5428 put_page(kvm->arch.ept_identity_pagetable); 5548 put_page(kvm->arch.ept_identity_pagetable);
5429 cleanup_srcu_struct(&kvm->srcu); 5549 cleanup_srcu_struct(&kvm->srcu);
5430 kfree(kvm->arch.aliases);
5431 kfree(kvm); 5550 kfree(kvm);
5432} 5551}
5433 5552
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5438 int user_alloc) 5557 int user_alloc)
5439{ 5558{
5440 int npages = memslot->npages; 5559 int npages = memslot->npages;
5560 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
5561
5562 /* Prevent internal slot pages from being moved by fork()/COW. */
5563 if (memslot->id >= KVM_MEMORY_SLOTS)
5564 map_flags = MAP_SHARED | MAP_ANONYMOUS;
5441 5565
 5442	 /* To keep backward compatibility with older userspace, 5566	 /* To keep backward compatibility with older userspace,
 5443	  * x86 needs to handle the !user_alloc case. 5567	  * x86 needs to handle the !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5450 userspace_addr = do_mmap(NULL, 0, 5574 userspace_addr = do_mmap(NULL, 0,
5451 npages * PAGE_SIZE, 5575 npages * PAGE_SIZE,
5452 PROT_READ | PROT_WRITE, 5576 PROT_READ | PROT_WRITE,
5453 MAP_PRIVATE | MAP_ANONYMOUS, 5577 map_flags,
5454 0); 5578 0);
5455 up_write(&current->mm->mmap_sem); 5579 up_write(&current->mm->mmap_sem);
5456 5580
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5523 5647
5524 me = get_cpu(); 5648 me = get_cpu();
5525 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5526 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5650 if (atomic_xchg(&vcpu->guest_mode, 0))
5527 smp_send_reschedule(cpu); 5651 smp_send_reschedule(cpu);
5528 put_cpu(); 5652 put_cpu();
5529} 5653}
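
kvm_vcpu_kick() now keys off the new guest_mode atomic rather than the KVM_REQ_KICK request bit: the vcpu raises the flag before its final work check, and a kicker sends the reschedule IPI only if it atomically observes and clears the flag. A standalone C11 model of the handshake (purely illustrative; the real code also relies on smp_wmb() and disabled interrupts):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int guest_mode;

	/* vcpu side: runs with interrupts disabled in the real code */
	static bool try_enter_guest(bool pending_work)
	{
		atomic_store(&guest_mode, 1);
		if (pending_work || !atomic_load(&guest_mode)) {
			atomic_store(&guest_mode, 0);	/* bail out and retry */
			return false;
		}
		return true;				/* safe to enter the guest */
	}

	/* kicker side: the IPI is needed only if the vcpu was in guest mode */
	static bool kick_vcpu(void)
	{
		return atomic_exchange(&guest_mode, 0) != 0;
	}

	int main(void)
	{
		printf("%d\n", try_enter_guest(false));	/* 1: entered */
		printf("%d\n", kick_vcpu());		/* 1: IPI would be sent */
		printf("%d\n", kick_vcpu());		/* 0: already out, no IPI */
		return 0;
	}

The atomic_exchange() mirrors the atomic_xchg() in the hunk above: concurrent kickers race to clear the flag, so at most one of them issues the IPI.
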
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
77 70
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 23ea02253900..636fc381c897 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -524,6 +524,12 @@ struct kvm_enable_cap {
524#define KVM_CAP_PPC_OSI 52 524#define KVM_CAP_PPC_OSI 52
525#define KVM_CAP_PPC_UNSET_IRQ 53 525#define KVM_CAP_PPC_UNSET_IRQ 53
526#define KVM_CAP_ENABLE_CAP 54 526#define KVM_CAP_ENABLE_CAP 54
527#ifdef __KVM_HAVE_XSAVE
528#define KVM_CAP_XSAVE 55
529#endif
530#ifdef __KVM_HAVE_XCRS
531#define KVM_CAP_XCRS 56
532#endif
527 533
528#ifdef KVM_CAP_IRQ_ROUTING 534#ifdef KVM_CAP_IRQ_ROUTING
529 535
@@ -613,6 +619,7 @@ struct kvm_clock_data {
613 */ 619 */
614#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 620#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
615#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 621#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
622/* KVM_SET_MEMORY_ALIAS is obsolete: */
616#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 623#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
617#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) 624#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
618#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) 625#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
@@ -714,6 +721,12 @@ struct kvm_clock_data {
714#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) 721#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
715#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) 722#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
716#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) 723#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
724/* Available with KVM_CAP_XSAVE */
725#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
726#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
727/* Available with KVM_CAP_XCRS */
728#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
729#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
717 730
718#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 731#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
719 732
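
The four new ioctls let userspace save and restore the full XSAVE image and the guest XCRs. A hedged userspace sketch of probing KVM_CAP_XSAVE and fetching the image on a kernel that carries this series; error handling is trimmed, and the XSTATE_BV offset (byte 512 of the XSAVE area) follows the SDM layout rather than anything this patch defines:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm, vm, vcpu;
		struct kvm_xsave xs;
		unsigned long long xstate_bv;

		kvm = open("/dev/kvm", O_RDWR);
		if (kvm < 0 || ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE) <= 0) {
			fprintf(stderr, "KVM_CAP_XSAVE not available\n");
			return 1;
		}
		vm = ioctl(kvm, KVM_CREATE_VM, 0);
		vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
		if (ioctl(vcpu, KVM_GET_XSAVE, &xs) == 0) {
			/* XSTATE_BV sits at byte 512; region[] is __u32 */
			memcpy(&xstate_bv, &xs.region[128], sizeof(xstate_bv));
			printf("xstate_bv = %#llx\n", xstate_bv);
		}
		return 0;
	}
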
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7cb116afa1cd..c13cc48697aa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -81,13 +81,14 @@ struct kvm_vcpu {
81 int vcpu_id; 81 int vcpu_id;
82 struct mutex mutex; 82 struct mutex mutex;
83 int cpu; 83 int cpu;
84 atomic_t guest_mode;
84 struct kvm_run *run; 85 struct kvm_run *run;
85 unsigned long requests; 86 unsigned long requests;
86 unsigned long guest_debug; 87 unsigned long guest_debug;
87 int srcu_idx; 88 int srcu_idx;
88 89
89 int fpu_active; 90 int fpu_active;
90 int guest_fpu_loaded; 91 int guest_fpu_loaded, guest_xcr0_loaded;
91 wait_queue_head_t wq; 92 wait_queue_head_t wq;
92 int sigset_active; 93 int sigset_active;
93 sigset_t sigset; 94 sigset_t sigset;
@@ -123,6 +124,7 @@ struct kvm_memory_slot {
123 } *lpage_info[KVM_NR_PAGE_SIZES - 1]; 124 } *lpage_info[KVM_NR_PAGE_SIZES - 1];
124 unsigned long userspace_addr; 125 unsigned long userspace_addr;
125 int user_alloc; 126 int user_alloc;
127 int id;
126}; 128};
127 129
128static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) 130static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@ -266,6 +268,8 @@ extern pfn_t bad_pfn;
266 268
267int is_error_page(struct page *page); 269int is_error_page(struct page *page);
268int is_error_pfn(pfn_t pfn); 270int is_error_pfn(pfn_t pfn);
271int is_hwpoison_pfn(pfn_t pfn);
272int is_fault_pfn(pfn_t pfn);
269int kvm_is_error_hva(unsigned long addr); 273int kvm_is_error_hva(unsigned long addr);
270int kvm_set_memory_region(struct kvm *kvm, 274int kvm_set_memory_region(struct kvm *kvm,
271 struct kvm_userspace_memory_region *mem, 275 struct kvm_userspace_memory_region *mem,
@@ -284,8 +288,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
284 int user_alloc); 288 int user_alloc);
285void kvm_disable_largepages(void); 289void kvm_disable_largepages(void);
286void kvm_arch_flush_shadow(struct kvm *kvm); 290void kvm_arch_flush_shadow(struct kvm *kvm);
287gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
288gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
289 291
290struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 292struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
291unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); 293unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -445,7 +447,8 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
445 struct kvm_irq_mask_notifier *kimn); 447 struct kvm_irq_mask_notifier *kimn);
446void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 448void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
447 struct kvm_irq_mask_notifier *kimn); 449 struct kvm_irq_mask_notifier *kimn);
448void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); 450void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
451 bool mask);
449 452
450#ifdef __KVM_HAVE_IOAPIC 453#ifdef __KVM_HAVE_IOAPIC
451void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, 454void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
@@ -562,10 +565,6 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
562} 565}
563#endif 566#endif
564 567
565#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
566#define unalias_gfn_instantiation unalias_gfn
567#endif
568
569#ifdef CONFIG_HAVE_KVM_IRQCHIP 568#ifdef CONFIG_HAVE_KVM_IRQCHIP
570 569
571#define KVM_MAX_IRQ_ROUTES 1024 570#define KVM_MAX_IRQ_ROUTES 1024
@@ -628,5 +627,25 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
628 627
629#endif 628#endif
630 629
630static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
631{
632 set_bit(req, &vcpu->requests);
633}
634
635static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
636{
637 return test_and_set_bit(req, &vcpu->requests);
638}
639
640static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
641{
642 if (test_bit(req, &vcpu->requests)) {
643 clear_bit(req, &vcpu->requests);
644 return true;
645 } else {
646 return false;
647 }
648}
649
631#endif 650#endif
632 651
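
These helpers codify the open-coded set_bit()/test_and_clear_bit() pattern that the x86 hunks above convert over to. Their semantics boil down to a per-vcpu request bitmask with test-then-clear consumption; a self-contained C11 model, illustrative only:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum { REQ_TLB_FLUSH, REQ_MMU_RELOAD };

	struct vcpu { atomic_ulong requests; };

	static void make_request(int req, struct vcpu *v)	/* kvm_make_request() */
	{
		atomic_fetch_or(&v->requests, 1ul << req);
	}

	static bool check_request(int req, struct vcpu *v)	/* kvm_check_request() */
	{
		unsigned long bit = 1ul << req;

		if (atomic_load(&v->requests) & bit) {
			atomic_fetch_and(&v->requests, ~bit);
			return true;
		}
		return false;
	}

	int main(void)
	{
		struct vcpu v = { .requests = 0 };

		make_request(REQ_TLB_FLUSH, &v);
		printf("%d\n", check_request(REQ_TLB_FLUSH, &v));	/* 1 */
		printf("%d\n", check_request(REQ_TLB_FLUSH, &v));	/* 0: consumed */
		return 0;
	}

Note that kvm_check_request() deliberately avoids an atomic test-and-clear (unlike kvm_make_check_request()): only the vcpu thread ever consumes its own request bits, so the cheaper test-then-clear suffices.
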
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index fb46efbeabec..7ac0d4eee430 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -32,11 +32,11 @@
32 32
33typedef unsigned long gva_t; 33typedef unsigned long gva_t;
34typedef u64 gpa_t; 34typedef u64 gpa_t;
35typedef unsigned long gfn_t; 35typedef u64 gfn_t;
36 36
37typedef unsigned long hva_t; 37typedef unsigned long hva_t;
38typedef u64 hpa_t; 38typedef u64 hpa_t;
39typedef unsigned long hfn_t; 39typedef u64 hfn_t;
40 40
41typedef hfn_t pfn_t; 41typedef hfn_t pfn_t;
42 42
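
Widening gfn_t and hfn_t to u64 matters mostly on 32-bit hosts, where unsigned long is 32 bits and frame numbers above 2^32 would silently truncate. A two-line illustration of the hazard the typedef change removes:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t gpa = (1ull << 45) + 0xabcd000;	/* a 32 TiB guest-physical address */
		uint32_t gfn32 = (uint32_t)(gpa >> 12);		/* old 32-bit gfn_t: high bits lost */
		uint64_t gfn64 = gpa >> 12;			/* new u64 gfn_t keeps all bits */

		printf("gfn64 = %#llx\n", (unsigned long long)gfn64);
		printf("gfn32 = %#x (truncated)\n", gfn32);
		return 0;
	}
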
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2b48041b910..7a9ab7db1975 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,14 @@ extern int sysctl_memory_failure_recovery;
1465extern void shake_page(struct page *p, int access); 1465extern void shake_page(struct page *p, int access);
1466extern atomic_long_t mce_bad_pages; 1466extern atomic_long_t mce_bad_pages;
1467extern int soft_offline_page(struct page *page, int flags); 1467extern int soft_offline_page(struct page *page, int flags);
1468#ifdef CONFIG_MEMORY_FAILURE
1469int is_hwpoison_address(unsigned long addr);
1470#else
1471static inline int is_hwpoison_address(unsigned long addr)
1472{
1473 return 0;
1474}
1475#endif
1468 1476
1469extern void dump_page(struct page *page); 1477extern void dump_page(struct page *page);
1470 1478
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..6b44e52cacaa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
45#include <linux/page-isolation.h> 45#include <linux/page-isolation.h>
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h>
48#include "internal.h" 49#include "internal.h"
49 50
50int sysctl_memory_failure_early_kill __read_mostly = 0; 51int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,35 @@ done:
1296 /* keep elevated page count for bad page */ 1297 /* keep elevated page count for bad page */
1297 return ret; 1298 return ret;
1298} 1299}
1300
1301/*
1302 * The caller must hold current->mm->mmap_sem in read mode.
1303 */
1304int is_hwpoison_address(unsigned long addr)
1305{
1306 pgd_t *pgdp;
1307 pud_t pud, *pudp;
1308 pmd_t pmd, *pmdp;
1309 pte_t pte, *ptep;
1310 swp_entry_t entry;
1311
1312 pgdp = pgd_offset(current->mm, addr);
1313 if (!pgd_present(*pgdp))
1314 return 0;
1315 pudp = pud_offset(pgdp, addr);
1316 pud = *pudp;
1317 if (!pud_present(pud) || pud_large(pud))
1318 return 0;
1319 pmdp = pmd_offset(pudp, addr);
1320 pmd = *pmdp;
1321 if (!pmd_present(pmd) || pmd_large(pmd))
1322 return 0;
1323 ptep = pte_offset_map(pmdp, addr);
1324 pte = *ptep;
1325 pte_unmap(ptep);
1326 if (!is_swap_pte(pte))
1327 return 0;
1328 entry = pte_to_swp_entry(pte);
1329 return is_hwpoison_entry(entry);
1330}
1331EXPORT_SYMBOL_GPL(is_hwpoison_address);
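
is_hwpoison_address() walks the current process's page tables looking for a hwpoison swap entry at addr; as the comment above states, it is only safe with mmap_sem held for read. A sketch of the expected calling convention on the consumer side (the wrapper name is hypothetical):

	#include <linux/mm.h>
	#include <linux/sched.h>

	static int addr_is_poisoned(unsigned long addr)
	{
		int ret;

		down_read(&current->mm->mmap_sem);
		ret = is_hwpoison_address(addr);
		up_read(&current->mm->mmap_sem);
		return ret;
	}
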
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 4d10b1e047f4..7c98928b09d9 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Kernel-based Virtual Machine - device assignment support 2 * Kernel-based Virtual Machine - device assignment support
3 * 3 *
4 * Copyright (C) 2006-9 Red Hat, Inc 4 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See 6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory. 7 * the COPYING file in the top-level directory.
@@ -58,12 +58,10 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
58static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 58static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
59{ 59{
60 struct kvm_assigned_dev_kernel *assigned_dev; 60 struct kvm_assigned_dev_kernel *assigned_dev;
61 struct kvm *kvm;
62 int i; 61 int i;
63 62
64 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 63 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
65 interrupt_work); 64 interrupt_work);
66 kvm = assigned_dev->kvm;
67 65
68 spin_lock_irq(&assigned_dev->assigned_dev_lock); 66 spin_lock_irq(&assigned_dev->assigned_dev_lock);
69 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 67 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
@@ -448,9 +446,6 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
448 struct kvm_assigned_dev_kernel *match; 446 struct kvm_assigned_dev_kernel *match;
449 unsigned long host_irq_type, guest_irq_type; 447 unsigned long host_irq_type, guest_irq_type;
450 448
451 if (!capable(CAP_SYS_RAWIO))
452 return -EPERM;
453
454 if (!irqchip_in_kernel(kvm)) 449 if (!irqchip_in_kernel(kvm))
455 return r; 450 return r;
456 451
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 53850177163f..fc8487564d1f 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -2,6 +2,7 @@
2 * KVM coalesced MMIO 2 * KVM coalesced MMIO
3 * 3 *
4 * Copyright (c) 2008 Bull S.A.S. 4 * Copyright (c) 2008 Bull S.A.S.
5 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 6 *
6 * Author: Laurent Vivier <Laurent.Vivier@bull.net> 7 * Author: Laurent Vivier <Laurent.Vivier@bull.net>
7 * 8 *
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b81f0ebbaaad..66cf65b510b1 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -2,6 +2,7 @@
2 * kvm eventfd support - use eventfd objects to signal various KVM events 2 * kvm eventfd support - use eventfd objects to signal various KVM events
3 * 3 *
4 * Copyright 2009 Novell. All Rights Reserved. 4 * Copyright 2009 Novell. All Rights Reserved.
5 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
5 * 6 *
6 * Author: 7 * Author:
7 * Gregory Haskins <ghaskins@novell.com> 8 * Gregory Haskins <ghaskins@novell.com>
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 3500dee9cf2b..0b9df8303dcf 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 MandrakeSoft S.A. 2 * Copyright (C) 2001 MandrakeSoft S.A.
3 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
3 * 4 *
4 * MandrakeSoft S.A. 5 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir 6 * 43, rue d'Aboukir
@@ -151,7 +152,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
151 update_handled_vectors(ioapic); 152 update_handled_vectors(ioapic);
152 mask_after = e->fields.mask; 153 mask_after = e->fields.mask;
153 if (mask_before != mask_after) 154 if (mask_before != mask_after)
154 kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); 155 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
155 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 156 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
156 && ioapic->irr & (1 << index)) 157 && ioapic->irr & (1 << index))
157 ioapic_service(ioapic, index); 158 ioapic_service(ioapic, index);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 96048ee9e39e..62a9caf0563c 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -16,6 +16,8 @@
16 * 16 *
17 * Copyright (C) 2006-2008 Intel Corporation 17 * Copyright (C) 2006-2008 Intel Corporation
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
20 *
19 * Author: Allen M. Kay <allen.m.kay@intel.com> 21 * Author: Allen M. Kay <allen.m.kay@intel.com>
20 * Author: Weidong Han <weidong.han@intel.com> 22 * Author: Weidong Han <weidong.han@intel.com>
21 * Author: Ben-Ami Yassour <benami@il.ibm.com> 23 * Author: Ben-Ami Yassour <benami@il.ibm.com>
@@ -106,7 +108,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
106 get_order(page_size), flags); 108 get_order(page_size), flags);
107 if (r) { 109 if (r) {
108 printk(KERN_ERR "kvm_iommu_map_address:" 110 printk(KERN_ERR "kvm_iommu_map_address:"
109 "iommu failed to map pfn=%lx\n", pfn); 111 "iommu failed to map pfn=%llx\n", pfn);
110 goto unmap_pages; 112 goto unmap_pages;
111 } 113 }
112 114
@@ -124,9 +126,10 @@ unmap_pages:
124 126
125static int kvm_iommu_map_memslots(struct kvm *kvm) 127static int kvm_iommu_map_memslots(struct kvm *kvm)
126{ 128{
127 int i, r = 0; 129 int i, idx, r = 0;
128 struct kvm_memslots *slots; 130 struct kvm_memslots *slots;
129 131
132 idx = srcu_read_lock(&kvm->srcu);
130 slots = kvm_memslots(kvm); 133 slots = kvm_memslots(kvm);
131 134
132 for (i = 0; i < slots->nmemslots; i++) { 135 for (i = 0; i < slots->nmemslots; i++) {
@@ -134,6 +137,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
134 if (r) 137 if (r)
135 break; 138 break;
136 } 139 }
140 srcu_read_unlock(&kvm->srcu, idx);
137 141
138 return r; 142 return r;
139} 143}
@@ -283,15 +287,17 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
283 287
284static int kvm_iommu_unmap_memslots(struct kvm *kvm) 288static int kvm_iommu_unmap_memslots(struct kvm *kvm)
285{ 289{
286 int i; 290 int i, idx;
287 struct kvm_memslots *slots; 291 struct kvm_memslots *slots;
288 292
293 idx = srcu_read_lock(&kvm->srcu);
289 slots = kvm_memslots(kvm); 294 slots = kvm_memslots(kvm);
290 295
291 for (i = 0; i < slots->nmemslots; i++) { 296 for (i = 0; i < slots->nmemslots; i++) {
292 kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, 297 kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
293 slots->memslots[i].npages); 298 slots->memslots[i].npages);
294 } 299 }
300 srcu_read_unlock(&kvm->srcu, idx);
295 301
296 return 0; 302 return 0;
297} 303}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a0e88809e45e..369e38010ad5 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,6 +17,7 @@
17 * Authors: 17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com> 18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 * 19 *
 20 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
20 */ 21 */
21 22
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
@@ -99,7 +100,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
99 if (r < 0) 100 if (r < 0)
100 r = 0; 101 r = 0;
101 r += kvm_apic_set_irq(vcpu, irq); 102 r += kvm_apic_set_irq(vcpu, irq);
102 } else { 103 } else if (kvm_lapic_enabled(vcpu)) {
103 if (!lowest) 104 if (!lowest)
104 lowest = vcpu; 105 lowest = vcpu;
105 else if (kvm_apic_compare_prio(vcpu, lowest) < 0) 106 else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
@@ -278,15 +279,19 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
278 synchronize_rcu(); 279 synchronize_rcu();
279} 280}
280 281
281void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) 282void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
283 bool mask)
282{ 284{
283 struct kvm_irq_mask_notifier *kimn; 285 struct kvm_irq_mask_notifier *kimn;
284 struct hlist_node *n; 286 struct hlist_node *n;
287 int gsi;
285 288
286 rcu_read_lock(); 289 rcu_read_lock();
287 hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) 290 gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
288 if (kimn->irq == irq) 291 if (gsi != -1)
289 kimn->func(kimn, mask); 292 hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
293 if (kimn->irq == gsi)
294 kimn->func(kimn, mask);
290 rcu_read_unlock(); 295 rcu_read_unlock();
291} 296}
292 297
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f032806a212f..b78b794c1039 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -92,6 +93,12 @@ static bool kvm_rebooting;
92 93
93static bool largepages_enabled = true; 94static bool largepages_enabled = true;
94 95
96static struct page *hwpoison_page;
97static pfn_t hwpoison_pfn;
98
99static struct page *fault_page;
100static pfn_t fault_pfn;
101
95inline int kvm_is_mmio_pfn(pfn_t pfn) 102inline int kvm_is_mmio_pfn(pfn_t pfn)
96{ 103{
97 if (pfn_valid(pfn)) { 104 if (pfn_valid(pfn)) {
@@ -141,7 +148,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
141 raw_spin_lock(&kvm->requests_lock); 148 raw_spin_lock(&kvm->requests_lock);
142 me = smp_processor_id(); 149 me = smp_processor_id();
143 kvm_for_each_vcpu(i, vcpu, kvm) { 150 kvm_for_each_vcpu(i, vcpu, kvm) {
144 if (test_and_set_bit(req, &vcpu->requests)) 151 if (kvm_make_check_request(req, vcpu))
145 continue; 152 continue;
146 cpu = vcpu->cpu; 153 cpu = vcpu->cpu;
147 if (cpus != NULL && cpu != -1 && cpu != me) 154 if (cpus != NULL && cpu != -1 && cpu != me)
@@ -566,6 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
566 573
567 new = old = *memslot; 574 new = old = *memslot;
568 575
576 new.id = mem->slot;
569 new.base_gfn = base_gfn; 577 new.base_gfn = base_gfn;
570 new.npages = npages; 578 new.npages = npages;
571 new.flags = mem->flags; 579 new.flags = mem->flags;
@@ -596,7 +604,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
596 /* Allocate if a slot is being created */ 604 /* Allocate if a slot is being created */
597#ifndef CONFIG_S390 605#ifndef CONFIG_S390
598 if (npages && !new.rmap) { 606 if (npages && !new.rmap) {
599 new.rmap = vmalloc(npages * sizeof(struct page *)); 607 new.rmap = vmalloc(npages * sizeof(*new.rmap));
600 608
601 if (!new.rmap) 609 if (!new.rmap)
602 goto out_free; 610 goto out_free;
@@ -621,9 +629,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + (base_gfn + npages - 1) /
-			     KVM_PAGES_PER_HPAGE(level);
-		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
+		lpages = 1 + ((base_gfn + npages - 1)
+			      >> KVM_HPAGE_GFN_SHIFT(level));
+		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
 
 		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
 
@@ -633,9 +641,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.lpage_info[i], 0,
 		       lpages * sizeof(*new.lpage_info[i]));
 
-		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
+		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][0].write_count = 1;
-		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
+		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][lpages - 1].write_count = 1;
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
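These two hunks trade division and modulo for shift and mask, which is valid because KVM_PAGES_PER_HPAGE(level) is always a power of two (and KVM_HPAGE_GFN_SHIFT(level) is its log2). For any power of two n = 1 << s, x / n == x >> s and x % n == x & (n - 1). A self-contained check, using the x86 2M-hugepage values (512 small pages, shift 9) as an example:

	#include <assert.h>

	int main(void)
	{
		unsigned long gfn = 0x12345;
		unsigned long pages = 1UL << 9;	/* 512 x 4K pages per 2M hugepage */

		assert(gfn / pages == gfn >> 9);		/* divide == shift */
		assert(gfn % pages == (gfn & (pages - 1)));	/* modulo == mask  */
		return 0;
	}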
@@ -810,16 +818,28 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page;
+	return page == bad_page || page == hwpoison_page || page == fault_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
+int is_fault_pfn(pfn_t pfn)
+{
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
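bad_page grows two siblings so callers can distinguish a hardware-poisoned backing page from a plain mapping fault; the generic predicates keep working, and the new ones refine the diagnosis. A hedged consumer sketch (not the real x86 MMU path, which injects a machine check on hwpoison; the error values here are illustrative):

	static int classify_gfn(struct kvm *kvm, gfn_t gfn)
	{
		pfn_t pfn = gfn_to_pfn(kvm, gfn);
		int ret = 0;

		if (is_hwpoison_pfn(pfn))
			ret = -EIO;	/* hardware memory failure */
		else if (is_fault_pfn(pfn))
			ret = -EFAULT;	/* hole in a VM_PFNMAP mapping */
		else if (is_error_pfn(pfn))
			ret = -EINVAL;	/* generic bad_page */

		kvm_release_pfn_clean(pfn);	/* every path took a page ref */
		return ret;
	}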
@@ -831,7 +851,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
@@ -845,20 +865,13 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 	}
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	gfn = unalias_gfn(kvm, gfn);
-	return gfn_to_memslot_unaliased(kvm, gfn);
-}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 
@@ -903,7 +916,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 
-	gfn = unalias_gfn(kvm, gfn);
 	for (i = 0; i < slots->nmemslots; ++i) {
 		memslot = &slots->memslots[i];
 
@@ -924,8 +936,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
 	return gfn_to_hva_memslot(slot, gfn);
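With gfn aliasing gone, gfn_to_hva() is a plain memslot lookup followed by offset arithmetic. For reference, the helper it ends in reduces to one line; this is a sketch of the address math, assuming the definition elsewhere in this file at the time:

	static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
						gfn_t gfn)
	{
		/* hva = slot base hva + byte offset of gfn within the slot */
		return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
	}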
@@ -946,13 +957,19 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	down_read(&current->mm->mmap_sem);
+	if (is_hwpoison_address(addr)) {
+		up_read(&current->mm->mmap_sem);
+		get_page(hwpoison_page);
+		return page_to_pfn(hwpoison_page);
+	}
+
 	vma = find_vma(current->mm, addr);
 
 	if (vma == NULL || addr < vma->vm_start ||
 	    !(vma->vm_flags & VM_PFNMAP)) {
 		up_read(&current->mm->mmap_sem);
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
+		get_page(fault_page);
+		return page_to_pfn(fault_page);
 	}
 
 	pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1187,8 +1204,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
-	gfn = unalias_gfn(kvm, gfn);
-	memslot = gfn_to_memslot_unaliased(kvm, gfn);
+	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
@@ -1207,7 +1223,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_arch_vcpu_runnable(vcpu)) {
-			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_UNHALT, vcpu);
 			break;
 		}
 		if (kvm_cpu_has_pending_timer(vcpu))
@@ -1378,6 +1394,18 @@ static long kvm_vcpu_ioctl(struct file *filp,
 
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
+
+#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+	/*
+	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
+	 * so vcpu_load() would break it.
+	 */
+	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+#endif
+
+
+	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_RUN:
 		r = -EINVAL;
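vcpu_load() is hoisted out of the individual ioctl handlers to the top of the dispatcher, with the matching vcpu_put() moved to the shared out: label two hunks below; the S390/PPC interrupt-injection ioctls are issued from a different thread while the vcpu runs, so they must return before the vcpu mutex is taken. The resulting shape, as a sketch (ioctl_is_async() is a made-up name standing in for the #if/ioctl test above):

	if (ioctl_is_async(ioctl))		/* must not take the vcpu mutex */
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);

	vcpu_load(vcpu);	/* takes vcpu->mutex, loads arch state */
	switch (ioctl) {
		/* ... every handler now runs with the vcpu loaded ... */
	}
out:
	vcpu_put(vcpu);		/* mirrors vcpu_load() on all exits */
	return r;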
@@ -1520,7 +1548,7 @@ out_free2:
 				goto out;
 			p = &sigset;
 		}
-		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
 		break;
 	}
 	case KVM_GET_FPU: {
@@ -1555,6 +1583,7 @@ out_free2:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
 out:
+	vcpu_put(vcpu);
 	kfree(fpu);
 	kfree(kvm_sregs);
 	return r;
@@ -2197,6 +2226,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	bad_pfn = page_to_pfn(bad_page);
 
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
+	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (fault_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	fault_pfn = page_to_pfn(fault_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
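Both new allocations reuse the existing out_free_0 label, which works only because that label (next hunk) null-checks each pointer before freeing: jumping there after the first failure must not free the page that was never allocated. The idiom in miniature:

	static struct page *a, *b;	/* static, so NULL until allocated */

	static int setup(void)
	{
		a = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!a)
			goto fail;
		b = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!b)
			goto fail;
		return 0;
	fail:
		if (b)			/* may be NULL on partial setup */
			__free_page(b);
		if (a)
			__free_page(a);
		return -ENOMEM;
	}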
@@ -2269,6 +2316,10 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (fault_page)
+		__free_page(fault_page);
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
@@ -2290,6 +2341,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
+	__free_page(hwpoison_page);
 	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
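One asymmetry worth flagging: kvm_init() now allocates both hwpoison_page and fault_page, and its error path frees both, yet kvm_exit() as shown frees only hwpoison_page (and bad_page). Unless fault_page is released somewhere outside this diff, it leaks on module unload; the symmetric teardown would add one line:

	free_cpumask_var(cpus_hardware_enabled);
	__free_page(fault_page);	/* missing in the hunk above */
	__free_page(hwpoison_page);
	__free_page(bad_page);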