path: root/arch/x86
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-12-13 18:31:08 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-12-13 18:31:08 -0500
commit    66cdd0ceaf65a18996f561b770eedde1d123b019 (patch)
tree      4892eaa422d366fce5d1e866ff1fe0988af95569 /arch/x86
parent    896ea17d3da5f44b2625c9cda9874d7dfe447393 (diff)
parent    58b7825bc324da55415034a9f6ca5d716b8fd898 (diff)
Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
 "Considerable KVM/PPC work, x86 kvmclock vsyscall support,
  IA32_TSC_ADJUST MSR emulation, amongst others."

Fix up trivial conflict in kernel/sched/core.c due to cross-cpu
migration notifier added next to rq migration call-back.

* tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits)
  KVM: emulator: fix real mode segment checks in address linearization
  VMX: remove unneeded enable_unrestricted_guest check
  KVM: VMX: fix DPL during entry to protected mode
  x86/kexec: crash_vmclear_local_vmcss needs __rcu
  kvm: Fix irqfd resampler list walk
  KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump
  x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary
  KVM: MMU: optimize for set_spte
  KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface
  KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation
  KVM: PPC: bookehv: Add guest computation mode for irq delivery
  KVM: PPC: Make EPCR a valid field for booke64 and bookehv
  KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit
  KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation
  KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation
  KVM: PPC: e500: Add emulation helper for getting instruction ea
  KVM: PPC: bookehv64: Add support for interrupt handling
  KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler
  KVM: PPC: booke: Fix get_tb() compile error on 64-bit
  KVM: PPC: e500: Silence bogus GCC warning in tlb code
  ...
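For readers skimming the kvmclock vsyscall work in the diff below: the guest consumes a per-vCPU time record that the hypervisor republishes under an even/odd version counter (see __pvclock_read_cycles in the pvclock.h hunk). The following is only a simplified user-space sketch of that read protocol, not code from this merge; pv_time, pv_read_ns, scale_delta and the lfence-ordered rdtsc() are illustrative stand-ins for the kernel's pvclock_vcpu_time_info, __pvclock_read_cycles, pvclock_scale_delta and rdtsc_barrier()/__native_read_tsc().

#include <stdint.h>

struct pv_time {                        /* simplified pvclock_vcpu_time_info */
	volatile uint32_t version;      /* odd while the host is updating    */
	uint64_t tsc_timestamp;         /* TSC value at last host update     */
	uint64_t system_time;           /* nanoseconds at tsc_timestamp      */
	uint32_t tsc_to_system_mul;     /* fixed-point TSC->ns multiplier    */
	int8_t   tsc_shift;
	uint8_t  flags;
};

static inline uint64_t rdtsc_ordered(void)
{
	uint32_t lo, hi;
	/* lfence keeps the TSC read inside the version-checked window */
	__asm__ __volatile__("lfence; rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	/* same fixed-point scaling idea as pvclock_scale_delta */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

uint64_t pv_read_ns(const struct pv_time *t)
{
	uint32_t version;
	uint64_t ns;

	do {
		version = t->version;                   /* snapshot the counter */
		__asm__ __volatile__("" ::: "memory");  /* compiler barrier     */
		ns = t->system_time +
		     scale_delta(rdtsc_ordered() - t->tsc_timestamp,
				 t->tsc_to_system_mul, t->tsc_shift);
		__asm__ __volatile__("" ::: "memory");
	} while ((version & 1) || version != t->version); /* retry on torn read */

	return ns;
}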
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/clocksource.h |   1
-rw-r--r--  arch/x86/include/asm/cpufeature.h  |   1
-rw-r--r--  arch/x86/include/asm/fixmap.h      |   5
-rw-r--r--  arch/x86/include/asm/kexec.h       |   3
-rw-r--r--  arch/x86/include/asm/kvm_guest.h   |   6
-rw-r--r--  arch/x86/include/asm/kvm_host.h    |  24
-rw-r--r--  arch/x86/include/asm/msr-index.h   |   1
-rw-r--r--  arch/x86/include/asm/pvclock.h     |  47
-rw-r--r--  arch/x86/include/asm/vmx.h         |   3
-rw-r--r--  arch/x86/include/asm/vsyscall.h    |  20
-rw-r--r--  arch/x86/kernel/crash.c            |  32
-rw-r--r--  arch/x86/kernel/kvm.c              |  20
-rw-r--r--  arch/x86/kernel/kvmclock.c         |  88
-rw-r--r--  arch/x86/kernel/pvclock.c          | 143
-rw-r--r--  arch/x86/kvm/cpuid.c               |   3
-rw-r--r--  arch/x86/kvm/cpuid.h               |   8
-rw-r--r--  arch/x86/kvm/emulate.c             |   5
-rw-r--r--  arch/x86/kvm/lapic.c               |   2
-rw-r--r--  arch/x86/kvm/mmu.c                 |  65
-rw-r--r--  arch/x86/kvm/paging_tmpl.h         | 115
-rw-r--r--  arch/x86/kvm/svm.c                 |  48
-rw-r--r--  arch/x86/kvm/trace.h               |  63
-rw-r--r--  arch/x86/kvm/vmx.c                 | 203
-rw-r--r--  arch/x86/kvm/x86.c                 | 548
-rw-r--r--  arch/x86/kvm/x86.h                 |   2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c     |  81
-rw-r--r--  arch/x86/vdso/vgetcpu.c            |  11
27 files changed, 1209 insertions, 339 deletions
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
 
 struct arch_clocksource_data {
 	int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST	(9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
 #define X86_FEATURE_HLE		(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
+#include <asm/pvclock.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
 	VVAR_PAGE,
 	VSYSCALL_HPET,
 #endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+	PVCLOCK_FIXMAP_BEGIN,
+	PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
 };
 #endif
 
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_GUEST_H
+#define _ASM_X86_KVM_GUEST_H
+
+int kvm_setup_vsyscall_timeinfo(void);
+
+#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
 	s8 virtual_tsc_shift;
 	u32 virtual_tsc_mult;
 	u32 virtual_tsc_khz;
+	s64 ia32_tsc_adjust_msr;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
 	u64 cur_tsc_write;
 	u64 cur_tsc_offset;
 	u8  cur_tsc_generation;
+	int nr_vcpus_matched_tsc;
+
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+	bool host_initiated;
+	u32 index;
+	u64 data;
+};
+
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
 
 	void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
 	bool (*has_wbinvd_exit)(void);
 
 	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
+	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
-	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
+	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
+#define MSR_IA32_TSC_ADJUST             0x0000003b
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
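Background for the new MSR_IA32_TSC_ADJUST define: architecturally, the TSC a logical CPU reports is its raw counter plus IA32_TSC_ADJUST, so writing a new adjust value moves the visible TSC by the same delta. The snippet below is only a hedged model of that bookkeeping as a host might emulate it; the struct and function names are hypothetical and do not appear in this merge.

#include <stdint.h>

/* Hypothetical per-vCPU state: tsc_offset is what the host adds to the
 * hardware TSC for this guest; tsc_adjust mirrors the guest-visible MSR. */
struct vtsc_state {
	int64_t tsc_offset;
	int64_t tsc_adjust;
};

/* Model of WRMSR(IA32_TSC_ADJUST): shift the guest TSC by the change in
 * the adjust value, then remember the new MSR contents. */
static void wrmsr_tsc_adjust(struct vtsc_state *s, int64_t new_adjust)
{
	s->tsc_offset += new_adjust - s->tsc_adjust;
	s->tsc_adjust  = new_adjust;
}

/* Guest-visible TSC for a given hardware counter reading. */
static uint64_t guest_tsc(const struct vtsc_state *s, uint64_t host_tsc)
{
	return host_tsc + (uint64_t)s->tsc_offset;
}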
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
 void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 	return product;
 }
 
+static __always_inline
+u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+{
+	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
+				   src->tsc_shift);
+}
+
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			       cycle_t *cycles, u8 *flags)
+{
+	unsigned version;
+	cycle_t ret, offset;
+	u8 ret_flags;
+
+	version = src->version;
+	/* Note: emulated platforms which do not advertise SSE2 support
+	 * result in kvmclock not using the necessary RDTSC barriers.
+	 * Without barriers, it is possible that RDTSC instruction reads from
+	 * the time stamp counter outside rdtsc_barrier protected section
+	 * below, resulting in violation of monotonicity.
+	 */
+	rdtsc_barrier();
+	offset = pvclock_get_nsec_offset(src);
+	ret = src->system_time + offset;
+	ret_flags = src->flags;
+	rdtsc_barrier();
+
+	*cycles = ret;
+	*flags = ret_flags;
+	return version;
+}
+
+struct pvclock_vsyscall_time_info {
+	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
 #endif /* _ASM_X86_PVCLOCK_H */
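A quick worked check of the sizing macros just added above (illustrative only: the 64-byte PVTI_SIZE and NR_CPUS=256 are assumed values, since the real ones depend on SMP_CACHE_BYTES and the kernel config): 4096/64 = 64 pvti entries fit in a page, so 256 CPUs need ((256-1)/64)+1 = 4 fixmap pages.

#include <stdio.h>

#define PAGE_SIZE 4096
#define PVTI_SIZE 64	/* assumed sizeof(struct pvclock_vsyscall_time_info) */
#define NR_CPUS   256	/* assumed config value */
#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS - 1) / (PAGE_SIZE / PVTI_SIZE)) + 1)

int main(void)
{
	/* prints: 64 pvti entries per page, 4 fixmap pages for 256 CPUs */
	printf("%d pvti entries per page, %d fixmap pages for %d CPUs\n",
	       PAGE_SIZE / PVTI_SIZE, PVCLOCK_VSYSCALL_NR_PAGES, NR_CPUS);
	return 0;
}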
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
 #define VMX_EPT_AD_BIT				(1ull << 21)
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
  */
 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
 
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+	unsigned int p;
+
+	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		native_read_tscp(&p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+
+	return p;
+}
+#endif /* CONFIG_X86_64 */
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/module.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -30,6 +31,27 @@
 
 int in_crash_kexec;
 
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+	crash_vmclear_fn *do_vmclear_operation = NULL;
+
+	rcu_read_lock();
+	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+	if (do_vmclear_operation)
+		do_vmclear_operation();
+	rcu_read_unlock();
+}
+
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #endif
 	crash_save_cpu(regs, cpu);
 
+	/*
+	 * VMCLEAR VMCSs loaded on all cpus if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Disable VMX or SVM if needed.
 	 *
 	 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	kdump_nmi_shootdown_cpus();
 
+	/*
+	 * VMCLEAR VMCSs loaded on this cpu if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Booting kdump kernel with VMX or SVM enabled won't work,
 	 * because (among other limitations) we can't disable paging
 	 * with the virt flags.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
 
 static int kvmapf = 1;
 
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+	kvmclock_vsyscall = 0;
+	return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
 	struct kvm_task_sleep_node n, *e;
 	DEFINE_WAIT(wait);
-	int cpu, idle;
-
-	cpu = get_cpu();
-	idle = idle_cpu(cpu);
-	put_cpu();
 
 	spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = idle || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1;
 	init_waitqueue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	if (kvmclock_vsyscall)
+		kvm_setup_vsyscall_timeinfo();
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
 #include <asm/apic.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/memblock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
 /*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
 	struct pvclock_vcpu_time_info *vcpu_time;
 	struct timespec ts;
 	int low, high;
+	int cpu;
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
 
 	native_write_msr(msr_kvm_wall_clock, low, high);
 
-	vcpu_time = &get_cpu_var(hv_clock);
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
-	put_cpu_var(hv_clock);
+
+	preempt_enable();
 
 	return ts.tv_sec;
 }
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
+	int cpu;
 
 	preempt_disable_notrace();
-	src = &__get_cpu_var(hv_clock);
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
 	ret = pvclock_clocksource_read(src);
 	preempt_enable_notrace();
 	return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 static unsigned long kvm_get_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *src;
-	src = &per_cpu(hv_clock, 0);
-	return pvclock_tsc_khz(src);
+	int cpu;
+	unsigned long tsc_khz;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	tsc_khz = pvclock_tsc_khz(src);
+	preempt_enable();
+	return tsc_khz;
 }
 
 static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
 {
 	bool ret = false;
 	struct pvclock_vcpu_time_info *src;
+	int cpu = smp_processor_id();
 
-	src = &__get_cpu_var(hv_clock);
+	if (!hv_clock)
+		return ret;
+
+	src = &hv_clock[cpu].pvti;
 	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
-		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		ret = true;
 	}
 
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
+	struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
 
-	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
-	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	low = (int)__pa(src) | 1;
+	high = ((u64)__pa(src) >> 32);
 	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+	unsigned long mem;
+
 	if (!kvm_para_available())
 		return;
 
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
 	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
-	if (kvm_register_clock("boot clock"))
+	mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
+			     PAGE_SIZE);
+	if (!mem)
+		return;
+	hv_clock = __va(mem);
+
+	if (kvm_register_clock("boot clock")) {
+		hv_clock = NULL;
+		memblock_free(mem,
+			sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 		return;
+	}
 	pv_time_ops.sched_clock = kvm_clock_read;
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+	int cpu;
+	int ret;
+	u8 flags;
+	struct pvclock_vcpu_time_info *vcpu_time;
+	unsigned int size;
+
+	size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+		preempt_enable();
+		return 1;
+	}
+
+	if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+		preempt_enable();
+		return ret;
+	}
+
+	preempt_enable();
+
+	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+	return 0;
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
 
 #include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
 #include <asm/pvclock.h>
 
-/*
- * These are perodically updated
- *    xen: magic shared_info page
- *    kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-	u32 tsc_to_nsec_mul;
-	int tsc_shift;
-	u32 version;
-	u8  flags;
-};
-
 static u8 valid_flags __read_mostly = 0;
 
 void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
 	valid_flags = flags;
 }
 
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
-	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
-				   shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
-					struct pvclock_vcpu_time_info *src)
-{
-	do {
-		dst->version = src->version;
-		rmb();		/* fetch version before data */
-		dst->tsc_timestamp     = src->tsc_timestamp;
-		dst->system_timestamp  = src->system_time;
-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-		dst->tsc_shift         = src->tsc_shift;
-		dst->flags             = src->flags;
-		rmb();		/* test version after fetching data */
-	} while ((src->version & 1) || (dst->version != src->version));
-
-	return dst->version;
-}
-
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
 	u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
 	atomic64_set(&last_value, 0);
 }
 
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	return flags & valid_flags;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
-	struct pvclock_shadow_time shadow;
 	unsigned version;
-	cycle_t ret, offset;
+	cycle_t ret;
 	u64 last;
+	u8 flags;
 
 	do {
-		version = pvclock_get_time_values(&shadow, src);
-		barrier();
-		offset = pvclock_get_nsec_offset(&shadow);
-		ret = shadow.system_timestamp + offset;
-		barrier();
-	} while (version != src->version);
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
 
 	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
-		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		(flags & PVCLOCK_TSC_STABLE_BIT))
 		return ret;
 
 	/*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
+
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
+#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+			void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
+/*
+ * Initialize the generic pvclock vsyscall state.  This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size)
+{
+	int idx;
+
+	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+	pvclock_vdso_info = i;
+
+	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+			     __pa_symbol(i) + (idx*PAGE_SIZE),
+			     PAGE_KERNEL_VVAR);
+	}
+
+	register_task_migration_notifier(&pvclock_migrate);
+
+	return 0;
+}
+#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	if (index == 0) {
 		entry->ebx &= kvm_supported_word9_x86_features;
 		cpuid_mask(&entry->ebx, 9);
+		// TSC_ADJUST is emulated
+		entry->ebx |= F(TSC_ADJUST);
 	} else
 		entry->ebx = 0;
 	entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
 }
+EXPORT_SYMBOL_GPL(kvm_cpuid);
 
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 						addr.seg);
 	if (!usable)
 		goto bad;
-	/* code segment or read-only data segment */
-	if (((desc.type & 8) || !(desc.type & 2)) && write)
+	/* code segment in protected mode or read-only data segment */
+	if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+	     || !(desc.type & 2)) && write)
 		goto bad;
 	/* unreadable code segment */
 	if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	    || (!vcpu->arch.mmu.direct_map && write_fault
 		&& !is_write_protection(vcpu) && !user_fault)) {
 
+		/*
+		 * There are two cases:
+		 * - the one is other vcpu creates new sp in the window
+		 *   between mapping_level() and acquiring mmu-lock.
+		 * - the another case is the new sp is created by itself
+		 *   (page-fault path) when guest uses the target gfn as
+		 *   its page table.
+		 * Both of these cases can be fixed by allowing guest to
+		 *   retry the access, it will refault, then we can establish
+		 *   the mapping by using small page.
+		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
-			ret = 1;
-			drop_spte(vcpu->kvm, sptep);
+		    has_wrprotected_page(vcpu->kvm, gfn, level))
 			goto done;
-		}
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!is_present_gpte(gpte))
+		goto no_present;
+
+	if (!(gpte & PT_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
-	return unlikely(is_invalid_pfn(pfn));
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 				pfn_t pfn, unsigned access, int *ret_val)
 {
 	bool ret = true;
 
 	/* The pfn is invalid, report the error! */
-	if (unlikely(is_invalid_pfn(pfn))) {
+	if (unlikely(is_error_pfn(pfn))) {
 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
 		goto exit;
 	}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 {
 	unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6fe..891eb6d93b8b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 					addr, access);
 }
 
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp, u64 *spte,
-				    pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
 {
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
-
-	if (!is_present_gpte(gpte))
-		goto no_present;
-
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
-
-	return false;
-
-no_present:
-	drop_spte(vcpu->kvm, spte);
-	return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			      u64 *spte, const void *pte)
-{
-	pt_element_t gpte;
 	unsigned pte_access;
+	gfn_t gfn;
 	pfn_t pfn;
 
-	gpte = *(const pt_element_t *)pte;
-	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-		return;
+	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+		return false;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & gpte_access(vcpu, gpte);
 	protect_clean_gpte(&pte_access, gpte);
-	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (mmu_invalid_pfn(pfn))
-		return;
+	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+			no_dirty_log && (pte_access & ACC_WRITE_MASK));
+	if (is_error_pfn(pfn))
+		return false;
 
 	/*
-	 * we call mmu_set_spte() with host_writable = true because that
-	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+	 * we call mmu_set_spte() with host_writable = true because
+	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     NULL, PT_PAGE_TABLE_LEVEL,
-		     gpte_to_gfn(gpte), pfn, true, true);
+		     NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+
+	return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			      u64 *spte, const void *pte)
+{
+	pt_element_t gpte = *(const pt_element_t *)pte;
+
+	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
 }
 
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		pt_element_t gpte;
-		unsigned pte_access;
-		gfn_t gfn;
-		pfn_t pfn;
-
 		if (spte == sptep)
 			continue;
 
 		if (is_shadow_present_pte(*spte))
 			continue;
 
-		gpte = gptep[i];
-
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-			continue;
-
-		pte_access = sp->role.access & gpte_access(vcpu, gpte);
-		protect_clean_gpte(&pte_access, gpte);
-		gfn = gpte_to_gfn(gpte);
-		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-				      pte_access & ACC_WRITE_MASK);
-		if (mmu_invalid_pfn(pfn))
+		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
 			break;
-
-		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
-			     pfn, true, true);
 	}
 }
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *emulate, pfn_t pfn, bool map_writable,
-			 bool prefault)
+			 pfn_t pfn, bool map_writable, bool prefault)
 {
-	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	int top_level;
-	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
+	unsigned direct_access, access = gw->pt_access;
+	int top_level, emulate = 0;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
-		return NULL;
+		return 0;
 
 	direct_access = gw->pte_access;
 
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	clear_sp_write_flooding_count(it.sptep);
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-		     user_fault, write_fault, emulate, it.level,
+		     user_fault, write_fault, &emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
-	return it.sptep;
+	return emulate;
 
 out_gpte_changed:
 	if (sp)
 		kvm_mmu_put_page(sp, it.sptep);
 	kvm_release_pfn_clean(pfn);
-	return NULL;
+	return 0;
 }
 
 /*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
-	u64 *sptep;
-	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
-	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &emulate, pfn, map_writable, prefault);
-	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
-		 sptep, *sptep, emulate);
-
+	r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+			 level, pfn, map_writable, prefault);
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return emulate;
+	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899ef..d29d3cd1c156 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h" 22#include "x86.h"
23#include "cpuid.h"
23 24
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mod_devicetable.h> 26#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
630 return -EBUSY; 631 return -EBUSY;
631 632
632 if (!has_svm()) { 633 if (!has_svm()) {
633 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 634 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
634 me);
635 return -EINVAL; 635 return -EINVAL;
636 } 636 }
637 sd = per_cpu(svm_data, me); 637 sd = per_cpu(svm_data, me);
638
639 if (!sd) { 638 if (!sd) {
640 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", 639 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
641 me);
642 return -EINVAL; 640 return -EINVAL;
643 } 641 }
644 642
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1012 svm->tsc_ratio = ratio; 1010 svm->tsc_ratio = ratio;
1013} 1011}
1014 1012
1013static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1014{
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 return svm->vmcb->control.tsc_offset;
1018}
1019
1015static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1020static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1016{ 1021{
1017 struct vcpu_svm *svm = to_svm(vcpu); 1022 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
1189static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1190{ 1195{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1196 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy;
1198 u32 eax = 1;
1192 1199
1193 init_vmcb(svm); 1200 init_vmcb(svm);
1194 1201
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1197 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1198 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1199 } 1206 }
1200 vcpu->arch.regs_avail = ~0; 1207
1201 vcpu->arch.regs_dirty = ~0; 1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1202 1210
1203 return 0; 1211 return 0;
1204} 1212}
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1254 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1262 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1255 svm->asid_generation = 0; 1263 svm->asid_generation = 0;
1256 init_vmcb(svm); 1264 init_vmcb(svm);
1257 kvm_write_tsc(&svm->vcpu, 0);
1258
1259 err = fx_init(&svm->vcpu);
1260 if (err)
1261 goto free_page4;
1262 1265
1263 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1266 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1267 if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 1271
1269 return &svm->vcpu; 1272 return &svm->vcpu;
1270 1273
1271free_page4:
1272 __free_page(hsave_page);
1273free_page3: 1274free_page3:
1274 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 1275 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1275free_page2: 1276free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3008 return 0; 3009 return 0;
3009} 3010}
3010 3011
3011u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) 3012u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3012{ 3013{
3013 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3014 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3014 return vmcb->control.tsc_offset + 3015 return vmcb->control.tsc_offset +
3015 svm_scale_tsc(vcpu, native_read_tsc()); 3016 svm_scale_tsc(vcpu, host_tsc);
3016} 3017}
3017 3018
3018static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 3019static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3131 return 0; 3132 return 0;
3132} 3133}
3133 3134
3134static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 3135static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3135{ 3136{
3136 struct vcpu_svm *svm = to_svm(vcpu); 3137 struct vcpu_svm *svm = to_svm(vcpu);
3137 3138
3139 u32 ecx = msr->index;
3140 u64 data = msr->data;
3138 switch (ecx) { 3141 switch (ecx) {
3139 case MSR_IA32_TSC: 3142 case MSR_IA32_TSC:
3140 kvm_write_tsc(vcpu, data); 3143 kvm_write_tsc(vcpu, msr);
3141 break; 3144 break;
3142 case MSR_STAR: 3145 case MSR_STAR:
3143 svm->vmcb->save.star = data; 3146 svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3192 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3195 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3193 break; 3196 break;
3194 default: 3197 default:
3195 return kvm_set_msr_common(vcpu, ecx, data); 3198 return kvm_set_msr_common(vcpu, msr);
3196 } 3199 }
3197 return 0; 3200 return 0;
3198} 3201}
3199 3202
3200static int wrmsr_interception(struct vcpu_svm *svm) 3203static int wrmsr_interception(struct vcpu_svm *svm)
3201{ 3204{
3205 struct msr_data msr;
3202 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3206 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3203 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3207 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
3204 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3208 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3205 3209
3210 msr.data = data;
3211 msr.index = ecx;
3212 msr.host_initiated = false;
3206 3213
3207 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3214 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3208 if (svm_set_msr(&svm->vcpu, ecx, data)) { 3215 if (svm_set_msr(&svm->vcpu, &msr)) {
3209 trace_kvm_msr_write_ex(ecx, data); 3216 trace_kvm_msr_write_ex(ecx, data);
3210 kvm_inject_gp(&svm->vcpu, 0); 3217 kvm_inject_gp(&svm->vcpu, 0);
3211 } else { 3218 } else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4302 .has_wbinvd_exit = svm_has_wbinvd_exit, 4309 .has_wbinvd_exit = svm_has_wbinvd_exit,
4303 4310
4304 .set_tsc_khz = svm_set_tsc_khz, 4311 .set_tsc_khz = svm_set_tsc_khz,
4312 .read_tsc_offset = svm_read_tsc_offset,
4305 .write_tsc_offset = svm_write_tsc_offset, 4313 .write_tsc_offset = svm_write_tsc_offset,
4306 .adjust_tsc_offset = svm_adjust_tsc_offset, 4314 .adjust_tsc_offset = svm_adjust_tsc_offset,
4307 .compute_tsc_offset = svm_compute_tsc_offset, 4315 .compute_tsc_offset = svm_compute_tsc_offset,
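
The svm.c hunks above (and the matching vmx.c/x86.c hunks further down) replace the bare (index, data) pair in the set_msr path with a struct msr_data carrier, so a host-initiated write (state restore) can be told apart from a guest WRMSR. A minimal standalone sketch of that packaging, illustrative only — the field names follow the diff, everything else (the demo handler, the sample operands) is made up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Carrier mirroring the struct msr_data introduced by this series;
 * demo_set_msr() is a stand-in for svm_set_msr()/vmx_set_msr(). */
struct msr_data {
        bool     host_initiated;
        uint32_t index;
        uint64_t data;
};

static int demo_set_msr(const struct msr_data *msr)
{
        /* The extra flag is the point of the change: a host-initiated
         * write can be treated differently from a guest WRMSR. */
        printf("MSR 0x%x <- 0x%llx (%s)\n", (unsigned)msr->index,
               (unsigned long long)msr->data,
               msr->host_initiated ? "host" : "guest");
        return 0;
}

int main(void)
{
        uint32_t eax = 0x1, edx = 0x2, ecx = 0x10;  /* sample WRMSR operands */
        struct msr_data msr = {
                .host_initiated = false,            /* guest-originated */
                .index = ecx,
                .data  = ((uint64_t)edx << 32) | eax,
        };
        return demo_set_msr(&msr);
}
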
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..fe5e00ed7036 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h> 5#include <asm/vmx.h>
6#include <asm/svm.h> 6#include <asm/svm.h>
7#include <asm/clocksource.h>
7 8
8#undef TRACE_SYSTEM 9#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvm 10#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
754 __entry->write ? "Write" : "Read", 755 __entry->write ? "Write" : "Read",
755 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
756); 757);
758
759#ifdef CONFIG_X86_64
760
761#define host_clocks \
762 {VCLOCK_NONE, "none"}, \
763 {VCLOCK_TSC, "tsc"}, \
764 {VCLOCK_HPET, "hpet"} \
765
766TRACE_EVENT(kvm_update_master_clock,
767 TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
768 TP_ARGS(use_master_clock, host_clock, offset_matched),
769
770 TP_STRUCT__entry(
771 __field( bool, use_master_clock )
772 __field( unsigned int, host_clock )
773 __field( bool, offset_matched )
774 ),
775
776 TP_fast_assign(
777 __entry->use_master_clock = use_master_clock;
778 __entry->host_clock = host_clock;
779 __entry->offset_matched = offset_matched;
780 ),
781
782 TP_printk("masterclock %d hostclock %s offsetmatched %u",
783 __entry->use_master_clock,
784 __print_symbolic(__entry->host_clock, host_clocks),
785 __entry->offset_matched)
786);
787
788TRACE_EVENT(kvm_track_tsc,
789 TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
790 unsigned int online_vcpus, bool use_master_clock,
791 unsigned int host_clock),
792 TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
793 host_clock),
794
795 TP_STRUCT__entry(
796 __field( unsigned int, vcpu_id )
797 __field( unsigned int, nr_vcpus_matched_tsc )
798 __field( unsigned int, online_vcpus )
799 __field( bool, use_master_clock )
800 __field( unsigned int, host_clock )
801 ),
802
803 TP_fast_assign(
804 __entry->vcpu_id = vcpu_id;
805 __entry->nr_vcpus_matched_tsc = nr_matched;
806 __entry->online_vcpus = online_vcpus;
807 __entry->use_master_clock = use_master_clock;
808 __entry->host_clock = host_clock;
809 ),
810
811 TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
812 " hostclock %s",
813 __entry->vcpu_id, __entry->use_master_clock,
814 __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
815 __print_symbolic(__entry->host_clock, host_clocks))
816);
817
818#endif /* CONFIG_X86_64 */
819
757#endif /* _TRACE_KVM_H */ 820#endif /* _TRACE_KVM_H */
758 821
759#undef TRACE_INCLUDE_PATH 822#undef TRACE_INCLUDE_PATH
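
The two events added above are emitted from arch/x86/kvm/x86.c further down in this patch (pvclock_update_vm_gtod_copy() and kvm_track_tsc_matching()), with __print_symbolic() rendering the numeric clocksource mode through the host_clocks table. A standalone sketch of what the kvm_update_master_clock line ends up looking like — the helper and the sample values below are invented, and the VCLOCK_* numbering is an assumption about the host headers:

#include <stdio.h>

/* Illustrative only: reproduces the TP_printk() format of the
 * kvm_update_master_clock event above with made-up values. */
static const char *host_clock_name(unsigned int mode)
{
        switch (mode) {
        case 0:  return "none";  /* VCLOCK_NONE */
        case 1:  return "tsc";   /* VCLOCK_TSC  */
        case 2:  return "hpet";  /* VCLOCK_HPET */
        default: return "?";
        }
}

int main(void)
{
        int use_master_clock = 1;
        unsigned int host_clock = 1, offset_matched = 1;  /* VCLOCK_TSC */

        printf("masterclock %d hostclock %s offsetmatched %u\n",
               use_master_clock, host_clock_name(host_clock),
               offset_matched);
        return 0;
}
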
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
42#include <asm/i387.h> 42#include <asm/i387.h>
43#include <asm/xcr.h> 43#include <asm/xcr.h>
44#include <asm/perf_event.h> 44#include <asm/perf_event.h>
45#include <asm/kexec.h>
45 46
46#include "trace.h" 47#include "trace.h"
47 48
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
802 return vmx_capability.ept & VMX_EPT_AD_BIT; 803 return vmx_capability.ept & VMX_EPT_AD_BIT;
803} 804}
804 805
805static inline bool cpu_has_vmx_invept_individual_addr(void)
806{
807 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
808}
809
810static inline bool cpu_has_vmx_invept_context(void) 806static inline bool cpu_has_vmx_invept_context(void)
811{ 807{
812 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 808 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
992 vmcs, phys_addr); 988 vmcs, phys_addr);
993} 989}
994 990
991#ifdef CONFIG_KEXEC
992/*
993 * This bitmap is used to indicate whether the vmclear
994 * operation is enabled on all cpus. All disabled by
995 * default.
996 */
997static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
998
999static inline void crash_enable_local_vmclear(int cpu)
1000{
1001 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1002}
1003
1004static inline void crash_disable_local_vmclear(int cpu)
1005{
1006 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1007}
1008
1009static inline int crash_local_vmclear_enabled(int cpu)
1010{
1011 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1012}
1013
1014static void crash_vmclear_local_loaded_vmcss(void)
1015{
1016 int cpu = raw_smp_processor_id();
1017 struct loaded_vmcs *v;
1018
1019 if (!crash_local_vmclear_enabled(cpu))
1020 return;
1021
1022 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1023 loaded_vmcss_on_cpu_link)
1024 vmcs_clear(v->vmcs);
1025}
1026#else
1027static inline void crash_enable_local_vmclear(int cpu) { }
1028static inline void crash_disable_local_vmclear(int cpu) { }
1029#endif /* CONFIG_KEXEC */
1030
995static void __loaded_vmcs_clear(void *arg) 1031static void __loaded_vmcs_clear(void *arg)
996{ 1032{
997 struct loaded_vmcs *loaded_vmcs = arg; 1033 struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
1001 return; /* vcpu migration can race with cpu offline */ 1037 return; /* vcpu migration can race with cpu offline */
1002 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1038 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1003 per_cpu(current_vmcs, cpu) = NULL; 1039 per_cpu(current_vmcs, cpu) = NULL;
1040 crash_disable_local_vmclear(cpu);
1004 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1041 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1042
1043 /*
 1044 * Ensure the update of loaded_vmcs->loaded_vmcss_on_cpu_link is
 1045 * ordered before setting loaded_vmcs->cpu to -1, which is done in
 1046 * loaded_vmcs_init. Otherwise another cpu could see cpu == -1 first
 1047 * and then add the vmcs to its percpu list before it is deleted.
1048 */
1049 smp_wmb();
1050
1005 loaded_vmcs_init(loaded_vmcs); 1051 loaded_vmcs_init(loaded_vmcs);
1052 crash_enable_local_vmclear(cpu);
1006} 1053}
1007 1054
1008static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1055static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1009{ 1056{
1010 if (loaded_vmcs->cpu != -1) 1057 int cpu = loaded_vmcs->cpu;
1011 smp_call_function_single( 1058
1012 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); 1059 if (cpu != -1)
1060 smp_call_function_single(cpu,
1061 __loaded_vmcs_clear, loaded_vmcs, 1);
1013} 1062}
1014 1063
1015static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1064static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
1051 } 1100 }
1052} 1101}
1053 1102
1054static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
1055{
1056 if (enable_ept) {
1057 if (cpu_has_vmx_invept_individual_addr())
1058 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
1059 eptp, gpa);
1060 else
1061 ept_sync_context(eptp);
1062 }
1063}
1064
1065static __always_inline unsigned long vmcs_readl(unsigned long field) 1103static __always_inline unsigned long vmcs_readl(unsigned long field)
1066{ 1104{
1067 unsigned long value; 1105 unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1535 1573
1536 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1574 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1537 local_irq_disable(); 1575 local_irq_disable();
1576 crash_disable_local_vmclear(cpu);
1577
1578 /*
 1579 * The read of loaded_vmcs->cpu must be ordered before fetching
 1580 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1581 * See the comments in __loaded_vmcs_clear().
1582 */
1583 smp_rmb();
1584
1538 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1585 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1539 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1586 &per_cpu(loaded_vmcss_on_cpu, cpu));
1587 crash_enable_local_vmclear(cpu);
1540 local_irq_enable(); 1588 local_irq_enable();
1541 1589
1542 /* 1590 /*
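
The smp_wmb() added in __loaded_vmcs_clear() and the smp_rmb() added here form a publish/consume pair: the clearing cpu unlinks the VMCS from its per-cpu list before publishing cpu = -1, and the loading cpu reads cpu before touching the list, so it cannot re-add an entry that still sits on another cpu's list. A standalone sketch of that shape (illustrative and single-threaded; __sync_synchronize() stands in for the weaker, direction-specific kernel barriers):

#include <stdio.h>

static int on_list  = 1;        /* "vmcs sits on the old cpu's list"    */
static int vmcs_cpu = 3;        /* role of loaded_vmcs->cpu             */

static void clear_side(void)    /* __loaded_vmcs_clear() on the old cpu */
{
        on_list = 0;            /* list_del()                           */
        __sync_synchronize();   /* smp_wmb(): unlink first ...          */
        vmcs_cpu = -1;          /* ... then publish cpu = -1            */
}

static void load_side(void)     /* vmx_vcpu_load() on the new cpu       */
{
        int c = vmcs_cpu;       /* read loaded_vmcs->cpu                */
        __sync_synchronize();   /* smp_rmb(): read cpu before the list  */
        if (c == -1 && on_list)
                printf("broken: entry still on the old cpu's list\n");
        else
                printf("safe to list_add() on this cpu\n");
}

int main(void)
{
        clear_side();
        load_side();
        return 0;
}
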
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
1839 * Like guest_read_tsc, but always returns L1's notion of the timestamp 1887 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1840 * counter, even if a nested guest (L2) is currently running. 1888 * counter, even if a nested guest (L2) is currently running.
1841 */ 1889 */
1842u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) 1890u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1843{ 1891{
1844 u64 host_tsc, tsc_offset; 1892 u64 tsc_offset;
1845 1893
1846 rdtscll(host_tsc);
1847 tsc_offset = is_guest_mode(vcpu) ? 1894 tsc_offset = is_guest_mode(vcpu) ?
1848 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 1895 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1849 vmcs_read64(TSC_OFFSET); 1896 vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1866 WARN(1, "user requested TSC rate below hardware speed\n"); 1913 WARN(1, "user requested TSC rate below hardware speed\n");
1867} 1914}
1868 1915
1916static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
1917{
1918 return vmcs_read64(TSC_OFFSET);
1919}
1920
1869/* 1921/*
1870 * writes 'offset' into guest's timestamp counter offset register 1922 * writes 'offset' into guest's timestamp counter offset register
1871 */ 1923 */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2202 * Returns 0 on success, non-0 otherwise. 2254 * Returns 0 on success, non-0 otherwise.
2203 * Assumes vcpu_load() was already called. 2255 * Assumes vcpu_load() was already called.
2204 */ 2256 */
2205static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2257static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2206{ 2258{
2207 struct vcpu_vmx *vmx = to_vmx(vcpu); 2259 struct vcpu_vmx *vmx = to_vmx(vcpu);
2208 struct shared_msr_entry *msr; 2260 struct shared_msr_entry *msr;
2209 int ret = 0; 2261 int ret = 0;
2262 u32 msr_index = msr_info->index;
2263 u64 data = msr_info->data;
2210 2264
2211 switch (msr_index) { 2265 switch (msr_index) {
2212 case MSR_EFER: 2266 case MSR_EFER:
2213 ret = kvm_set_msr_common(vcpu, msr_index, data); 2267 ret = kvm_set_msr_common(vcpu, msr_info);
2214 break; 2268 break;
2215#ifdef CONFIG_X86_64 2269#ifdef CONFIG_X86_64
2216 case MSR_FS_BASE: 2270 case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2236 vmcs_writel(GUEST_SYSENTER_ESP, data); 2290 vmcs_writel(GUEST_SYSENTER_ESP, data);
2237 break; 2291 break;
2238 case MSR_IA32_TSC: 2292 case MSR_IA32_TSC:
2239 kvm_write_tsc(vcpu, data); 2293 kvm_write_tsc(vcpu, msr_info);
2240 break; 2294 break;
2241 case MSR_IA32_CR_PAT: 2295 case MSR_IA32_CR_PAT:
2242 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2296 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2244 vcpu->arch.pat = data; 2298 vcpu->arch.pat = data;
2245 break; 2299 break;
2246 } 2300 }
2247 ret = kvm_set_msr_common(vcpu, msr_index, data); 2301 ret = kvm_set_msr_common(vcpu, msr_info);
2302 break;
2303 case MSR_IA32_TSC_ADJUST:
2304 ret = kvm_set_msr_common(vcpu, msr_info);
2248 break; 2305 break;
2249 case MSR_TSC_AUX: 2306 case MSR_TSC_AUX:
2250 if (!vmx->rdtscp_enabled) 2307 if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2267 } 2324 }
2268 break; 2325 break;
2269 } 2326 }
2270 ret = kvm_set_msr_common(vcpu, msr_index, data); 2327 ret = kvm_set_msr_common(vcpu, msr_info);
2271 } 2328 }
2272 2329
2273 return ret; 2330 return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
2341 return -EBUSY; 2398 return -EBUSY;
2342 2399
2343 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2400 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2401
2402 /*
2403 * Now we can enable the vmclear operation in kdump
2404 * since the loaded_vmcss_on_cpu list on this cpu
2405 * has been initialized.
2406 *
 2407 * Though the cpu is not in VMX operation yet, it is
 2408 * safe to enable the vmclear operation here because
 2409 * the loaded_vmcss_on_cpu list is still empty.
2410 */
2411 crash_enable_local_vmclear(cpu);
2412
2344 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2413 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2345 2414
2346 test_bits = FEATURE_CONTROL_LOCKED; 2415 test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2698 tmp.base = vmcs_readl(sf->base); 2767 tmp.base = vmcs_readl(sf->base);
2699 tmp.selector = vmcs_read16(sf->selector); 2768 tmp.selector = vmcs_read16(sf->selector);
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
2700 tmp.s = 1; 2770 tmp.s = 1;
2701 } 2771 }
2702 vmx_set_segment(vcpu, &tmp, seg); 2772 vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 3246 * unrestricted guest like Westmere to an older host that doesn't have 3316
 3247 * unrestricted guest like Nehalem. 3317
3248 */ 3318 */
3249 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3319 if (vmx->rmode.vm86_active) {
3250 switch (seg) { 3320 switch (seg) {
3251 case VCPU_SREG_CS: 3321 case VCPU_SREG_CS:
3252 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3897 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3967 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3898 set_cr4_guest_host_mask(vmx); 3968 set_cr4_guest_host_mask(vmx);
3899 3969
3900 kvm_write_tsc(&vmx->vcpu, 0);
3901
3902 return 0; 3970 return 0;
3903} 3971}
3904 3972
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3908 u64 msr; 3976 u64 msr;
3909 int ret; 3977 int ret;
3910 3978
3911 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3912
3913 vmx->rmode.vm86_active = 0; 3979 vmx->rmode.vm86_active = 0;
3914 3980
3915 vmx->soft_vnmi_blocked = 0; 3981 vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3921 msr |= MSR_IA32_APICBASE_BSP; 3987 msr |= MSR_IA32_APICBASE_BSP;
3922 kvm_set_apic_base(&vmx->vcpu, msr); 3988 kvm_set_apic_base(&vmx->vcpu, msr);
3923 3989
3924 ret = fx_init(&vmx->vcpu);
3925 if (ret != 0)
3926 goto out;
3927
3928 vmx_segment_cache_clear(vmx); 3990 vmx_segment_cache_clear(vmx);
3929 3991
3930 seg_setup(VCPU_SREG_CS); 3992 seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3965 kvm_rip_write(vcpu, 0xfff0); 4027 kvm_rip_write(vcpu, 0xfff0);
3966 else 4028 else
3967 kvm_rip_write(vcpu, 0); 4029 kvm_rip_write(vcpu, 0);
3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
3969 4030
3970 vmcs_writel(GUEST_GDTR_BASE, 0); 4031 vmcs_writel(GUEST_GDTR_BASE, 0);
3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4032 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4015 /* HACK: Don't enable emulation on guest boot/reset */ 4076 /* HACK: Don't enable emulation on guest boot/reset */
4016 vmx->emulation_required = 0; 4077 vmx->emulation_required = 0;
4017 4078
4018out:
4019 return ret; 4079 return ret;
4020} 4080}
4021 4081
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4287 if (is_machine_check(intr_info)) 4347 if (is_machine_check(intr_info))
4288 return handle_machine_check(vcpu); 4348 return handle_machine_check(vcpu);
4289 4349
4290 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4291 !is_page_fault(intr_info)) {
4292 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4293 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4294 vcpu->run->internal.ndata = 2;
4295 vcpu->run->internal.data[0] = vect_info;
4296 vcpu->run->internal.data[1] = intr_info;
4297 return 0;
4298 }
4299
4300 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4350 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4301 return 1; /* already handled by vmx_vcpu_run() */ 4351 return 1; /* already handled by vmx_vcpu_run() */
4302 4352
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4315 error_code = 0; 4365 error_code = 0;
4316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4366 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4367 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4368
4369 /*
 4370 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
 4371 * MMIO; in that case it is better to report an internal error.
4372 * See the comments in vmx_handle_exit.
4373 */
4374 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4375 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4376 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4377 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4378 vcpu->run->internal.ndata = 2;
4379 vcpu->run->internal.data[0] = vect_info;
4380 vcpu->run->internal.data[1] = intr_info;
4381 return 0;
4382 }
4383
4318 if (is_page_fault(intr_info)) { 4384 if (is_page_fault(intr_info)) {
4319 /* EPT won't cause page fault directly */ 4385 /* EPT won't cause page fault directly */
4320 BUG_ON(enable_ept); 4386 BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
4626 4692
4627static int handle_wrmsr(struct kvm_vcpu *vcpu) 4693static int handle_wrmsr(struct kvm_vcpu *vcpu)
4628{ 4694{
4695 struct msr_data msr;
4629 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4696 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4630 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4697 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4631 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4698 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4632 4699
4633 if (vmx_set_msr(vcpu, ecx, data) != 0) { 4700 msr.data = data;
4701 msr.index = ecx;
4702 msr.host_initiated = false;
4703 if (vmx_set_msr(vcpu, &msr) != 0) {
4634 trace_kvm_msr_write_ex(ecx, data); 4704 trace_kvm_msr_write_ex(ecx, data);
4635 kvm_inject_gp(vcpu, 0); 4705 kvm_inject_gp(vcpu, 0);
4636 return 1; 4706 return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4827 4897
4828 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4898 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4829 4899
4830 if (exit_qualification & (1 << 6)) {
4831 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
4832 return -EINVAL;
4833 }
4834
4835 gla_validity = (exit_qualification >> 7) & 0x3; 4900 gla_validity = (exit_qualification >> 7) & 0x3;
4836 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 4901 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4837 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 4902 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5979 return 0; 6044 return 0;
5980 } 6045 }
5981 6046
6047 /*
6048 * Note:
 6049 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
 6050 * an event delivery, since that indicates the guest is accessing MMIO.
 6051 * The vm-exit could be triggered again after returning to the guest,
 6052 * which would cause an infinite loop.
6053 */
5982 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6054 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5983 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 6055 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5984 exit_reason != EXIT_REASON_EPT_VIOLATION && 6056 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5985 exit_reason != EXIT_REASON_TASK_SWITCH)) 6057 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5986 printk(KERN_WARNING "%s: unexpected, valid vectoring info " 6058 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5987 "(0x%x) and exit reason is 0x%x\n", 6059 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5988 __func__, vectoring_info, exit_reason); 6060 vcpu->run->internal.ndata = 2;
6061 vcpu->run->internal.data[0] = vectoring_info;
6062 vcpu->run->internal.data[1] = exit_reason;
6063 return 0;
6064 }
5989 6065
5990 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6066 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5991 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6067 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
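
The hunk above turns what used to be a printk warning into a structured exit: userspace gets KVM_EXIT_INTERNAL_ERROR with the new KVM_INTERNAL_ERROR_DELIVERY_EV suberror plus the raw vectoring info and exit reason in internal.data[]. A toy decoder of that payload — the struct below is a local stand-in for the relevant slice of struct kvm_run, not the real uapi layout, and the sample numbers are invented:

#include <stdint.h>
#include <stdio.h>

/* Local mock of the fields a VMM inspects after KVM_RUN returns with
 * exit_reason == KVM_EXIT_INTERNAL_ERROR; real definitions live in
 * <linux/kvm.h>. */
struct mock_internal_error {
        uint32_t suberror;      /* compared against KVM_INTERNAL_ERROR_* */
        uint32_t ndata;
        uint64_t data[16];
};

static void report(const struct mock_internal_error *e)
{
        fprintf(stderr, "KVM internal error, suberror %u\n", e->suberror);
        if (e->ndata >= 2)
                fprintf(stderr, "  vectoring info 0x%llx, exit reason 0x%llx\n",
                        (unsigned long long)e->data[0],
                        (unsigned long long)e->data[1]);
}

int main(void)
{
        struct mock_internal_error e = {
                .suberror = 0,                    /* placeholder value   */
                .ndata    = 2,
                .data     = { 0x80000b0e, 0x30 }, /* sample values only  */
        };
        report(&e);
        return 0;
}
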
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7309 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7385 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7310 7386
7311 .set_tsc_khz = vmx_set_tsc_khz, 7387 .set_tsc_khz = vmx_set_tsc_khz,
7388 .read_tsc_offset = vmx_read_tsc_offset,
7312 .write_tsc_offset = vmx_write_tsc_offset, 7389 .write_tsc_offset = vmx_write_tsc_offset,
7313 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7390 .adjust_tsc_offset = vmx_adjust_tsc_offset,
7314 .compute_tsc_offset = vmx_compute_tsc_offset, 7391 .compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
7367 if (r) 7444 if (r)
7368 goto out3; 7445 goto out3;
7369 7446
7447#ifdef CONFIG_KEXEC
7448 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7449 crash_vmclear_local_loaded_vmcss);
7450#endif
7451
7370 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 7452 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
7371 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 7453 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
7372 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 7454 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
7404 free_page((unsigned long)vmx_io_bitmap_b); 7486 free_page((unsigned long)vmx_io_bitmap_b);
7405 free_page((unsigned long)vmx_io_bitmap_a); 7487 free_page((unsigned long)vmx_io_bitmap_a);
7406 7488
7489#ifdef CONFIG_KEXEC
7490 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
7491 synchronize_rcu();
7492#endif
7493
7407 kvm_exit(); 7494 kvm_exit();
7408} 7495}
7409 7496
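
vmx_init() above publishes crash_vmclear_local_loaded_vmcss() through the RCU-protected crash_vmclear_loaded_vmcss hook, which the kexec/kdump side is expected to dereference at crash time, and vmx_exit() clears it and waits with synchronize_rcu() before the module text can go away. A standalone sketch of that register/use/unregister shape — a plain function pointer stands in for the RCU-managed one, and the RCU primitives only appear in comments:

#include <stddef.h>
#include <stdio.h>

typedef void (*crash_hook_t)(void);
static crash_hook_t crash_hook;            /* rcu_assign_pointer() target */

static void vmclear_local_vmcss(void)      /* role of the VMX callback    */
{
        printf("VMCLEAR all VMCSs loaded on this cpu\n");
}

static void module_init_like(void)
{
        crash_hook = vmclear_local_vmcss;  /* rcu_assign_pointer(hook, fn) */
}

static void module_exit_like(void)
{
        crash_hook = NULL;                 /* rcu_assign_pointer(hook, NULL) */
        /* synchronize_rcu(): wait until no crash path can still be running
         * the old callback before the code behind it is unloaded.          */
}

static void crash_path(void)
{
        crash_hook_t fn = crash_hook;      /* rcu_dereference(hook)        */
        if (fn)
                fn();
}

int main(void)
{
        module_init_like();
        crash_path();
        module_exit_like();
        crash_path();                      /* hook gone: nothing runs      */
        return 0;
}
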
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
46#include <linux/uaccess.h> 46#include <linux/uaccess.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/timekeeper_internal.h>
50#include <linux/pvclock_gtod.h>
49#include <trace/events/kvm.h> 51#include <trace/events/kvm.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
158 160
159u64 __read_mostly host_xcr0; 161u64 __read_mostly host_xcr0;
160 162
161int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
162 166
163static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
164{ 168{
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
633 } 637 }
634 638
635 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
636 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
637 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
638 return 1; 642 return 1;
639 } else 643 } else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
827static unsigned num_msrs_to_save; 831static unsigned num_msrs_to_save;
828 832
829static const u32 emulated_msrs[] = { 833static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST,
830 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
831 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
832 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
886 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
887 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
888 */ 893 */
889int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 894int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
890{ 895{
891 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 896 return kvm_x86_ops->set_msr(vcpu, msr);
892} 897}
893 898
894/* 899/*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
896 */ 901 */
897static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
898{ 903{
899 return kvm_set_msr(vcpu, index, *data); 904 struct msr_data msr;
905
906 msr.data = *data;
907 msr.index = index;
908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr);
900} 910}
901 911
912#ifdef CONFIG_X86_64
913struct pvclock_gtod_data {
914 seqcount_t seq;
915
916 struct { /* extract of a clocksource struct */
917 int vclock_mode;
918 cycle_t cycle_last;
919 cycle_t mask;
920 u32 mult;
921 u32 shift;
922 } clock;
923
924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec;
927};
928
929static struct pvclock_gtod_data pvclock_gtod_data;
930
931static void update_pvclock_gtod(struct timekeeper *tk)
932{
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934
935 write_seqcount_begin(&vdata->seq);
936
937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift;
943
944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift);
949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++;
954 }
955
956 write_seqcount_end(&vdata->seq);
957}
958#endif
959
960
902static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
903{ 962{
904 int version; 963 int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
995 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
996} 1055}
997 1056
1057#ifdef CONFIG_X86_64
1058static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059#endif
1060
998static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
999unsigned long max_tsc_khz; 1062unsigned long max_tsc_khz;
1000 1063
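
update_pvclock_gtod() above is the writer side of a seqcount: the reader (do_monotonic(), added further down) retries whenever the sequence is odd or changes under it. A minimal standalone version of that retry protocol — a plain counter stands in for seqcount_t, and the memory barriers the real primitives provide are deliberately omitted:

#include <stdio.h>

/* Toy seqcount: even = stable, odd = write in progress. */
static volatile unsigned seq;
static long long snapshot_ns;

static void writer_update(long long ns)      /* update_pvclock_gtod() role */
{
        seq++;                               /* write_seqcount_begin()     */
        snapshot_ns = ns;
        seq++;                               /* write_seqcount_end()       */
}

static long long reader_get(void)            /* do_monotonic() role        */
{
        unsigned start;
        long long v;

        do {
                start = seq;                 /* read_seqcount_begin()      */
                v = snapshot_ns;
        } while (start != seq || (start & 1)); /* read_seqcount_retry()    */
        return v;
}

int main(void)
{
        writer_update(123456789);
        printf("monotonic snapshot: %lld ns\n", reader_get());
        return 0;
}
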
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1046 return tsc; 1109 return tsc;
1047} 1110}
1048 1111
1049void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1112void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113{
1114#ifdef CONFIG_X86_64
1115 bool vcpus_matched;
1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus));
1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock)
1125 do_request = 1;
1126
1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1;
1129
1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136#endif
1137}
1138
1139static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140{
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143}
1144
1145void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1050{ 1146{
1051 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1052 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1053 unsigned long flags; 1149 unsigned long flags;
1054 s64 usdiff; 1150 s64 usdiff;
1151 bool matched;
1152 u64 data = msr->data;
1055 1153
1056 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1057 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1094 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1095 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1096 } 1194 }
1195 matched = true;
1097 } else { 1196 } else {
1098 /* 1197 /*
1099 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1108 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1109 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1110 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false;
1111 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1112 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1113 } 1213 }
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1129 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1130 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1131 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1132 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1133 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242
1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1134} 1245}
1135 1246
1136EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247EXPORT_SYMBOL_GPL(kvm_write_tsc);
1137 1248
1249#ifdef CONFIG_X86_64
1250
1251static cycle_t read_tsc(void)
1252{
1253 cycle_t ret;
1254 u64 last;
1255
1256 /*
1257 * Empirically, a fence (of type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen.
1262 */
1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles();
1265
1266 last = pvclock_gtod_data.clock.cycle_last;
1267
1268 if (likely(ret >= last))
1269 return ret;
1270
1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely
 1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code.
1278 */
1279 asm volatile ("");
1280 return last;
1281}
1282
1283static inline u64 vgettsc(cycle_t *cycle_now)
1284{
1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287
1288 *cycle_now = read_tsc();
1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult;
1292}
1293
1294static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295{
1296 unsigned long seq;
1297 u64 ns;
1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300
1301 ts->tv_nsec = 0;
1302 do {
1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns);
1311
1312 return mode;
1313}
1314
1315/* returns true if host is using tsc clocksource */
1316static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317{
1318 struct timespec ts;
1319
1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false;
1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false;
1326
1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts);
1329
1330 return true;
1331}
1332#endif
1333
1334/*
1335 *
 1336 * Assuming a stable TSC across physical CPUs, and a stable TSC
1337 * across virtual CPUs, the following condition is possible.
1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event.
1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value.
1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 *
1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 *
1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ...
1359 * - 0 < N - M => M < N
1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances
 1363 * might be smaller than the difference between corresponding TSC reads,
 1364 * when updating guest vcpus' pvclock areas).
1365 *
1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep.
1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 *
1373 */
1374
1375static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376{
1377#ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched;
1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus));
1384
1385 /*
 1386 * If the host uses the TSC clock, then pass the TSC through as stable
1387 * to the guest.
1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now);
1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394
1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched);
1401#endif
1402}
1403
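
The block comment above pvclock_update_vm_gtod_copy() reduces cross-vCPU monotonicity to the condition M < N. Plugging made-up numbers into the same ret0/ret1 expressions shows what goes wrong when the TSC delta M outruns the monotonic-time delta N — exactly the case the master copy is meant to exclude (all values below are invented, with "ns" and "cycles" treated as directly comparable for illustration):

#include <stdio.h>

int main(void)
{
        long long timespec0 = 1000, tsc0 = 5000, rdtsc = 5400;
        long long N = 50;   /* monotonic time advanced between samples */
        long long M = 80;   /* TSC advanced between samples (M > N!)   */

        long long ret0 = timespec0 + (rdtsc - tsc0);            /* VCPU0 */
        long long ret1 = timespec0 + N + (rdtsc - (tsc0 + M));  /* VCPU1 */

        printf("ret0=%lld ret1=%lld -> %s\n", ret0, ret1,
               ret1 >= ret0 ? "monotonic" : "clock went backwards");
        return 0;
}
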
1138static int kvm_guest_time_update(struct kvm_vcpu *v) 1404static int kvm_guest_time_update(struct kvm_vcpu *v)
1139{ 1405{
1140 unsigned long flags; 1406 unsigned long flags, this_tsc_khz;
1141 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch;
1142 void *shared_kaddr; 1409 void *shared_kaddr;
1143 unsigned long this_tsc_khz;
1144 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1145 u64 tsc_timestamp; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1146 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock;
1415
1416 kernel_ns = 0;
1417 host_tsc = 0;
1147 1418
1148 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1149 local_irq_save(flags); 1420 local_irq_save(flags);
1150 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1151 kernel_ns = get_kernel_ns();
1152 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1153 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1154 local_irq_restore(flags); 1423 local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1157 } 1426 }
1158 1427
1159 /* 1428 /*
 1429 * If the host uses the TSC clock, then pass the TSC through as stable
1430 * to the guest.
1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns;
1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns();
1442 }
1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445
1446 /*
1160 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1161 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1162 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1217 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1218 } 1505 }
1219 1506
1220 if (max_kernel_ns > kernel_ns) 1507 /* with a master <monotonic time, tsc value> tuple,
1221 kernel_ns = max_kernel_ns; 1508 * pvclock clock reads always increase at the (scaled) rate
1222 1509 * of guest TSC - no need to deal with sampling errors.
1510 */
1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns;
1514 }
1223 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1224 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1226 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1227 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1228 1520
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1236
1237 /* 1521 /*
1238 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1239 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1243 1527
1244 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1245 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534
1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 }
1539
1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543
1544 vcpu->hv_clock.flags = pvclock_flags;
1545
1246 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1247 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1248 1548
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
1572 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1573} 1873}
1574 1874
1575int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1875int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1576{ 1876{
1577 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data;
1578 1880
1579 switch (msr) { 1881 switch (msr) {
1580 case MSR_EFER: 1882 case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1625 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1626 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1627 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 }
1938 break;
1628 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1629 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1630 break; 1941 break;
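
The MSR_IA32_TSC_ADJUST case above only nudges the TSC offset by the delta between the new value and the stored one, and only for guest-originated writes; host-initiated writes just record the value. A toy model of that bookkeeping — the two variables mirror the fields the hunk touches, nothing else here is the real implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int64_t ia32_tsc_adjust_msr;    /* vcpu->arch.ia32_tsc_adjust_msr */
static int64_t tsc_offset;             /* what adjust_tsc_offset() moves */

static void wrmsr_tsc_adjust(int64_t data, bool host_initiated)
{
        if (!host_initiated) {
                int64_t adj = data - ia32_tsc_adjust_msr;
                tsc_offset += adj;      /* kvm_x86_ops->adjust_tsc_offset() */
        }
        ia32_tsc_adjust_msr = data;
}

int main(void)
{
        wrmsr_tsc_adjust(100, false);   /* guest write: offset moves by 100 */
        wrmsr_tsc_adjust(100, true);    /* host restore: offset unchanged   */
        printf("adjust=%lld offset=%lld\n",
               (long long)ia32_tsc_adjust_msr, (long long)tsc_offset);
        return 0;
}
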
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1984 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
1985 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1986 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break;
1987 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
1988 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
1989 break; 2303 break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2342 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2343 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2344 } 2658 }
2345 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2659 /*
2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration
2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2346 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2347 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2348 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2691 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
2692 goto out; 3011 goto out;
2693 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
2694 if (IS_ERR(u.lapic)) { 3013 if (IS_ERR(u.lapic))
2695 r = PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
2696 goto out;
2697 }
2698 3015
2699 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2700 if (r)
2701 goto out;
2702 r = 0;
2703 break; 3017 break;
2704 } 3018 }
2705 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2709 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
2710 goto out; 3024 goto out;
2711 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2712 if (r)
2713 goto out;
2714 r = 0;
2715 break; 3026 break;
2716 } 3027 }
2717 case KVM_NMI: { 3028 case KVM_NMI: {
2718 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
2719 if (r)
2720 goto out;
2721 r = 0;
2722 break; 3030 break;
2723 } 3031 }
2724 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2729 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2730 goto out; 3038 goto out;
2731 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2732 if (r)
2733 goto out;
2734 break; 3040 break;
2735 } 3041 }
2736 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2742 goto out; 3048 goto out;
2743 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2744 cpuid_arg->entries); 3050 cpuid_arg->entries);
2745 if (r)
2746 goto out;
2747 break; 3051 break;
2748 } 3052 }
2749 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2875 } 3179 }
2876 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
2877 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
2878 if (IS_ERR(u.xsave)) { 3182 if (IS_ERR(u.xsave))
2879 r = PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
2880 goto out;
2881 }
2882 3184
2883 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2884 break; 3186 break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2900 } 3202 }
2901 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
2902 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
2903 if (IS_ERR(u.xcrs)) { 3205 if (IS_ERR(u.xcrs))
2904 r = PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
2905 goto out;
2906 }
2907 3207
2908 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2909 break; 3209 break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2951 int ret; 3251 int ret;
2952 3252
2953 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
2954 return -1; 3254 return -EINVAL;
2955 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2956 return ret; 3256 return ret;
2957} 3257}
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3212 switch (ioctl) { 3512 switch (ioctl) {
3213 case KVM_SET_TSS_ADDR: 3513 case KVM_SET_TSS_ADDR:
3214 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3215 if (r < 0)
3216 goto out;
3217 break; 3515 break;
3218 case KVM_SET_IDENTITY_MAP_ADDR: { 3516 case KVM_SET_IDENTITY_MAP_ADDR: {
3219 u64 ident_addr; 3517 u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3222 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3223 goto out; 3521 goto out;
3224 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3225 if (r < 0)
3226 goto out;
3227 break; 3523 break;
3228 } 3524 }
3229 case KVM_SET_NR_MMU_PAGES: 3525 case KVM_SET_NR_MMU_PAGES:
3230 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3231 if (r)
3232 goto out;
3233 break; 3527 break;
3234 case KVM_GET_NR_MMU_PAGES: 3528 case KVM_GET_NR_MMU_PAGES:
3235 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3320 r = 0; 3614 r = 0;
3321 get_irqchip_out: 3615 get_irqchip_out:
3322 kfree(chip); 3616 kfree(chip);
3323 if (r)
3324 goto out;
3325 break; 3617 break;
3326 } 3618 }
3327 case KVM_SET_IRQCHIP: { 3619 case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3343 r = 0; 3635 r = 0;
3344 set_irqchip_out: 3636 set_irqchip_out:
3345 kfree(chip); 3637 kfree(chip);
3346 if (r)
3347 goto out;
3348 break; 3638 break;
3349 } 3639 }
3350 case KVM_GET_PIT: { 3640 case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3371 if (!kvm->arch.vpit) 3661 if (!kvm->arch.vpit)
3372 goto out; 3662 goto out;
3373 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3374 if (r)
3375 goto out;
3376 r = 0;
3377 break; 3664 break;
3378 } 3665 }
3379 case KVM_GET_PIT2: { 3666 case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3397 if (!kvm->arch.vpit) 3684 if (!kvm->arch.vpit)
3398 goto out; 3685 goto out;
3399 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3400 if (r)
3401 goto out;
3402 r = 0;
3403 break; 3687 break;
3404 } 3688 }
3405 case KVM_REINJECT_CONTROL: { 3689 case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3408 if (copy_from_user(&control, argp, sizeof(control))) 3692 if (copy_from_user(&control, argp, sizeof(control)))
3409 goto out; 3693 goto out;
3410 r = kvm_vm_ioctl_reinject(kvm, &control); 3694 r = kvm_vm_ioctl_reinject(kvm, &control);
3411 if (r)
3412 goto out;
3413 r = 0;
3414 break; 3695 break;
3415 } 3696 }
3416 case KVM_XEN_HVM_CONFIG: { 3697 case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4273static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4554static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4274 u32 msr_index, u64 data) 4555 u32 msr_index, u64 data)
4275{ 4556{
4276 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4557 struct msr_data msr;
4558
4559 msr.data = data;
4560 msr.index = msr_index;
4561 msr.host_initiated = false;
4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4277} 4563}
4278 4564
4279static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4565static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4495 * instruction -> ... 4781 * instruction -> ...
4496 */ 4782 */
4497 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4498 if (!is_error_pfn(pfn)) { 4784 if (!is_error_noslot_pfn(pfn)) {
4499 kvm_release_pfn_clean(pfn); 4785 kvm_release_pfn_clean(pfn);
4500 return true; 4786 return true;
4501 } 4787 }
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
4881 kvm_mmu_set_mmio_spte_mask(mask); 5167 kvm_mmu_set_mmio_spte_mask(mask);
4882} 5168}
4883 5169
5170#ifdef CONFIG_X86_64
5171static void pvclock_gtod_update_fn(struct work_struct *work)
5172{
5173 struct kvm *kvm;
5174
5175 struct kvm_vcpu *vcpu;
5176 int i;
5177
5178 raw_spin_lock(&kvm_lock);
5179 list_for_each_entry(kvm, &vm_list, vm_list)
5180 kvm_for_each_vcpu(i, vcpu, kvm)
5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5182 atomic_set(&kvm_guest_has_master_clock, 0);
5183 raw_spin_unlock(&kvm_lock);
5184}
5185
5186static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5187
5188/*
5189 * Notification about pvclock gtod data update.
5190 */
5191static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5192 void *priv)
5193{
5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5195 struct timekeeper *tk = priv;
5196
5197 update_pvclock_gtod(tk);
5198
5199 /* disable master clock if host does not trust, or does not
5200 * use, TSC clocksource
5201 */
5202 if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5203 atomic_read(&kvm_guest_has_master_clock) != 0)
5204 queue_work(system_long_wq, &pvclock_gtod_work);
5205
5206 return 0;
5207}
5208
5209static struct notifier_block pvclock_gtod_notifier = {
5210 .notifier_call = pvclock_gtod_notify,
5211};
5212#endif
5213
4884int kvm_arch_init(void *opaque) 5214int kvm_arch_init(void *opaque)
4885{ 5215{
4886 int r; 5216 int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
4922 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4923 5253
4924 kvm_lapic_init(); 5254 kvm_lapic_init();
5255#ifdef CONFIG_X86_64
5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5257#endif
5258
4925 return 0; 5259 return 0;
4926 5260
4927out: 5261out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
4936 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4937 CPUFREQ_TRANSITION_NOTIFIER); 5271 CPUFREQ_TRANSITION_NOTIFIER);
4938 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5273#ifdef CONFIG_X86_64
5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5275#endif
4939 kvm_x86_ops = NULL; 5276 kvm_x86_ops = NULL;
4940 kvm_mmu_module_exit(); 5277 kvm_mmu_module_exit();
4941} 5278}
@@ -5059,7 +5396,7 @@ out:
5059} 5396}
5060EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5397EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5061 5398
5062int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5399static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5063{ 5400{
5064 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5065 char instruction[3]; 5402 char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5235 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 kvm_make_request(KVM_REQ_EVENT, vcpu);
5236} 5573}
5237 5574
5575static void kvm_gen_update_masterclock(struct kvm *kvm)
5576{
5577#ifdef CONFIG_X86_64
5578 int i;
5579 struct kvm_vcpu *vcpu;
5580 struct kvm_arch *ka = &kvm->arch;
5581
5582 spin_lock(&ka->pvclock_gtod_sync_lock);
5583 kvm_make_mclock_inprogress_request(kvm);
5584 /* no guest entries from this point */
5585 pvclock_update_vm_gtod_copy(kvm);
5586
5587 kvm_for_each_vcpu(i, vcpu, kvm)
5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5589
5590 /* guest entries allowed */
5591 kvm_for_each_vcpu(i, vcpu, kvm)
5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5593
5594 spin_unlock(&ka->pvclock_gtod_sync_lock);
5595#endif
5596}
5597
5238static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5598static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5239{ 5599{
5240 int r; 5600 int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5247 kvm_mmu_unload(vcpu); 5607 kvm_mmu_unload(vcpu);
5248 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5249 __kvm_migrate_timers(vcpu); 5609 __kvm_migrate_timers(vcpu);
5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5611 kvm_gen_update_masterclock(vcpu->kvm);
5250 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5251 r = kvm_guest_time_update(vcpu); 5613 r = kvm_guest_time_update(vcpu);
5252 if (unlikely(r)) 5614 if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+							   native_read_tsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			pr_debug("vcpu %d received sipi with vector # %x\n",
 				 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 			kvm_lapic_reset(vcpu);
-			r = kvm_arch_vcpu_reset(vcpu);
+			r = kvm_vcpu_reset(vcpu);
 			if (r)
 				return r;
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_arch_vcpu_reset(vcpu);
+	r = kvm_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	int r;
+	struct msr_data msr;
+
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
+	msr.data = 0x0;
+	msr.index = MSR_IA32_TSC;
+	msr.host_initiated = true;
+	kvm_write_tsc(vcpu, &msr);
+	vcpu_put(vcpu);
+
+	return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_pmu_reset(vcpu);
 
+	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	r = fx_init(vcpu);
+	if (r)
+		goto fail_free_wbinvd_dirty_mask;
+
+	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
+fail_free_wbinvd_dirty_mask:
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac8..e224f7a671b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421a..205ad328aa52 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad558573..2f94b039e55b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
-	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
+	p = __getcpu();
+
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p & VGETCPU_CPU_MASK;
 	if (node)
 		*node = p >> 12;
 	return 0;