author     Linus Torvalds <torvalds@linux-foundation.org>	2012-12-13 18:31:08 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>	2012-12-13 18:31:08 -0500
commit     66cdd0ceaf65a18996f561b770eedde1d123b019
tree       4892eaa422d366fce5d1e866ff1fe0988af95569 /arch/x86
parent     896ea17d3da5f44b2625c9cda9874d7dfe447393
parent     58b7825bc324da55415034a9f6ca5d716b8fd898

Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
"Considerable KVM/PPC work, x86 kvmclock vsyscall support,
IA32_TSC_ADJUST MSR emulation, amongst others."
Fix up trivial conflict in kernel/sched/core.c due to cross-cpu
migration notifier added next to rq migration call-back.
* tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits)
KVM: emulator: fix real mode segment checks in address linearization
VMX: remove unneeded enable_unrestricted_guest check
KVM: VMX: fix DPL during entry to protected mode
x86/kexec: crash_vmclear_local_vmcss needs __rcu
kvm: Fix irqfd resampler list walk
KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump
x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary
KVM: MMU: optimize for set_spte
KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface
KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation
KVM: PPC: bookehv: Add guest computation mode for irq delivery
KVM: PPC: Make EPCR a valid field for booke64 and bookehv
KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit
KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation
KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation
KVM: PPC: e500: Add emulation helper for getting instruction ea
KVM: PPC: bookehv64: Add support for interrupt handling
KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler
KVM: PPC: booke: Fix get_tb() compile error on 64-bit
KVM: PPC: e500: Silence bogus GCC warning in tlb code
...
Diffstat (limited to 'arch/x86')
27 files changed, 1209 insertions, 339 deletions
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
 #define VCLOCK_NONE 0  /* No vDSO clock available.  */
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc. */
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet. */
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
 
 struct arch_clocksource_data {
 	int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST	(9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
 #define X86_FEATURE_HLE	(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
+#include <asm/pvclock.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
 	VVAR_PAGE,
 	VSYSCALL_HPET,
 #endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+	PVCLOCK_FIXMAP_BEGIN,
+	PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
 };
 #endif
 
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_GUEST_H
+#define _ASM_X86_KVM_GUEST_H
+
+int kvm_setup_vsyscall_timeinfo(void);
+
+#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
 	s8 virtual_tsc_shift;
 	u32 virtual_tsc_mult;
 	u32 virtual_tsc_khz;
+	s64 ia32_tsc_adjust_msr;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
 	u64 cur_tsc_write;
 	u64 cur_tsc_offset;
 	u8  cur_tsc_generation;
+	int nr_vcpus_matched_tsc;
+
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+	bool host_initiated;
+	u32 index;
+	u64 data;
+};
+
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
 
 	void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
 	bool (*has_wbinvd_exit)(void);
 
 	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
+	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
-	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
+	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
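
With set_msr now taking a struct msr_data, a write is described as an (index, data, origin) triple rather than two scalars, so the backends can tell host-initiated writes (ioctl restore paths) apart from guest WRMSRs. A minimal caller sketch, assuming some value val to be written (illustrative, not code from this series):

	struct msr_data msr = {
		.index = MSR_IA32_TSC_ADJUST,
		.data = val,
		.host_initiated = true,	/* userspace restore, not a guest WRMSR */
	};

	kvm_set_msr(vcpu, &msr);
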
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL	0x0000003a
+#define MSR_IA32_TSC_ADJUST		0x0000003b
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
 void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 	return product;
 }
 
+static __always_inline
+u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+{
+	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
+				   src->tsc_shift);
+}
+
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			       cycle_t *cycles, u8 *flags)
+{
+	unsigned version;
+	cycle_t ret, offset;
+	u8 ret_flags;
+
+	version = src->version;
+	/* Note: emulated platforms which do not advertise SSE2 support
+	 * result in kvmclock not using the necessary RDTSC barriers.
+	 * Without barriers, it is possible that RDTSC instruction reads from
+	 * the time stamp counter outside rdtsc_barrier protected section
+	 * below, resulting in violation of monotonicity.
+	 */
+	rdtsc_barrier();
+	offset = pvclock_get_nsec_offset(src);
+	ret = src->system_time + offset;
+	ret_flags = src->flags;
+	rdtsc_barrier();
+
+	*cycles = ret;
+	*flags = ret_flags;
+	return version;
+}
+
+struct pvclock_vsyscall_time_info {
+	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
 #endif /* _ASM_X86_PVCLOCK_H */
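
__pvclock_read_cycles() snapshots a single version of the record; callers are expected to retry until the version is even and unchanged, as the reworked pvclock_clocksource_read() further down in this series does. A consumer sketch:

	cycle_t cycles;
	unsigned version;
	u8 flags;

	do {
		version = __pvclock_read_cycles(src, &cycles, &flags);
	} while ((src->version & 1) || version != src->version);
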
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
 #define VMX_EPT_AD_BIT				(1ull << 21)
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
  */
 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
 
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+	unsigned int p;
+
+	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		native_read_tscp(&p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+
+	return p;
+}
+#endif /* CONFIG_X86_64 */
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
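
__getcpu() returns the per-cpu word that the vgetcpu machinery stores via RDTSCP or the GDT limit trick; only the low 12 bits are the CPU number, which is why VGETCPU_CPU_MASK is 0xfff. A decoding sketch (the node-in-upper-bits layout is the usual vgetcpu encoding, assumed here rather than shown in this hunk):

	unsigned int p = __getcpu();
	unsigned int cpu  = p & VGETCPU_CPU_MASK;	/* low 12 bits */
	unsigned int node = p >> 12;			/* upper bits */
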
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/module.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -30,6 +31,27 @@
 
 int in_crash_kexec;
 
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+	crash_vmclear_fn *do_vmclear_operation = NULL;
+
+	rcu_read_lock();
+	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+	if (do_vmclear_operation)
+		do_vmclear_operation();
+	rcu_read_unlock();
+}
+
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #endif
 	crash_save_cpu(regs, cpu);
 
+	/*
+	 * VMCLEAR VMCSs loaded on all cpus if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Disable VMX or SVM if needed.
 	 *
 	 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	kdump_nmi_shootdown_cpus();
 
+	/*
+	 * VMCLEAR VMCSs loaded on this cpu if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Booting kdump kernel with VMX or SVM enabled won't work,
 	 * because (among other limitations) we can't disable paging
 	 * with the virt flags.
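
The crash path only VMCLEARs anything if a VMX module has published a callback through crash_vmclear_loaded_vmcss. A registration sketch for a kvm_intel-style module, assuming it provides its own crash_vmclear_local_loaded_vmcss() walker (the callback name here is illustrative):

	/* module load: publish the per-cpu VMCLEAR walker */
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);

	/* module unload: unpublish, then wait out readers in the crash path */
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
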
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
 
 static int kvmapf = 1;
 
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+	kvmclock_vsyscall = 0;
+	return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
 	struct kvm_task_sleep_node n, *e;
 	DEFINE_WAIT(wait);
-	int cpu, idle;
-
-	cpu = get_cpu();
-	idle = idle_cpu(cpu);
-	put_cpu();
 
 	spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = idle || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1;
 	init_waitqueue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	if (kvmclock_vsyscall)
+		kvm_setup_vsyscall_timeinfo();
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
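
kvmclock vsyscall setup is on by default in KVM guests; it can be turned off with the new early parameter by appending it to the guest kernel command line, e.g.:

	no-kvmclock-vsyscall
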
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
 #include <asm/apic.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/memblock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
 /*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
 	struct pvclock_vcpu_time_info *vcpu_time;
 	struct timespec ts;
 	int low, high;
+	int cpu;
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
 
 	native_write_msr(msr_kvm_wall_clock, low, high);
 
-	vcpu_time = &get_cpu_var(hv_clock);
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
-	put_cpu_var(hv_clock);
+
+	preempt_enable();
 
 	return ts.tv_sec;
 }
@@ -74,9 +80,11 @@
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
+	int cpu;
 
 	preempt_disable_notrace();
-	src = &__get_cpu_var(hv_clock);
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
 	ret = pvclock_clocksource_read(src);
 	preempt_enable_notrace();
 	return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 static unsigned long kvm_get_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *src;
-	src = &per_cpu(hv_clock, 0);
-	return pvclock_tsc_khz(src);
+	int cpu;
+	unsigned long tsc_khz;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	tsc_khz = pvclock_tsc_khz(src);
+	preempt_enable();
+	return tsc_khz;
 }
 
 static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
 {
 	bool ret = false;
 	struct pvclock_vcpu_time_info *src;
+	int cpu = smp_processor_id();
 
-	src = &__get_cpu_var(hv_clock);
+	if (!hv_clock)
+		return ret;
+
+	src = &hv_clock[cpu].pvti;
 	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
-		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		ret = true;
 	}
 
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
+	struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
 
-	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
-	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	low = (int)__pa(src) | 1;
+	high = ((u64)__pa(src) >> 32);
 	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+	unsigned long mem;
+
 	if (!kvm_para_available())
 		return;
 
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
 	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
-	if (kvm_register_clock("boot clock"))
+	mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
+			     PAGE_SIZE);
+	if (!mem)
+		return;
+	hv_clock = __va(mem);
+
+	if (kvm_register_clock("boot clock")) {
+		hv_clock = NULL;
+		memblock_free(mem,
+			sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 		return;
+	}
 	pv_time_ops.sched_clock = kvm_clock_read;
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+	int cpu;
+	int ret;
+	u8 flags;
+	struct pvclock_vcpu_time_info *vcpu_time;
+	unsigned int size;
+
+	size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+		preempt_enable();
+		return 1;
+	}
+
+	if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+		preempt_enable();
+		return ret;
+	}
+
+	preempt_enable();
+
+	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+	return 0;
+}
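
The memblock allocation above reserves one pvclock_vsyscall_time_info slot per possible CPU; PVCLOCK_VSYSCALL_NR_PAGES in pvclock.h rounds the same quantity up to whole pages for the fixmap. Worked numbers, assuming 4 KiB pages and a 64-byte SMP_CACHE_BYTES (so PVTI_SIZE is 64 and 64 entries fit per page):

	entries per page:  4096 / 64 = 64
	NR_CPUS = 64:   ((64 - 1) / 64) + 1  = 1 page
	NR_CPUS = 256:  ((256 - 1) / 64) + 1 = 4 pages
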
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
 
 #include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
 #include <asm/pvclock.h>
 
-/*
- * These are perodically updated
- *    xen: magic shared_info page
- *    kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-	u32 tsc_to_nsec_mul;
-	int tsc_shift;
-	u32 version;
-	u8  flags;
-};
-
 static u8 valid_flags __read_mostly = 0;
 
 void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
 	valid_flags = flags;
 }
 
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
-	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
-				   shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
-					struct pvclock_vcpu_time_info *src)
-{
-	do {
-		dst->version = src->version;
-		rmb();		/* fetch version before data */
-		dst->tsc_timestamp     = src->tsc_timestamp;
-		dst->system_timestamp  = src->system_time;
-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-		dst->tsc_shift         = src->tsc_shift;
-		dst->flags             = src->flags;
-		rmb();		/* test version after fetching data */
-	} while ((src->version & 1) || (dst->version != src->version));
-
-	return dst->version;
-}
-
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
 	u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@
 	atomic64_set(&last_value, 0);
 }
 
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	return flags & valid_flags;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
-	struct pvclock_shadow_time shadow;
 	unsigned version;
-	cycle_t ret, offset;
+	cycle_t ret;
 	u64 last;
+	u8 flags;
 
 	do {
-		version = pvclock_get_time_values(&shadow, src);
-		barrier();
-		offset = pvclock_get_nsec_offset(&shadow);
-		ret = shadow.system_timestamp + offset;
-		barrier();
-	} while (version != src->version);
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
 
 	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
-		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		(flags & PVCLOCK_TSC_STABLE_BIT))
 		return ret;
 
 	/*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
+
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
+#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+				void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
+/*
+ * Initialize the generic pvclock vsyscall state.  This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size)
+{
+	int idx;
+
+	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+	pvclock_vdso_info = i;
+
+	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+			     __pa_symbol(i) + (idx*PAGE_SIZE),
+			     PAGE_KERNEL_VVAR);
+	}
+
+
+	register_task_migration_notifier(&pvclock_migrate);
+
+	return 0;
+}
+#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		if (index == 0) {
 			entry->ebx &= kvm_supported_word9_x86_features;
 			cpuid_mask(&entry->ebx, 9);
+			// TSC_ADJUST is emulated
+			entry->ebx |= F(TSC_ADJUST);
 		} else
 			entry->ebx = 0;
 		entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
 }
+EXPORT_SYMBOL_GPL(kvm_cpuid);
 
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
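
guest_cpuid_has_tsc_adjust() lets the MSR code check whether the guest was actually given the TSC_ADJUST CPUID bit before honouring a write. A simplified sketch of the kind of check the x86.c side of this series applies to MSR_IA32_TSC_ADJUST (not the literal committed code, which also folds the delta into the TSC offset):

	if (msr->index == MSR_IA32_TSC_ADJUST) {
		if (!guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
			return 1;	/* feature not exposed: reject the guest WRMSR */
		vcpu->arch.ia32_tsc_adjust_msr = msr->data;
	}
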
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 						addr.seg);
 		if (!usable)
 			goto bad;
-		/* code segment or read-only data segment */
-		if (((desc.type & 8) || !(desc.type & 2)) && write)
+		/* code segment in protected mode or read-only data segment */
+		if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+		     || !(desc.type & 2)) && write)
 			goto bad;
 		/* unreadable code segment */
 		if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	    || (!vcpu->arch.mmu.direct_map && write_fault
 		&& !is_write_protection(vcpu) && !user_fault)) {
 
+		/*
+		 * There are two cases:
+		 * - the one is other vcpu creates new sp in the window
+		 *   between mapping_level() and acquiring mmu-lock.
+		 * - the another case is the new sp is created by itself
+		 *   (page-fault path) when guest uses the target gfn as
+		 *   its page table.
+		 * Both of these cases can be fixed by allowing guest to
+		 * retry the access, it will refault, then we can establish
+		 * the mapping by using small page.
+		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
-			ret = 1;
-			drop_spte(vcpu->kvm, sptep);
+		    has_wrprotected_page(vcpu->kvm, gfn, level))
 			goto done;
-		}
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!is_present_gpte(gpte))
+		goto no_present;
+
+	if (!(gpte & PT_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
-	return unlikely(is_invalid_pfn(pfn));
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 				pfn_t pfn, unsigned access, int *ret_val)
 {
 	bool ret = true;
 
 	/* The pfn is invalid, report the error! */
-	if (unlikely(is_invalid_pfn(pfn))) {
+	if (unlikely(is_error_pfn(pfn))) {
 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
 		goto exit;
 	}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 {
 	unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 714e2c01a6fe..891eb6d93b8b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, | |||
305 | addr, access); | 305 | addr, access); |
306 | } | 306 | } |
307 | 307 | ||
308 | static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | 308 | static bool |
309 | struct kvm_mmu_page *sp, u64 *spte, | 309 | FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
310 | pt_element_t gpte) | 310 | u64 *spte, pt_element_t gpte, bool no_dirty_log) |
311 | { | 311 | { |
312 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
313 | goto no_present; | ||
314 | |||
315 | if (!is_present_gpte(gpte)) | ||
316 | goto no_present; | ||
317 | |||
318 | if (!(gpte & PT_ACCESSED_MASK)) | ||
319 | goto no_present; | ||
320 | |||
321 | return false; | ||
322 | |||
323 | no_present: | ||
324 | drop_spte(vcpu->kvm, spte); | ||
325 | return true; | ||
326 | } | ||
327 | |||
328 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
329 | u64 *spte, const void *pte) | ||
330 | { | ||
331 | pt_element_t gpte; | ||
332 | unsigned pte_access; | 312 | unsigned pte_access; |
313 | gfn_t gfn; | ||
333 | pfn_t pfn; | 314 | pfn_t pfn; |
334 | 315 | ||
335 | gpte = *(const pt_element_t *)pte; | 316 | if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) |
336 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | 317 | return false; |
337 | return; | ||
338 | 318 | ||
339 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 319 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
320 | |||
321 | gfn = gpte_to_gfn(gpte); | ||
340 | pte_access = sp->role.access & gpte_access(vcpu, gpte); | 322 | pte_access = sp->role.access & gpte_access(vcpu, gpte); |
341 | protect_clean_gpte(&pte_access, gpte); | 323 | protect_clean_gpte(&pte_access, gpte); |
342 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); | 324 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, |
343 | if (mmu_invalid_pfn(pfn)) | 325 | no_dirty_log && (pte_access & ACC_WRITE_MASK)); |
344 | return; | 326 | if (is_error_pfn(pfn)) |
327 | return false; | ||
345 | 328 | ||
346 | /* | 329 | /* |
347 | * we call mmu_set_spte() with host_writable = true because that | 330 | * we call mmu_set_spte() with host_writable = true because |
348 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 331 | * pte_prefetch_gfn_to_pfn always gets a writable pfn. |
349 | */ | 332 | */ |
350 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 333 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
351 | NULL, PT_PAGE_TABLE_LEVEL, | 334 | NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); |
352 | gpte_to_gfn(gpte), pfn, true, true); | 335 | |
336 | return true; | ||
337 | } | ||
338 | |||
339 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
340 | u64 *spte, const void *pte) | ||
341 | { | ||
342 | pt_element_t gpte = *(const pt_element_t *)pte; | ||
343 | |||
344 | FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); | ||
353 | } | 345 | } |
354 | 346 | ||
355 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, | 347 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, |
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
395 | spte = sp->spt + i; | 387 | spte = sp->spt + i; |
396 | 388 | ||
397 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | 389 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { |
398 | pt_element_t gpte; | ||
399 | unsigned pte_access; | ||
400 | gfn_t gfn; | ||
401 | pfn_t pfn; | ||
402 | |||
403 | if (spte == sptep) | 390 | if (spte == sptep) |
404 | continue; | 391 | continue; |
405 | 392 | ||
406 | if (is_shadow_present_pte(*spte)) | 393 | if (is_shadow_present_pte(*spte)) |
407 | continue; | 394 | continue; |
408 | 395 | ||
409 | gpte = gptep[i]; | 396 | if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) |
410 | |||
411 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | ||
412 | continue; | ||
413 | |||
414 | pte_access = sp->role.access & gpte_access(vcpu, gpte); | ||
415 | protect_clean_gpte(&pte_access, gpte); | ||
416 | gfn = gpte_to_gfn(gpte); | ||
417 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | ||
418 | pte_access & ACC_WRITE_MASK); | ||
419 | if (mmu_invalid_pfn(pfn)) | ||
420 | break; | 397 | break; |
421 | |||
422 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | ||
423 | NULL, PT_PAGE_TABLE_LEVEL, gfn, | ||
424 | pfn, true, true); | ||
425 | } | 398 | } |
426 | } | 399 | } |
427 | 400 | ||
428 | /* | 401 | /* |
429 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 402 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
403 | * If the guest tries to write a write-protected page, we need to | ||
404 | * emulate this operation, return 1 to indicate this case. | ||
430 | */ | 405 | */ |
431 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 406 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
432 | struct guest_walker *gw, | 407 | struct guest_walker *gw, |
433 | int user_fault, int write_fault, int hlevel, | 408 | int user_fault, int write_fault, int hlevel, |
434 | int *emulate, pfn_t pfn, bool map_writable, | 409 | pfn_t pfn, bool map_writable, bool prefault) |
435 | bool prefault) | ||
436 | { | 410 | { |
437 | unsigned access = gw->pt_access; | ||
438 | struct kvm_mmu_page *sp = NULL; | 411 | struct kvm_mmu_page *sp = NULL; |
439 | int top_level; | ||
440 | unsigned direct_access; | ||
441 | struct kvm_shadow_walk_iterator it; | 412 | struct kvm_shadow_walk_iterator it; |
413 | unsigned direct_access, access = gw->pt_access; | ||
414 | int top_level, emulate = 0; | ||
442 | 415 | ||
443 | if (!is_present_gpte(gw->ptes[gw->level - 1])) | 416 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
444 | return NULL; | 417 | return 0; |
445 | 418 | ||
446 | direct_access = gw->pte_access; | 419 | direct_access = gw->pte_access; |
447 | 420 | ||
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
505 | 478 | ||
506 | clear_sp_write_flooding_count(it.sptep); | 479 | clear_sp_write_flooding_count(it.sptep); |
507 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, | 480 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, |
508 | user_fault, write_fault, emulate, it.level, | 481 | user_fault, write_fault, &emulate, it.level, |
509 | gw->gfn, pfn, prefault, map_writable); | 482 | gw->gfn, pfn, prefault, map_writable); |
510 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 483 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
511 | 484 | ||
512 | return it.sptep; | 485 | return emulate; |
513 | 486 | ||
514 | out_gpte_changed: | 487 | out_gpte_changed: |
515 | if (sp) | 488 | if (sp) |
516 | kvm_mmu_put_page(sp, it.sptep); | 489 | kvm_mmu_put_page(sp, it.sptep); |
517 | kvm_release_pfn_clean(pfn); | 490 | kvm_release_pfn_clean(pfn); |
518 | return NULL; | 491 | return 0; |
519 | } | 492 | } |
520 | 493 | ||
521 | /* | 494 | /* |
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
538 | int write_fault = error_code & PFERR_WRITE_MASK; | 511 | int write_fault = error_code & PFERR_WRITE_MASK; |
539 | int user_fault = error_code & PFERR_USER_MASK; | 512 | int user_fault = error_code & PFERR_USER_MASK; |
540 | struct guest_walker walker; | 513 | struct guest_walker walker; |
541 | u64 *sptep; | ||
542 | int emulate = 0; | ||
543 | int r; | 514 | int r; |
544 | pfn_t pfn; | 515 | pfn_t pfn; |
545 | int level = PT_PAGE_TABLE_LEVEL; | 516 | int level = PT_PAGE_TABLE_LEVEL; |
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
594 | return r; | 565 | return r; |
595 | 566 | ||
596 | spin_lock(&vcpu->kvm->mmu_lock); | 567 | spin_lock(&vcpu->kvm->mmu_lock); |
597 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 568 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) |
598 | goto out_unlock; | 569 | goto out_unlock; |
599 | 570 | ||
600 | kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | 571 | kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); |
601 | kvm_mmu_free_some_pages(vcpu); | 572 | kvm_mmu_free_some_pages(vcpu); |
602 | if (!force_pt_level) | 573 | if (!force_pt_level) |
603 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | 574 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); |
604 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 575 | r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
605 | level, &emulate, pfn, map_writable, prefault); | 576 | level, pfn, map_writable, prefault); |
606 | (void)sptep; | ||
607 | pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, | ||
608 | sptep, *sptep, emulate); | ||
609 | |||
610 | ++vcpu->stat.pf_fixed; | 577 | ++vcpu->stat.pf_fixed; |
611 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | 578 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
612 | spin_unlock(&vcpu->kvm->mmu_lock); | 579 | spin_unlock(&vcpu->kvm->mmu_lock); |
613 | 580 | ||
614 | return emulate; | 581 | return r; |
615 | 582 | ||
616 | out_unlock: | 583 | out_unlock: |
617 | spin_unlock(&vcpu->kvm->mmu_lock); | 584 | spin_unlock(&vcpu->kvm->mmu_lock); |
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
757 | sizeof(pt_element_t))) | 724 | sizeof(pt_element_t))) |
758 | return -EINVAL; | 725 | return -EINVAL; |
759 | 726 | ||
760 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { | 727 | if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { |
761 | vcpu->kvm->tlbs_dirty++; | 728 | vcpu->kvm->tlbs_dirty++; |
762 | continue; | 729 | continue; |
763 | } | 730 | } |
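The paging_tmpl.h hunks above change FNAME(fetch) from returning a shadow-pte pointer plus an *emulate out-parameter to returning the emulate decision directly, which FNAME(page_fault) then forwards as its own return value. A minimal stand-alone sketch of the new calling convention (function and parameter names here are illustrative, not the kernel's):

    #include <stdio.h>

    /* New-style contract: the mapping routine returns 1 when the caller
     * must emulate the faulting instruction (e.g. a write to a
     * write-protected guest page table), 0 otherwise. */
    static int fetch_mapping(int write_fault, int page_is_write_protected)
    {
        int emulate = 0;

        /* ... install shadow PTEs here ... */
        if (write_fault && page_is_write_protected)
            emulate = 1;

        return emulate;
    }

    /* The page-fault handler simply forwards that decision upward,
     * instead of juggling a returned pointer plus an out-parameter. */
    static int handle_page_fault(int write_fault, int write_protected)
    {
        return fetch_mapping(write_fault, write_protected);
    }

    int main(void)
    {
        printf("emulate = %d\n", handle_page_fault(1, 1));
        return 0;
    }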
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d017df3899ef..d29d3cd1c156 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | 21 | #include "kvm_cache_regs.h" |
22 | #include "x86.h" | 22 | #include "x86.h" |
23 | #include "cpuid.h" | ||
23 | 24 | ||
24 | #include <linux/module.h> | 25 | #include <linux/module.h> |
25 | #include <linux/mod_devicetable.h> | 26 | #include <linux/mod_devicetable.h> |
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage) | |||
630 | return -EBUSY; | 631 | return -EBUSY; |
631 | 632 | ||
632 | if (!has_svm()) { | 633 | if (!has_svm()) { |
633 | printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", | 634 | pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); |
634 | me); | ||
635 | return -EINVAL; | 635 | return -EINVAL; |
636 | } | 636 | } |
637 | sd = per_cpu(svm_data, me); | 637 | sd = per_cpu(svm_data, me); |
638 | |||
639 | if (!sd) { | 638 | if (!sd) { |
640 | printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", | 639 | pr_err("%s: svm_data is NULL on %d\n", __func__, me); |
641 | me); | ||
642 | return -EINVAL; | 640 | return -EINVAL; |
643 | } | 641 | } |
644 | 642 | ||
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) | |||
1012 | svm->tsc_ratio = ratio; | 1010 | svm->tsc_ratio = ratio; |
1013 | } | 1011 | } |
1014 | 1012 | ||
1013 | static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) | ||
1014 | { | ||
1015 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1016 | |||
1017 | return svm->vmcb->control.tsc_offset; | ||
1018 | } | ||
1019 | |||
1015 | static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | 1020 | static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1016 | { | 1021 | { |
1017 | struct vcpu_svm *svm = to_svm(vcpu); | 1022 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
1189 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | 1194 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
1190 | { | 1195 | { |
1191 | struct vcpu_svm *svm = to_svm(vcpu); | 1196 | struct vcpu_svm *svm = to_svm(vcpu); |
1197 | u32 dummy; | ||
1198 | u32 eax = 1; | ||
1192 | 1199 | ||
1193 | init_vmcb(svm); | 1200 | init_vmcb(svm); |
1194 | 1201 | ||
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
1197 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 1204 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
1198 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 1205 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
1199 | } | 1206 | } |
1200 | vcpu->arch.regs_avail = ~0; | 1207 | |
1201 | vcpu->arch.regs_dirty = ~0; | 1208 | kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); |
1209 | kvm_register_write(vcpu, VCPU_REGS_RDX, eax); | ||
1202 | 1210 | ||
1203 | return 0; | 1211 | return 0; |
1204 | } | 1212 | } |
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
1254 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | 1262 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; |
1255 | svm->asid_generation = 0; | 1263 | svm->asid_generation = 0; |
1256 | init_vmcb(svm); | 1264 | init_vmcb(svm); |
1257 | kvm_write_tsc(&svm->vcpu, 0); | ||
1258 | |||
1259 | err = fx_init(&svm->vcpu); | ||
1260 | if (err) | ||
1261 | goto free_page4; | ||
1262 | 1265 | ||
1263 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 1266 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
1264 | if (kvm_vcpu_is_bsp(&svm->vcpu)) | 1267 | if (kvm_vcpu_is_bsp(&svm->vcpu)) |
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
1268 | 1271 | ||
1269 | return &svm->vcpu; | 1272 | return &svm->vcpu; |
1270 | 1273 | ||
1271 | free_page4: | ||
1272 | __free_page(hsave_page); | ||
1273 | free_page3: | 1274 | free_page3: |
1274 | __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); | 1275 | __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); |
1275 | free_page2: | 1276 | free_page2: |
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm) | |||
3008 | return 0; | 3009 | return 0; |
3009 | } | 3010 | } |
3010 | 3011 | ||
3011 | u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) | 3012 | u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) |
3012 | { | 3013 | { |
3013 | struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); | 3014 | struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); |
3014 | return vmcb->control.tsc_offset + | 3015 | return vmcb->control.tsc_offset + |
3015 | svm_scale_tsc(vcpu, native_read_tsc()); | 3016 | svm_scale_tsc(vcpu, host_tsc); |
3016 | } | 3017 | } |
3017 | 3018 | ||
3018 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | 3019 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) |
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) | |||
3131 | return 0; | 3132 | return 0; |
3132 | } | 3133 | } |
3133 | 3134 | ||
3134 | static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | 3135 | static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) |
3135 | { | 3136 | { |
3136 | struct vcpu_svm *svm = to_svm(vcpu); | 3137 | struct vcpu_svm *svm = to_svm(vcpu); |
3137 | 3138 | ||
3139 | u32 ecx = msr->index; | ||
3140 | u64 data = msr->data; | ||
3138 | switch (ecx) { | 3141 | switch (ecx) { |
3139 | case MSR_IA32_TSC: | 3142 | case MSR_IA32_TSC: |
3140 | kvm_write_tsc(vcpu, data); | 3143 | kvm_write_tsc(vcpu, msr); |
3141 | break; | 3144 | break; |
3142 | case MSR_STAR: | 3145 | case MSR_STAR: |
3143 | svm->vmcb->save.star = data; | 3146 | svm->vmcb->save.star = data; |
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
3192 | vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); | 3195 | vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); |
3193 | break; | 3196 | break; |
3194 | default: | 3197 | default: |
3195 | return kvm_set_msr_common(vcpu, ecx, data); | 3198 | return kvm_set_msr_common(vcpu, msr); |
3196 | } | 3199 | } |
3197 | return 0; | 3200 | return 0; |
3198 | } | 3201 | } |
3199 | 3202 | ||
3200 | static int wrmsr_interception(struct vcpu_svm *svm) | 3203 | static int wrmsr_interception(struct vcpu_svm *svm) |
3201 | { | 3204 | { |
3205 | struct msr_data msr; | ||
3202 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 3206 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
3203 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | 3207 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
3204 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 3208 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
3205 | 3209 | ||
3210 | msr.data = data; | ||
3211 | msr.index = ecx; | ||
3212 | msr.host_initiated = false; | ||
3206 | 3213 | ||
3207 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; | 3214 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
3208 | if (svm_set_msr(&svm->vcpu, ecx, data)) { | 3215 | if (svm_set_msr(&svm->vcpu, &msr)) { |
3209 | trace_kvm_msr_write_ex(ecx, data); | 3216 | trace_kvm_msr_write_ex(ecx, data); |
3210 | kvm_inject_gp(&svm->vcpu, 0); | 3217 | kvm_inject_gp(&svm->vcpu, 0); |
3211 | } else { | 3218 | } else { |
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
4302 | .has_wbinvd_exit = svm_has_wbinvd_exit, | 4309 | .has_wbinvd_exit = svm_has_wbinvd_exit, |
4303 | 4310 | ||
4304 | .set_tsc_khz = svm_set_tsc_khz, | 4311 | .set_tsc_khz = svm_set_tsc_khz, |
4312 | .read_tsc_offset = svm_read_tsc_offset, | ||
4305 | .write_tsc_offset = svm_write_tsc_offset, | 4313 | .write_tsc_offset = svm_write_tsc_offset, |
4306 | .adjust_tsc_offset = svm_adjust_tsc_offset, | 4314 | .adjust_tsc_offset = svm_adjust_tsc_offset, |
4307 | .compute_tsc_offset = svm_compute_tsc_offset, | 4315 | .compute_tsc_offset = svm_compute_tsc_offset, |
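svm_set_msr() and wrmsr_interception() now pass a struct msr_data instead of a bare (index, data) pair, so common code can tell guest WRMSRs (host_initiated = false) apart from host-initiated writes. A small stand-alone sketch of that plumbing, with the struct layout inferred only from the fields used in the hunks above:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Field names follow the diff: index, data, host_initiated. */
    struct msr_data {
        bool host_initiated;
        uint32_t index;
        uint64_t data;
    };

    static int set_msr(const struct msr_data *msr)
    {
        /* A guest write to an MSR may be treated differently from a
         * host-initiated restore (e.g. IA32_TSC_ADJUST). */
        printf("wrmsr 0x%x <- 0x%llx (%s)\n", (unsigned)msr->index,
               (unsigned long long)msr->data,
               msr->host_initiated ? "host" : "guest");
        return 0;
    }

    /* Exit handler: assemble the descriptor from RCX/RAX:RDX, mark it
     * as guest-initiated, and hand it to the common setter. */
    static int wrmsr_exit(uint32_t ecx, uint32_t eax, uint32_t edx)
    {
        struct msr_data msr = {
            .host_initiated = false,
            .index = ecx,
            .data = ((uint64_t)edx << 32) | eax,
        };
        return set_msr(&msr);
    }

    int main(void)
    {
        return wrmsr_exit(0x3b /* IA32_TSC_ADJUST */, 0x1000, 0);
    }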
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bca63f04dccb..fe5e00ed7036 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/tracepoint.h> | 4 | #include <linux/tracepoint.h> |
5 | #include <asm/vmx.h> | 5 | #include <asm/vmx.h> |
6 | #include <asm/svm.h> | 6 | #include <asm/svm.h> |
7 | #include <asm/clocksource.h> | ||
7 | 8 | ||
8 | #undef TRACE_SYSTEM | 9 | #undef TRACE_SYSTEM |
9 | #define TRACE_SYSTEM kvm | 10 | #define TRACE_SYSTEM kvm |
@@ -754,6 +755,68 @@ TRACE_EVENT( | |||
754 | __entry->write ? "Write" : "Read", | 755 | __entry->write ? "Write" : "Read", |
755 | __entry->gpa_match ? "GPA" : "GVA") | 756 | __entry->gpa_match ? "GPA" : "GVA") |
756 | ); | 757 | ); |
758 | |||
759 | #ifdef CONFIG_X86_64 | ||
760 | |||
761 | #define host_clocks \ | ||
762 | {VCLOCK_NONE, "none"}, \ | ||
763 | {VCLOCK_TSC, "tsc"}, \ | ||
764 | {VCLOCK_HPET, "hpet"} \ | ||
765 | |||
766 | TRACE_EVENT(kvm_update_master_clock, | ||
767 | TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), | ||
768 | TP_ARGS(use_master_clock, host_clock, offset_matched), | ||
769 | |||
770 | TP_STRUCT__entry( | ||
771 | __field( bool, use_master_clock ) | ||
772 | __field( unsigned int, host_clock ) | ||
773 | __field( bool, offset_matched ) | ||
774 | ), | ||
775 | |||
776 | TP_fast_assign( | ||
777 | __entry->use_master_clock = use_master_clock; | ||
778 | __entry->host_clock = host_clock; | ||
779 | __entry->offset_matched = offset_matched; | ||
780 | ), | ||
781 | |||
782 | TP_printk("masterclock %d hostclock %s offsetmatched %u", | ||
783 | __entry->use_master_clock, | ||
784 | __print_symbolic(__entry->host_clock, host_clocks), | ||
785 | __entry->offset_matched) | ||
786 | ); | ||
787 | |||
788 | TRACE_EVENT(kvm_track_tsc, | ||
789 | TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, | ||
790 | unsigned int online_vcpus, bool use_master_clock, | ||
791 | unsigned int host_clock), | ||
792 | TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, | ||
793 | host_clock), | ||
794 | |||
795 | TP_STRUCT__entry( | ||
796 | __field( unsigned int, vcpu_id ) | ||
797 | __field( unsigned int, nr_vcpus_matched_tsc ) | ||
798 | __field( unsigned int, online_vcpus ) | ||
799 | __field( bool, use_master_clock ) | ||
800 | __field( unsigned int, host_clock ) | ||
801 | ), | ||
802 | |||
803 | TP_fast_assign( | ||
804 | __entry->vcpu_id = vcpu_id; | ||
805 | __entry->nr_vcpus_matched_tsc = nr_matched; | ||
806 | __entry->online_vcpus = online_vcpus; | ||
807 | __entry->use_master_clock = use_master_clock; | ||
808 | __entry->host_clock = host_clock; | ||
809 | ), | ||
810 | |||
811 | TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" | ||
812 | " hostclock %s", | ||
813 | __entry->vcpu_id, __entry->use_master_clock, | ||
814 | __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, | ||
815 | __print_symbolic(__entry->host_clock, host_clocks)) | ||
816 | ); | ||
817 | |||
818 | #endif /* CONFIG_X86_64 */ | ||
819 | |||
757 | #endif /* _TRACE_KVM_H */ | 820 | #endif /* _TRACE_KVM_H */ |
758 | 821 | ||
759 | #undef TRACE_INCLUDE_PATH | 822 | #undef TRACE_INCLUDE_PATH |
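The host_clocks table above lets the new tracepoints print the numeric vclock mode as a name via __print_symbolic(). The net effect is a value-to-string lookup; a hedged userspace equivalent (constants chosen to match the VCLOCK_* ordering referenced by the tracepoints):

    #include <stdio.h>

    /* Values mirror the VCLOCK_* modes used by the tracepoints. */
    enum { VCLOCK_NONE = 0, VCLOCK_TSC = 1, VCLOCK_HPET = 2 };

    static const char *host_clock_name(unsigned int mode)
    {
        switch (mode) {
        case VCLOCK_NONE: return "none";
        case VCLOCK_TSC:  return "tsc";
        case VCLOCK_HPET: return "hpet";
        default:          return "unknown";
        }
    }

    int main(void)
    {
        /* Roughly what TP_printk("... hostclock %s", ...) ends up emitting. */
        printf("masterclock 1 hostclock %s offsetmatched 1\n",
               host_clock_name(VCLOCK_TSC));
        return 0;
    }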
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f85815945fc6..9120ae1901e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <asm/i387.h> | 42 | #include <asm/i387.h> |
43 | #include <asm/xcr.h> | 43 | #include <asm/xcr.h> |
44 | #include <asm/perf_event.h> | 44 | #include <asm/perf_event.h> |
45 | #include <asm/kexec.h> | ||
45 | 46 | ||
46 | #include "trace.h" | 47 | #include "trace.h" |
47 | 48 | ||
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void) | |||
802 | return vmx_capability.ept & VMX_EPT_AD_BIT; | 803 | return vmx_capability.ept & VMX_EPT_AD_BIT; |
803 | } | 804 | } |
804 | 805 | ||
805 | static inline bool cpu_has_vmx_invept_individual_addr(void) | ||
806 | { | ||
807 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; | ||
808 | } | ||
809 | |||
810 | static inline bool cpu_has_vmx_invept_context(void) | 806 | static inline bool cpu_has_vmx_invept_context(void) |
811 | { | 807 | { |
812 | return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; | 808 | return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; |
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs) | |||
992 | vmcs, phys_addr); | 988 | vmcs, phys_addr); |
993 | } | 989 | } |
994 | 990 | ||
991 | #ifdef CONFIG_KEXEC | ||
992 | /* | ||
993 | * This bitmap indicates, per cpu, whether the vmclear | ||
994 | * operation is enabled. It is disabled on all cpus by | ||
995 | * default. | ||
996 | */ | ||
997 | static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; | ||
998 | |||
999 | static inline void crash_enable_local_vmclear(int cpu) | ||
1000 | { | ||
1001 | cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
1002 | } | ||
1003 | |||
1004 | static inline void crash_disable_local_vmclear(int cpu) | ||
1005 | { | ||
1006 | cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
1007 | } | ||
1008 | |||
1009 | static inline int crash_local_vmclear_enabled(int cpu) | ||
1010 | { | ||
1011 | return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
1012 | } | ||
1013 | |||
1014 | static void crash_vmclear_local_loaded_vmcss(void) | ||
1015 | { | ||
1016 | int cpu = raw_smp_processor_id(); | ||
1017 | struct loaded_vmcs *v; | ||
1018 | |||
1019 | if (!crash_local_vmclear_enabled(cpu)) | ||
1020 | return; | ||
1021 | |||
1022 | list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), | ||
1023 | loaded_vmcss_on_cpu_link) | ||
1024 | vmcs_clear(v->vmcs); | ||
1025 | } | ||
1026 | #else | ||
1027 | static inline void crash_enable_local_vmclear(int cpu) { } | ||
1028 | static inline void crash_disable_local_vmclear(int cpu) { } | ||
1029 | #endif /* CONFIG_KEXEC */ | ||
1030 | |||
995 | static void __loaded_vmcs_clear(void *arg) | 1031 | static void __loaded_vmcs_clear(void *arg) |
996 | { | 1032 | { |
997 | struct loaded_vmcs *loaded_vmcs = arg; | 1033 | struct loaded_vmcs *loaded_vmcs = arg; |
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg) | |||
1001 | return; /* vcpu migration can race with cpu offline */ | 1037 | return; /* vcpu migration can race with cpu offline */ |
1002 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) | 1038 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) |
1003 | per_cpu(current_vmcs, cpu) = NULL; | 1039 | per_cpu(current_vmcs, cpu) = NULL; |
1040 | crash_disable_local_vmclear(cpu); | ||
1004 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); | 1041 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); |
1042 | |||
1043 | /* | ||
1044 | * Ensure that loaded_vmcs->loaded_vmcss_on_cpu_link is updated | ||
1045 | * before loaded_vmcs->cpu is set to -1, which is done in | ||
1046 | * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 | ||
1047 | * first and add the vmcs to the per-cpu list before it is deleted. | ||
1048 | */ | ||
1049 | smp_wmb(); | ||
1050 | |||
1005 | loaded_vmcs_init(loaded_vmcs); | 1051 | loaded_vmcs_init(loaded_vmcs); |
1052 | crash_enable_local_vmclear(cpu); | ||
1006 | } | 1053 | } |
1007 | 1054 | ||
1008 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) | 1055 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) |
1009 | { | 1056 | { |
1010 | if (loaded_vmcs->cpu != -1) | 1057 | int cpu = loaded_vmcs->cpu; |
1011 | smp_call_function_single( | 1058 | |
1012 | loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); | 1059 | if (cpu != -1) |
1060 | smp_call_function_single(cpu, | ||
1061 | __loaded_vmcs_clear, loaded_vmcs, 1); | ||
1013 | } | 1062 | } |
1014 | 1063 | ||
1015 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) | 1064 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) |
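The smp_wmb() added above in __loaded_vmcs_clear() pairs with the smp_rmb() added to vmx_vcpu_load() in a later hunk: the clearing side must make the list removal visible before it resets loaded_vmcs->cpu, and the loading side must read cpu before it touches the per-cpu list. A stripped-down userspace sketch of that ordering using C11 fences (purely illustrative; the real code relies on kernel barrier primitives, list helpers, and IRQ-disabled sections):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct loaded_vmcs_sketch {
        atomic_int cpu;        /* -1 means "not loaded anywhere" */
        bool on_percpu_list;   /* stand-in for the list linkage */
    };

    /* Clearing side: unlink first, then publish cpu = -1. */
    static void clear_side(struct loaded_vmcs_sketch *v)
    {
        v->on_percpu_list = false;                  /* list_del(...) */
        atomic_thread_fence(memory_order_release);  /* smp_wmb() */
        atomic_store_explicit(&v->cpu, -1, memory_order_relaxed);
    }

    /* Loading side: observe cpu first, then look at the list state. */
    static void load_side(struct loaded_vmcs_sketch *v, int this_cpu)
    {
        int cpu = atomic_load_explicit(&v->cpu, memory_order_relaxed);

        atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
        if (cpu != this_cpu && !v->on_percpu_list)
            v->on_percpu_list = true;               /* list_add(...) */
    }

    int main(void)
    {
        struct loaded_vmcs_sketch v = { .cpu = 3, .on_percpu_list = true };

        clear_side(&v);
        load_side(&v, 0);
        return 0;
    }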
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp) | |||
1051 | } | 1100 | } |
1052 | } | 1101 | } |
1053 | 1102 | ||
1054 | static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | ||
1055 | { | ||
1056 | if (enable_ept) { | ||
1057 | if (cpu_has_vmx_invept_individual_addr()) | ||
1058 | __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, | ||
1059 | eptp, gpa); | ||
1060 | else | ||
1061 | ept_sync_context(eptp); | ||
1062 | } | ||
1063 | } | ||
1064 | |||
1065 | static __always_inline unsigned long vmcs_readl(unsigned long field) | 1103 | static __always_inline unsigned long vmcs_readl(unsigned long field) |
1066 | { | 1104 | { |
1067 | unsigned long value; | 1105 | unsigned long value; |
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
1535 | 1573 | ||
1536 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 1574 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
1537 | local_irq_disable(); | 1575 | local_irq_disable(); |
1576 | crash_disable_local_vmclear(cpu); | ||
1577 | |||
1578 | /* | ||
1579 | * The read of loaded_vmcs->cpu must happen before fetching | ||
1580 | * loaded_vmcs->loaded_vmcss_on_cpu_link. | ||
1581 | * See the comments in __loaded_vmcs_clear(). | ||
1582 | */ | ||
1583 | smp_rmb(); | ||
1584 | |||
1538 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, | 1585 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, |
1539 | &per_cpu(loaded_vmcss_on_cpu, cpu)); | 1586 | &per_cpu(loaded_vmcss_on_cpu, cpu)); |
1587 | crash_enable_local_vmclear(cpu); | ||
1540 | local_irq_enable(); | 1588 | local_irq_enable(); |
1541 | 1589 | ||
1542 | /* | 1590 | /* |
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void) | |||
1839 | * Like guest_read_tsc, but always returns L1's notion of the timestamp | 1887 | * Like guest_read_tsc, but always returns L1's notion of the timestamp |
1840 | * counter, even if a nested guest (L2) is currently running. | 1888 | * counter, even if a nested guest (L2) is currently running. |
1841 | */ | 1889 | */ |
1842 | u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) | 1890 | u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) |
1843 | { | 1891 | { |
1844 | u64 host_tsc, tsc_offset; | 1892 | u64 tsc_offset; |
1845 | 1893 | ||
1846 | rdtscll(host_tsc); | ||
1847 | tsc_offset = is_guest_mode(vcpu) ? | 1894 | tsc_offset = is_guest_mode(vcpu) ? |
1848 | to_vmx(vcpu)->nested.vmcs01_tsc_offset : | 1895 | to_vmx(vcpu)->nested.vmcs01_tsc_offset : |
1849 | vmcs_read64(TSC_OFFSET); | 1896 | vmcs_read64(TSC_OFFSET); |
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) | |||
1866 | WARN(1, "user requested TSC rate below hardware speed\n"); | 1913 | WARN(1, "user requested TSC rate below hardware speed\n"); |
1867 | } | 1914 | } |
1868 | 1915 | ||
1916 | static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) | ||
1917 | { | ||
1918 | return vmcs_read64(TSC_OFFSET); | ||
1919 | } | ||
1920 | |||
1869 | /* | 1921 | /* |
1870 | * writes 'offset' into guest's timestamp counter offset register | 1922 | * writes 'offset' into guest's timestamp counter offset register |
1871 | */ | 1923 | */ |
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2202 | * Returns 0 on success, non-0 otherwise. | 2254 | * Returns 0 on success, non-0 otherwise. |
2203 | * Assumes vcpu_load() was already called. | 2255 | * Assumes vcpu_load() was already called. |
2204 | */ | 2256 | */ |
2205 | static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | 2257 | static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
2206 | { | 2258 | { |
2207 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2259 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2208 | struct shared_msr_entry *msr; | 2260 | struct shared_msr_entry *msr; |
2209 | int ret = 0; | 2261 | int ret = 0; |
2262 | u32 msr_index = msr_info->index; | ||
2263 | u64 data = msr_info->data; | ||
2210 | 2264 | ||
2211 | switch (msr_index) { | 2265 | switch (msr_index) { |
2212 | case MSR_EFER: | 2266 | case MSR_EFER: |
2213 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 2267 | ret = kvm_set_msr_common(vcpu, msr_info); |
2214 | break; | 2268 | break; |
2215 | #ifdef CONFIG_X86_64 | 2269 | #ifdef CONFIG_X86_64 |
2216 | case MSR_FS_BASE: | 2270 | case MSR_FS_BASE: |
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
2236 | vmcs_writel(GUEST_SYSENTER_ESP, data); | 2290 | vmcs_writel(GUEST_SYSENTER_ESP, data); |
2237 | break; | 2291 | break; |
2238 | case MSR_IA32_TSC: | 2292 | case MSR_IA32_TSC: |
2239 | kvm_write_tsc(vcpu, data); | 2293 | kvm_write_tsc(vcpu, msr_info); |
2240 | break; | 2294 | break; |
2241 | case MSR_IA32_CR_PAT: | 2295 | case MSR_IA32_CR_PAT: |
2242 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 2296 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
2244 | vcpu->arch.pat = data; | 2298 | vcpu->arch.pat = data; |
2245 | break; | 2299 | break; |
2246 | } | 2300 | } |
2247 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 2301 | ret = kvm_set_msr_common(vcpu, msr_info); |
2302 | break; | ||
2303 | case MSR_IA32_TSC_ADJUST: | ||
2304 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
2248 | break; | 2305 | break; |
2249 | case MSR_TSC_AUX: | 2306 | case MSR_TSC_AUX: |
2250 | if (!vmx->rdtscp_enabled) | 2307 | if (!vmx->rdtscp_enabled) |
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
2267 | } | 2324 | } |
2268 | break; | 2325 | break; |
2269 | } | 2326 | } |
2270 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 2327 | ret = kvm_set_msr_common(vcpu, msr_info); |
2271 | } | 2328 | } |
2272 | 2329 | ||
2273 | return ret; | 2330 | return ret; |
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage) | |||
2341 | return -EBUSY; | 2398 | return -EBUSY; |
2342 | 2399 | ||
2343 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); | 2400 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); |
2401 | |||
2402 | /* | ||
2403 | * Now we can enable the vmclear operation in kdump | ||
2404 | * since the loaded_vmcss_on_cpu list on this cpu | ||
2405 | * has been initialized. | ||
2406 | * | ||
2407 | * Though the cpu is not in VMX operation yet, it is | ||
2408 | * safe to enable the vmclear operation here because | ||
2409 | * the loaded_vmcss_on_cpu list is still empty. | ||
2410 | */ | ||
2411 | crash_enable_local_vmclear(cpu); | ||
2412 | |||
2344 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 2413 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
2345 | 2414 | ||
2346 | test_bits = FEATURE_CONTROL_LOCKED; | 2415 | test_bits = FEATURE_CONTROL_LOCKED; |
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment | |||
2697 | if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { | 2766 | if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { |
2698 | tmp.base = vmcs_readl(sf->base); | 2767 | tmp.base = vmcs_readl(sf->base); |
2699 | tmp.selector = vmcs_read16(sf->selector); | 2768 | tmp.selector = vmcs_read16(sf->selector); |
2769 | tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; | ||
2700 | tmp.s = 1; | 2770 | tmp.s = 1; |
2701 | } | 2771 | } |
2702 | vmx_set_segment(vcpu, &tmp, seg); | 2772 | vmx_set_segment(vcpu, &tmp, seg); |
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3246 | * unrestricted guest, like Westmere, to an older host that doesn't have | 3316 | * unrestricted guest, like Westmere, to an older host that doesn't have
3247 | * unrestricted guest, like Nehalem. | 3317 | * unrestricted guest, like Nehalem.
3248 | */ | 3318 | */ |
3249 | if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { | 3319 | if (vmx->rmode.vm86_active) { |
3250 | switch (seg) { | 3320 | switch (seg) { |
3251 | case VCPU_SREG_CS: | 3321 | case VCPU_SREG_CS: |
3252 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | 3322 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); |
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
3897 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 3967 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
3898 | set_cr4_guest_host_mask(vmx); | 3968 | set_cr4_guest_host_mask(vmx); |
3899 | 3969 | ||
3900 | kvm_write_tsc(&vmx->vcpu, 0); | ||
3901 | |||
3902 | return 0; | 3970 | return 0; |
3903 | } | 3971 | } |
3904 | 3972 | ||
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
3908 | u64 msr; | 3976 | u64 msr; |
3909 | int ret; | 3977 | int ret; |
3910 | 3978 | ||
3911 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
3912 | |||
3913 | vmx->rmode.vm86_active = 0; | 3979 | vmx->rmode.vm86_active = 0; |
3914 | 3980 | ||
3915 | vmx->soft_vnmi_blocked = 0; | 3981 | vmx->soft_vnmi_blocked = 0; |
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
3921 | msr |= MSR_IA32_APICBASE_BSP; | 3987 | msr |= MSR_IA32_APICBASE_BSP; |
3922 | kvm_set_apic_base(&vmx->vcpu, msr); | 3988 | kvm_set_apic_base(&vmx->vcpu, msr); |
3923 | 3989 | ||
3924 | ret = fx_init(&vmx->vcpu); | ||
3925 | if (ret != 0) | ||
3926 | goto out; | ||
3927 | |||
3928 | vmx_segment_cache_clear(vmx); | 3990 | vmx_segment_cache_clear(vmx); |
3929 | 3991 | ||
3930 | seg_setup(VCPU_SREG_CS); | 3992 | seg_setup(VCPU_SREG_CS); |
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
3965 | kvm_rip_write(vcpu, 0xfff0); | 4027 | kvm_rip_write(vcpu, 0xfff0); |
3966 | else | 4028 | else |
3967 | kvm_rip_write(vcpu, 0); | 4029 | kvm_rip_write(vcpu, 0); |
3968 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); | ||
3969 | 4030 | ||
3970 | vmcs_writel(GUEST_GDTR_BASE, 0); | 4031 | vmcs_writel(GUEST_GDTR_BASE, 0); |
3971 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | 4032 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); |
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
4015 | /* HACK: Don't enable emulation on guest boot/reset */ | 4076 | /* HACK: Don't enable emulation on guest boot/reset */ |
4016 | vmx->emulation_required = 0; | 4077 | vmx->emulation_required = 0; |
4017 | 4078 | ||
4018 | out: | ||
4019 | return ret; | 4079 | return ret; |
4020 | } | 4080 | } |
4021 | 4081 | ||
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
4287 | if (is_machine_check(intr_info)) | 4347 | if (is_machine_check(intr_info)) |
4288 | return handle_machine_check(vcpu); | 4348 | return handle_machine_check(vcpu); |
4289 | 4349 | ||
4290 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
4291 | !is_page_fault(intr_info)) { | ||
4292 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
4293 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; | ||
4294 | vcpu->run->internal.ndata = 2; | ||
4295 | vcpu->run->internal.data[0] = vect_info; | ||
4296 | vcpu->run->internal.data[1] = intr_info; | ||
4297 | return 0; | ||
4298 | } | ||
4299 | |||
4300 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) | 4350 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) |
4301 | return 1; /* already handled by vmx_vcpu_run() */ | 4351 | return 1; /* already handled by vmx_vcpu_run() */ |
4302 | 4352 | ||
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
4315 | error_code = 0; | 4365 | error_code = 0; |
4316 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | 4366 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
4317 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 4367 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
4368 | |||
4369 | /* | ||
4370 | * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing | ||
4371 | * MMIO, so it is better to report an internal error. | ||
4372 | * See the comment in vmx_handle_exit(). | ||
4373 | */ | ||
4374 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
4375 | !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { | ||
4376 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
4377 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; | ||
4378 | vcpu->run->internal.ndata = 2; | ||
4379 | vcpu->run->internal.data[0] = vect_info; | ||
4380 | vcpu->run->internal.data[1] = intr_info; | ||
4381 | return 0; | ||
4382 | } | ||
4383 | |||
4318 | if (is_page_fault(intr_info)) { | 4384 | if (is_page_fault(intr_info)) { |
4319 | /* EPT won't cause page fault directly */ | 4385 | /* EPT won't cause page fault directly */ |
4320 | BUG_ON(enable_ept); | 4386 | BUG_ON(enable_ept); |
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) | |||
4626 | 4692 | ||
4627 | static int handle_wrmsr(struct kvm_vcpu *vcpu) | 4693 | static int handle_wrmsr(struct kvm_vcpu *vcpu) |
4628 | { | 4694 | { |
4695 | struct msr_data msr; | ||
4629 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | 4696 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; |
4630 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | 4697 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
4631 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 4698 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
4632 | 4699 | ||
4633 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 4700 | msr.data = data; |
4701 | msr.index = ecx; | ||
4702 | msr.host_initiated = false; | ||
4703 | if (vmx_set_msr(vcpu, &msr) != 0) { | ||
4634 | trace_kvm_msr_write_ex(ecx, data); | 4704 | trace_kvm_msr_write_ex(ecx, data); |
4635 | kvm_inject_gp(vcpu, 0); | 4705 | kvm_inject_gp(vcpu, 0); |
4636 | return 1; | 4706 | return 1; |
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
4827 | 4897 | ||
4828 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 4898 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
4829 | 4899 | ||
4830 | if (exit_qualification & (1 << 6)) { | ||
4831 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); | ||
4832 | return -EINVAL; | ||
4833 | } | ||
4834 | |||
4835 | gla_validity = (exit_qualification >> 7) & 0x3; | 4900 | gla_validity = (exit_qualification >> 7) & 0x3; |
4836 | if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { | 4901 | if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { |
4837 | printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); | 4902 | printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); |
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
5979 | return 0; | 6044 | return 0; |
5980 | } | 6045 | } |
5981 | 6046 | ||
6047 | /* | ||
6048 | * Note: | ||
6049 | * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by | ||
6050 | * an event delivery, since that indicates the guest is accessing MMIO. | ||
6051 | * The vm-exit would be triggered again after returning to the guest, | ||
6052 | * causing an infinite loop. | ||
6053 | */ | ||
5982 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | 6054 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
5983 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && | 6055 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && |
5984 | exit_reason != EXIT_REASON_EPT_VIOLATION && | 6056 | exit_reason != EXIT_REASON_EPT_VIOLATION && |
5985 | exit_reason != EXIT_REASON_TASK_SWITCH)) | 6057 | exit_reason != EXIT_REASON_TASK_SWITCH)) { |
5986 | printk(KERN_WARNING "%s: unexpected, valid vectoring info " | 6058 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
5987 | "(0x%x) and exit reason is 0x%x\n", | 6059 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; |
5988 | __func__, vectoring_info, exit_reason); | 6060 | vcpu->run->internal.ndata = 2; |
6061 | vcpu->run->internal.data[0] = vectoring_info; | ||
6062 | vcpu->run->internal.data[1] = exit_reason; | ||
6063 | return 0; | ||
6064 | } | ||
5989 | 6065 | ||
5990 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && | 6066 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && |
5991 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( | 6067 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( |
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
7309 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | 7385 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, |
7310 | 7386 | ||
7311 | .set_tsc_khz = vmx_set_tsc_khz, | 7387 | .set_tsc_khz = vmx_set_tsc_khz, |
7388 | .read_tsc_offset = vmx_read_tsc_offset, | ||
7312 | .write_tsc_offset = vmx_write_tsc_offset, | 7389 | .write_tsc_offset = vmx_write_tsc_offset, |
7313 | .adjust_tsc_offset = vmx_adjust_tsc_offset, | 7390 | .adjust_tsc_offset = vmx_adjust_tsc_offset, |
7314 | .compute_tsc_offset = vmx_compute_tsc_offset, | 7391 | .compute_tsc_offset = vmx_compute_tsc_offset, |
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void) | |||
7367 | if (r) | 7444 | if (r) |
7368 | goto out3; | 7445 | goto out3; |
7369 | 7446 | ||
7447 | #ifdef CONFIG_KEXEC | ||
7448 | rcu_assign_pointer(crash_vmclear_loaded_vmcss, | ||
7449 | crash_vmclear_local_loaded_vmcss); | ||
7450 | #endif | ||
7451 | |||
7370 | vmx_disable_intercept_for_msr(MSR_FS_BASE, false); | 7452 | vmx_disable_intercept_for_msr(MSR_FS_BASE, false); |
7371 | vmx_disable_intercept_for_msr(MSR_GS_BASE, false); | 7453 | vmx_disable_intercept_for_msr(MSR_GS_BASE, false); |
7372 | vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); | 7454 | vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); |
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void) | |||
7404 | free_page((unsigned long)vmx_io_bitmap_b); | 7486 | free_page((unsigned long)vmx_io_bitmap_b); |
7405 | free_page((unsigned long)vmx_io_bitmap_a); | 7487 | free_page((unsigned long)vmx_io_bitmap_a); |
7406 | 7488 | ||
7489 | #ifdef CONFIG_KEXEC | ||
7490 | rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); | ||
7491 | synchronize_rcu(); | ||
7492 | #endif | ||
7493 | |||
7407 | kvm_exit(); | 7494 | kvm_exit(); |
7408 | } | 7495 | } |
7409 | 7496 | ||
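vmx_init()/vmx_exit() above publish and retract the kdump callback with rcu_assign_pointer() and synchronize_rcu(), so a crashing CPU either sees a fully initialised callback or NULL, never a half-unloaded module. A hedged userspace approximation of that publish / unpublish-then-wait pattern (the kernel uses real RCU; C11 release/acquire atomics stand in for it here, and all names are illustrative):

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef void (*crash_cb_t)(void);

    /* Shared slot read from the crash path. */
    static _Atomic(crash_cb_t) crash_vmclear_cb;

    static void vmclear_all_loaded_vmcss(void) { puts("vmclear on this cpu"); }

    static void module_init_sketch(void)
    {
        /* rcu_assign_pointer(): publish only after the callee is ready. */
        atomic_store_explicit(&crash_vmclear_cb, vmclear_all_loaded_vmcss,
                              memory_order_release);
    }

    static void module_exit_sketch(void)
    {
        /* Retract, then wait until no reader can still be using it
         * (the kernel does this with synchronize_rcu()). */
        atomic_store_explicit(&crash_vmclear_cb, NULL, memory_order_release);
        /* ... wait for concurrent crash-path readers here ... */
    }

    static void crash_path_sketch(void)
    {
        crash_cb_t cb = atomic_load_explicit(&crash_vmclear_cb,
                                             memory_order_acquire);
        if (cb)
            cb();
    }

    int main(void)
    {
        module_init_sketch();
        crash_path_sketch();
        module_exit_sketch();
        return 0;
    }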
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f7641756be2..76f54461f7cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -46,6 +46,8 @@ | |||
46 | #include <linux/uaccess.h> | 46 | #include <linux/uaccess.h> |
47 | #include <linux/hash.h> | 47 | #include <linux/hash.h> |
48 | #include <linux/pci.h> | 48 | #include <linux/pci.h> |
49 | #include <linux/timekeeper_internal.h> | ||
50 | #include <linux/pvclock_gtod.h> | ||
49 | #include <trace/events/kvm.h> | 51 | #include <trace/events/kvm.h> |
50 | 52 | ||
51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
158 | 160 | ||
159 | u64 __read_mostly host_xcr0; | 161 | u64 __read_mostly host_xcr0; |
160 | 162 | ||
161 | int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); | 163 | static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); |
164 | |||
165 | static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); | ||
162 | 166 | ||
163 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) | 167 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) |
164 | { | 168 | { |
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
633 | } | 637 | } |
634 | 638 | ||
635 | if (is_long_mode(vcpu)) { | 639 | if (is_long_mode(vcpu)) { |
636 | if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { | 640 | if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { |
637 | if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) | 641 | if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) |
638 | return 1; | 642 | return 1; |
639 | } else | 643 | } else |
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = { | |||
827 | static unsigned num_msrs_to_save; | 831 | static unsigned num_msrs_to_save; |
828 | 832 | ||
829 | static const u32 emulated_msrs[] = { | 833 | static const u32 emulated_msrs[] = { |
834 | MSR_IA32_TSC_ADJUST, | ||
830 | MSR_IA32_TSCDEADLINE, | 835 | MSR_IA32_TSCDEADLINE, |
831 | MSR_IA32_MISC_ENABLE, | 836 | MSR_IA32_MISC_ENABLE, |
832 | MSR_IA32_MCG_STATUS, | 837 | MSR_IA32_MCG_STATUS, |
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); | |||
886 | * Returns 0 on success, non-0 otherwise. | 891 | * Returns 0 on success, non-0 otherwise. |
887 | * Assumes vcpu_load() was already called. | 892 | * Assumes vcpu_load() was already called. |
888 | */ | 893 | */ |
889 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | 894 | int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) |
890 | { | 895 | { |
891 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); | 896 | return kvm_x86_ops->set_msr(vcpu, msr); |
892 | } | 897 | } |
893 | 898 | ||
894 | /* | 899 | /* |
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
896 | */ | 901 | */ |
897 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | 902 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
898 | { | 903 | { |
899 | return kvm_set_msr(vcpu, index, *data); | 904 | struct msr_data msr; |
905 | |||
906 | msr.data = *data; | ||
907 | msr.index = index; | ||
908 | msr.host_initiated = true; | ||
909 | return kvm_set_msr(vcpu, &msr); | ||
900 | } | 910 | } |
901 | 911 | ||
912 | #ifdef CONFIG_X86_64 | ||
913 | struct pvclock_gtod_data { | ||
914 | seqcount_t seq; | ||
915 | |||
916 | struct { /* extract of a clocksource struct */ | ||
917 | int vclock_mode; | ||
918 | cycle_t cycle_last; | ||
919 | cycle_t mask; | ||
920 | u32 mult; | ||
921 | u32 shift; | ||
922 | } clock; | ||
923 | |||
924 | /* open coded 'struct timespec' */ | ||
925 | u64 monotonic_time_snsec; | ||
926 | time_t monotonic_time_sec; | ||
927 | }; | ||
928 | |||
929 | static struct pvclock_gtod_data pvclock_gtod_data; | ||
930 | |||
931 | static void update_pvclock_gtod(struct timekeeper *tk) | ||
932 | { | ||
933 | struct pvclock_gtod_data *vdata = &pvclock_gtod_data; | ||
934 | |||
935 | write_seqcount_begin(&vdata->seq); | ||
936 | |||
937 | /* copy pvclock gtod data */ | ||
938 | vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; | ||
939 | vdata->clock.cycle_last = tk->clock->cycle_last; | ||
940 | vdata->clock.mask = tk->clock->mask; | ||
941 | vdata->clock.mult = tk->mult; | ||
942 | vdata->clock.shift = tk->shift; | ||
943 | |||
944 | vdata->monotonic_time_sec = tk->xtime_sec | ||
945 | + tk->wall_to_monotonic.tv_sec; | ||
946 | vdata->monotonic_time_snsec = tk->xtime_nsec | ||
947 | + (tk->wall_to_monotonic.tv_nsec | ||
948 | << tk->shift); | ||
949 | while (vdata->monotonic_time_snsec >= | ||
950 | (((u64)NSEC_PER_SEC) << tk->shift)) { | ||
951 | vdata->monotonic_time_snsec -= | ||
952 | ((u64)NSEC_PER_SEC) << tk->shift; | ||
953 | vdata->monotonic_time_sec++; | ||
954 | } | ||
955 | |||
956 | write_seqcount_end(&vdata->seq); | ||
957 | } | ||
958 | #endif | ||
959 | |||
960 | |||
902 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | 961 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) |
903 | { | 962 | { |
904 | int version; | 963 | int version; |
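update_pvclock_gtod() above keeps the monotonic time split into whole seconds plus shifted nanoseconds (xtime_nsec is stored left-shifted by tk->shift), so the while loop at its end re-normalises the pair after adding wall_to_monotonic. A tiny stand-alone sketch of that normalisation step:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* seconds + (nanoseconds << shift), as the timekeeper stores them */
    struct split_time {
        int64_t  sec;
        uint64_t snsec;  /* shifted nanoseconds */
    };

    /* Carry whole seconds out of the shifted-nanosecond field, exactly
     * like the while loop in update_pvclock_gtod(). */
    static void normalize(struct split_time *t, unsigned int shift)
    {
        while (t->snsec >= (NSEC_PER_SEC << shift)) {
            t->snsec -= NSEC_PER_SEC << shift;
            t->sec++;
        }
    }

    int main(void)
    {
        unsigned int shift = 8;
        struct split_time t = {
            .sec = 100,
            /* 2.5 seconds worth of shifted nanoseconds */
            .snsec = (5 * NSEC_PER_SEC / 2) << shift,
        };

        normalize(&t, shift);
        printf("sec=%lld snsec=%llu\n",
               (long long)t.sec, (unsigned long long)t.snsec);
        return 0;
    }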
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void) | |||
995 | return timespec_to_ns(&ts); | 1054 | return timespec_to_ns(&ts); |
996 | } | 1055 | } |
997 | 1056 | ||
1057 | #ifdef CONFIG_X86_64 | ||
1058 | static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); | ||
1059 | #endif | ||
1060 | |||
998 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 1061 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
999 | unsigned long max_tsc_khz; | 1062 | unsigned long max_tsc_khz; |
1000 | 1063 | ||
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | |||
1046 | return tsc; | 1109 | return tsc; |
1047 | } | 1110 | } |
1048 | 1111 | ||
1049 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | 1112 | void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) |
1113 | { | ||
1114 | #ifdef CONFIG_X86_64 | ||
1115 | bool vcpus_matched; | ||
1116 | bool do_request = false; | ||
1117 | struct kvm_arch *ka = &vcpu->kvm->arch; | ||
1118 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; | ||
1119 | |||
1120 | vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == | ||
1121 | atomic_read(&vcpu->kvm->online_vcpus)); | ||
1122 | |||
1123 | if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) | ||
1124 | if (!ka->use_master_clock) | ||
1125 | do_request = 1; | ||
1126 | |||
1127 | if (!vcpus_matched && ka->use_master_clock) | ||
1128 | do_request = 1; | ||
1129 | |||
1130 | if (do_request) | ||
1131 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); | ||
1132 | |||
1133 | trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, | ||
1134 | atomic_read(&vcpu->kvm->online_vcpus), | ||
1135 | ka->use_master_clock, gtod->clock.vclock_mode); | ||
1136 | #endif | ||
1137 | } | ||
1138 | |||
1139 | static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) | ||
1140 | { | ||
1141 | u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); | ||
1142 | vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; | ||
1143 | } | ||
1144 | |||
1145 | void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) | ||
1050 | { | 1146 | { |
1051 | struct kvm *kvm = vcpu->kvm; | 1147 | struct kvm *kvm = vcpu->kvm; |
1052 | u64 offset, ns, elapsed; | 1148 | u64 offset, ns, elapsed; |
1053 | unsigned long flags; | 1149 | unsigned long flags; |
1054 | s64 usdiff; | 1150 | s64 usdiff; |
1151 | bool matched; | ||
1152 | u64 data = msr->data; | ||
1055 | 1153 | ||
1056 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); | 1154 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
1057 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | 1155 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); |
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1094 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | 1192 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); |
1095 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); | 1193 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); |
1096 | } | 1194 | } |
1195 | matched = true; | ||
1097 | } else { | 1196 | } else { |
1098 | /* | 1197 | /* |
1099 | * We split periods of matched TSC writes into generations. | 1198 | * We split periods of matched TSC writes into generations. |
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1108 | kvm->arch.cur_tsc_nsec = ns; | 1207 | kvm->arch.cur_tsc_nsec = ns; |
1109 | kvm->arch.cur_tsc_write = data; | 1208 | kvm->arch.cur_tsc_write = data; |
1110 | kvm->arch.cur_tsc_offset = offset; | 1209 | kvm->arch.cur_tsc_offset = offset; |
1210 | matched = false; | ||
1111 | pr_debug("kvm: new tsc generation %u, clock %llu\n", | 1211 | pr_debug("kvm: new tsc generation %u, clock %llu\n", |
1112 | kvm->arch.cur_tsc_generation, data); | 1212 | kvm->arch.cur_tsc_generation, data); |
1113 | } | 1213 | } |
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1129 | vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; | 1229 | vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; |
1130 | vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; | 1230 | vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; |
1131 | 1231 | ||
1232 | if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) | ||
1233 | update_ia32_tsc_adjust_msr(vcpu, offset); | ||
1132 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | 1234 | kvm_x86_ops->write_tsc_offset(vcpu, offset); |
1133 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | 1235 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
1236 | |||
1237 | spin_lock(&kvm->arch.pvclock_gtod_sync_lock); | ||
1238 | if (matched) | ||
1239 | kvm->arch.nr_vcpus_matched_tsc++; | ||
1240 | else | ||
1241 | kvm->arch.nr_vcpus_matched_tsc = 0; | ||
1242 | |||
1243 | kvm_track_tsc_matching(vcpu); | ||
1244 | spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); | ||
1134 | } | 1245 | } |
1135 | 1246 | ||
1136 | EXPORT_SYMBOL_GPL(kvm_write_tsc); | 1247 | EXPORT_SYMBOL_GPL(kvm_write_tsc); |
1137 | 1248 | ||
1249 | #ifdef CONFIG_X86_64 | ||
1250 | |||
1251 | static cycle_t read_tsc(void) | ||
1252 | { | ||
1253 | cycle_t ret; | ||
1254 | u64 last; | ||
1255 | |||
1256 | /* | ||
1257 | * Empirically, a fence (of type that depends on the CPU) | ||
1258 | * before rdtsc is enough to ensure that rdtsc is ordered | ||
1259 | * with respect to loads. The various CPU manuals are unclear | ||
1260 | * as to whether rdtsc can be reordered with later loads, | ||
1261 | * but no one has ever seen it happen. | ||
1262 | */ | ||
1263 | rdtsc_barrier(); | ||
1264 | ret = (cycle_t)vget_cycles(); | ||
1265 | |||
1266 | last = pvclock_gtod_data.clock.cycle_last; | ||
1267 | |||
1268 | if (likely(ret >= last)) | ||
1269 | return ret; | ||
1270 | |||
1271 | /* | ||
1272 | * GCC likes to generate cmov here, but this branch is extremely | ||
1273 | * predictable (it's just a function of time and the likely is | ||
1274 | * very likely) and there's a data dependence, so force GCC | ||
1275 | * to generate a branch instead. I don't barrier() because | ||
1276 | * we don't actually need a barrier, and if this function | ||
1277 | * ever gets inlined it will generate worse code. | ||
1278 | */ | ||
1279 | asm volatile (""); | ||
1280 | return last; | ||
1281 | } | ||
1282 | |||
1283 | static inline u64 vgettsc(cycle_t *cycle_now) | ||
1284 | { | ||
1285 | long v; | ||
1286 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; | ||
1287 | |||
1288 | *cycle_now = read_tsc(); | ||
1289 | |||
1290 | v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; | ||
1291 | return v * gtod->clock.mult; | ||
1292 | } | ||
1293 | |||
1294 | static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) | ||
1295 | { | ||
1296 | unsigned long seq; | ||
1297 | u64 ns; | ||
1298 | int mode; | ||
1299 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; | ||
1300 | |||
1301 | ts->tv_nsec = 0; | ||
1302 | do { | ||
1303 | seq = read_seqcount_begin(&gtod->seq); | ||
1304 | mode = gtod->clock.vclock_mode; | ||
1305 | ts->tv_sec = gtod->monotonic_time_sec; | ||
1306 | ns = gtod->monotonic_time_snsec; | ||
1307 | ns += vgettsc(cycle_now); | ||
1308 | ns >>= gtod->clock.shift; | ||
1309 | } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); | ||
1310 | timespec_add_ns(ts, ns); | ||
1311 | |||
1312 | return mode; | ||
1313 | } | ||
1314 | |||
1315 | /* returns true if host is using tsc clocksource */ | ||
1316 | static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) | ||
1317 | { | ||
1318 | struct timespec ts; | ||
1319 | |||
1320 | /* checked again under seqlock below */ | ||
1321 | if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) | ||
1322 | return false; | ||
1323 | |||
1324 | if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) | ||
1325 | return false; | ||
1326 | |||
1327 | monotonic_to_bootbased(&ts); | ||
1328 | *kernel_ns = timespec_to_ns(&ts); | ||
1329 | |||
1330 | return true; | ||
1331 | } | ||
1332 | #endif | ||
1333 | |||
1334 | /* | ||
1335 | * | ||
1336 | * Assuming a stable TSC across physical CPUs, and a stable TSC | ||
1337 | * across virtual CPUs, the following condition is possible. | ||
1338 | * Each numbered line represents an event visible to both | ||
1339 | * CPUs at the next numbered event. | ||
1340 | * | ||
1341 | * "timespecX" represents host monotonic time. "tscX" represents | ||
1342 | * RDTSC value. | ||
1343 | * | ||
1344 | * VCPU0 on CPU0 | VCPU1 on CPU1 | ||
1345 | * | ||
1346 | * 1. read timespec0,tsc0 | ||
1347 | * 2. | timespec1 = timespec0 + N | ||
1348 | * | tsc1 = tsc0 + M | ||
1349 | * 3. transition to guest | transition to guest | ||
1350 | * 4. ret0 = timespec0 + (rdtsc - tsc0) | | ||
1351 | * 5. | ret1 = timespec1 + (rdtsc - tsc1) | ||
1352 | * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) | ||
1353 | * | ||
1354 | * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: | ||
1355 | * | ||
1356 | * - ret0 < ret1 | ||
1357 | * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) | ||
1358 | * ... | ||
1359 | * - 0 < N - M => M < N | ||
1360 | * | ||
1361 | * That is, when timespec0 != timespec1, M < N. Unfortunately that is not | ||
1362 | * always the case (the difference between two distinct xtime instances | ||
1363 | * might be smaller than the difference between corresponding TSC reads, | ||
1364 | * when updating guest vcpus' pvclock areas). | ||
1365 | * | ||
1366 | * To avoid that problem, do not allow visibility of distinct | ||
1367 | * system_timestamp/tsc_timestamp values simultaneously: use a master | ||
1368 | * copy of host monotonic time values. Update that master copy | ||
1369 | * in lockstep. | ||
1370 | * | ||
1371 | * Rely on synchronization of host TSCs and guest TSCs for monotonicity. | ||
1372 | * | ||
1373 | */ | ||
1374 | |||
1375 | static void pvclock_update_vm_gtod_copy(struct kvm *kvm) | ||
1376 | { | ||
1377 | #ifdef CONFIG_X86_64 | ||
1378 | struct kvm_arch *ka = &kvm->arch; | ||
1379 | int vclock_mode; | ||
1380 | bool host_tsc_clocksource, vcpus_matched; | ||
1381 | |||
1382 | vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == | ||
1383 | atomic_read(&kvm->online_vcpus)); | ||
1384 | |||
1385 | /* | ||
1386 | * If the host uses TSC clock, then passthrough TSC as stable | ||
1387 | * to the guest. | ||
1388 | */ | ||
1389 | host_tsc_clocksource = kvm_get_time_and_clockread( | ||
1390 | &ka->master_kernel_ns, | ||
1391 | &ka->master_cycle_now); | ||
1392 | |||
1393 | ka->use_master_clock = host_tsc_clocksource & vcpus_matched; | ||
1394 | |||
1395 | if (ka->use_master_clock) | ||
1396 | atomic_set(&kvm_guest_has_master_clock, 1); | ||
1397 | |||
1398 | vclock_mode = pvclock_gtod_data.clock.vclock_mode; | ||
1399 | trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, | ||
1400 | vcpus_matched); | ||
1401 | #endif | ||
1402 | } | ||
1403 | |||
1138 | static int kvm_guest_time_update(struct kvm_vcpu *v) | 1404 | static int kvm_guest_time_update(struct kvm_vcpu *v) |
1139 | { | 1405 | { |
1140 | unsigned long flags; | 1406 | unsigned long flags, this_tsc_khz; |
1141 | struct kvm_vcpu_arch *vcpu = &v->arch; | 1407 | struct kvm_vcpu_arch *vcpu = &v->arch; |
1408 | struct kvm_arch *ka = &v->kvm->arch; | ||
1142 | void *shared_kaddr; | 1409 | void *shared_kaddr; |
1143 | unsigned long this_tsc_khz; | ||
1144 | s64 kernel_ns, max_kernel_ns; | 1410 | s64 kernel_ns, max_kernel_ns; |
1145 | u64 tsc_timestamp; | 1411 | u64 tsc_timestamp, host_tsc; |
1412 | struct pvclock_vcpu_time_info *guest_hv_clock; | ||
1146 | u8 pvclock_flags; | 1413 | u8 pvclock_flags; |
1414 | bool use_master_clock; | ||
1415 | |||
1416 | kernel_ns = 0; | ||
1417 | host_tsc = 0; | ||
1147 | 1418 | ||
1148 | /* Keep irq disabled to prevent changes to the clock */ | 1419 | /* Keep irq disabled to prevent changes to the clock */ |
1149 | local_irq_save(flags); | 1420 | local_irq_save(flags); |
1150 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); | ||
1151 | kernel_ns = get_kernel_ns(); | ||
1152 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); | 1421 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); |
1153 | if (unlikely(this_tsc_khz == 0)) { | 1422 | if (unlikely(this_tsc_khz == 0)) { |
1154 | local_irq_restore(flags); | 1423 | local_irq_restore(flags); |
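do_monotonic() above converts a raw TSC delta into nanoseconds with the clocksource's mult/shift pair, inside a seqcount retry loop so a concurrent update_pvclock_gtod() forces a re-read. A simplified single-threaded sketch of that read path (the seqcount is reduced to a plain generation counter, with no real concurrency handling; names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Snapshot of the clocksource parameters, as copied by the
     * update side in update_pvclock_gtod(). */
    struct gtod_copy {
        unsigned int seq;           /* even = stable, odd = update in progress */
        uint64_t cycle_last;
        uint64_t mask;
        uint32_t mult, shift;
        int64_t  monotonic_sec;
        uint64_t monotonic_snsec;   /* shifted nanoseconds */
    };

    static uint64_t cycles_to_shifted_ns(const struct gtod_copy *g, uint64_t now)
    {
        /* (tsc - cycle_last) & mask, scaled by mult; still << shift. */
        return ((now - g->cycle_last) & g->mask) * g->mult;
    }

    static void read_monotonic(const struct gtod_copy *g, uint64_t tsc_now,
                               int64_t *sec, uint64_t *nsec)
    {
        unsigned int seq;
        uint64_t ns;

        do {
            seq = g->seq;                     /* read_seqcount_begin() */
            *sec = g->monotonic_sec;
            ns = g->monotonic_snsec + cycles_to_shifted_ns(g, tsc_now);
            ns >>= g->shift;
        } while (seq != g->seq || (seq & 1)); /* read_seqcount_retry() */

        *nsec = ns;
    }

    int main(void)
    {
        struct gtod_copy g = {
            .seq = 2, .cycle_last = 1000, .mask = ~0ULL,
            .mult = 5, .shift = 1, .monotonic_sec = 42, .monotonic_snsec = 0,
        };
        int64_t sec; uint64_t nsec;

        read_monotonic(&g, 1400, &sec, &nsec);  /* 400 cycles * 5 >> 1 = 1000ns */
        printf("%lld.%09llu\n", (long long)sec, (unsigned long long)nsec);
        return 0;
    }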
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1157 | } | 1426 | } |
1158 | 1427 | ||
1159 | /* | 1428 | /* |
1429 | * If the host uses TSC clock, then passthrough TSC as stable | ||
1430 | * to the guest. | ||
1431 | */ | ||
1432 | spin_lock(&ka->pvclock_gtod_sync_lock); | ||
1433 | use_master_clock = ka->use_master_clock; | ||
1434 | if (use_master_clock) { | ||
1435 | host_tsc = ka->master_cycle_now; | ||
1436 | kernel_ns = ka->master_kernel_ns; | ||
1437 | } | ||
1438 | spin_unlock(&ka->pvclock_gtod_sync_lock); | ||
1439 | if (!use_master_clock) { | ||
1440 | host_tsc = native_read_tsc(); | ||
1441 | kernel_ns = get_kernel_ns(); | ||
1442 | } | ||
1443 | |||
1444 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); | ||
1445 | |||
1446 | /* | ||
1160 | * We may have to catch up the TSC to match elapsed wall clock | 1447 | * We may have to catch up the TSC to match elapsed wall clock |
1161 | * time for two reasons, even if kvmclock is used. | 1448 | * time for two reasons, even if kvmclock is used. |
1162 | * 1) CPU could have been running below the maximum TSC rate | 1449 | * 1) CPU could have been running below the maximum TSC rate |
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1217 | vcpu->hw_tsc_khz = this_tsc_khz; | 1504 | vcpu->hw_tsc_khz = this_tsc_khz; |
1218 | } | 1505 | } |
1219 | 1506 | ||
1220 | if (max_kernel_ns > kernel_ns) | 1507 | /* with a master <monotonic time, tsc value> tuple, |
1221 | kernel_ns = max_kernel_ns; | 1508 | * pvclock clock reads always increase at the (scaled) rate |
1222 | 1509 | * of guest TSC - no need to deal with sampling errors. | |
1510 | */ | ||
1511 | if (!use_master_clock) { | ||
1512 | if (max_kernel_ns > kernel_ns) | ||
1513 | kernel_ns = max_kernel_ns; | ||
1514 | } | ||
1223 | /* With all the info we got, fill in the values */ | 1515 | /* With all the info we got, fill in the values */ |
1224 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; | 1516 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; |
1225 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; | 1517 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; |
1226 | vcpu->last_kernel_ns = kernel_ns; | 1518 | vcpu->last_kernel_ns = kernel_ns; |
1227 | vcpu->last_guest_tsc = tsc_timestamp; | 1519 | vcpu->last_guest_tsc = tsc_timestamp; |
1228 | 1520 | ||
1229 | pvclock_flags = 0; | ||
1230 | if (vcpu->pvclock_set_guest_stopped_request) { | ||
1231 | pvclock_flags |= PVCLOCK_GUEST_STOPPED; | ||
1232 | vcpu->pvclock_set_guest_stopped_request = false; | ||
1233 | } | ||
1234 | |||
1235 | vcpu->hv_clock.flags = pvclock_flags; | ||
1236 | |||
1237 | /* | 1521 | /* |
1238 | * The interface expects us to write an even number signaling that the | 1522 | * The interface expects us to write an even number signaling that the |
1239 | * update is finished. Since the guest won't see the intermediate | 1523 | * update is finished. Since the guest won't see the intermediate |
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1243 | 1527 | ||
1244 | shared_kaddr = kmap_atomic(vcpu->time_page); | 1528 | shared_kaddr = kmap_atomic(vcpu->time_page); |
1245 | 1529 | ||
1530 | guest_hv_clock = shared_kaddr + vcpu->time_offset; | ||
1531 | |||
1532 | /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ | ||
1533 | pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); | ||
1534 | |||
1535 | if (vcpu->pvclock_set_guest_stopped_request) { | ||
1536 | pvclock_flags |= PVCLOCK_GUEST_STOPPED; | ||
1537 | vcpu->pvclock_set_guest_stopped_request = false; | ||
1538 | } | ||
1539 | |||
1540 | /* If the host uses TSC clocksource, then it is stable */ | ||
1541 | if (use_master_clock) | ||
1542 | pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; | ||
1543 | |||
1544 | vcpu->hv_clock.flags = pvclock_flags; | ||
1545 | |||
1246 | memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, | 1546 | memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, |
1247 | sizeof(vcpu->hv_clock)); | 1547 | sizeof(vcpu->hv_clock)); |
1248 | 1548 | ||
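The hunk above fills the per-vCPU hv_clock copy and fixes up its flags before publishing it to the guest. For orientation, here is a minimal userspace-style sketch of the guest read side, assuming the canonical pvclock_vcpu_time_info layout; the helper names (pvclock_read, scale_delta, rdtsc_) are invented for this sketch, and the version field is the even/odd handshake that the update sequence above relies on.

#include <stdint.h>

struct pvclock_vcpu_time_info {
        uint32_t version;
        uint32_t pad0;
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
        uint8_t  flags;
        uint8_t  pad[2];
};

static inline uint64_t rdtsc_(void)
{
        uint32_t lo, hi;
        __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
}

/* delta * mul_frac / 2^32, pre-shifted by tsc_shift (GCC __int128 on x86-64) */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

/* Guest time in nanoseconds; retries while the host is mid-update. */
static uint64_t pvclock_read(volatile struct pvclock_vcpu_time_info *ti,
                             uint8_t *flags_out)
{
        uint32_t version;
        uint64_t ns;

        do {
                version = ti->version;
                __asm__ volatile("" ::: "memory");  /* read fields after version */
                ns = ti->system_time +
                     scale_delta(rdtsc_() - ti->tsc_timestamp,
                                 ti->tsc_to_system_mul, ti->tsc_shift);
                *flags_out = ti->flags;
                __asm__ volatile("" ::: "memory");
        } while ((version & 1) || version != ti->version);

        return ns;
}

When PVCLOCK_TSC_STABLE_BIT is clear, readers cannot assume per-vCPU copies are mutually monotonic and must fall back to a global "last value" clamp (or, in the vDSO path below, to a syscall); setting the bit when the host runs off the TSC clocksource is what removes that cost.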
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) | |||
1572 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); | 1872 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); |
1573 | } | 1873 | } |
1574 | 1874 | ||
1575 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1875 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
1576 | { | 1876 | { |
1577 | bool pr = false; | 1877 | bool pr = false; |
1878 | u32 msr = msr_info->index; | ||
1879 | u64 data = msr_info->data; | ||
1578 | 1880 | ||
1579 | switch (msr) { | 1881 | switch (msr) { |
1580 | case MSR_EFER: | 1882 | case MSR_EFER: |
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1625 | case MSR_IA32_TSCDEADLINE: | 1927 | case MSR_IA32_TSCDEADLINE: |
1626 | kvm_set_lapic_tscdeadline_msr(vcpu, data); | 1928 | kvm_set_lapic_tscdeadline_msr(vcpu, data); |
1627 | break; | 1929 | break; |
1930 | case MSR_IA32_TSC_ADJUST: | ||
1931 | if (guest_cpuid_has_tsc_adjust(vcpu)) { | ||
1932 | if (!msr_info->host_initiated) { | ||
1933 | u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; | ||
1934 | kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); | ||
1935 | } | ||
1936 | vcpu->arch.ia32_tsc_adjust_msr = data; | ||
1937 | } | ||
1938 | break; | ||
1628 | case MSR_IA32_MISC_ENABLE: | 1939 | case MSR_IA32_MISC_ENABLE: |
1629 | vcpu->arch.ia32_misc_enable_msr = data; | 1940 | vcpu->arch.ia32_misc_enable_msr = data; |
1630 | break; | 1941 | break; |
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1984 | case MSR_IA32_TSCDEADLINE: | 2295 | case MSR_IA32_TSCDEADLINE: |
1985 | data = kvm_get_lapic_tscdeadline_msr(vcpu); | 2296 | data = kvm_get_lapic_tscdeadline_msr(vcpu); |
1986 | break; | 2297 | break; |
2298 | case MSR_IA32_TSC_ADJUST: | ||
2299 | data = (u64)vcpu->arch.ia32_tsc_adjust_msr; | ||
2300 | break; | ||
1987 | case MSR_IA32_MISC_ENABLE: | 2301 | case MSR_IA32_MISC_ENABLE: |
1988 | data = vcpu->arch.ia32_misc_enable_msr; | 2302 | data = vcpu->arch.ia32_misc_enable_msr; |
1989 | break; | 2303 | break; |
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2342 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | 2656 | kvm_x86_ops->write_tsc_offset(vcpu, offset); |
2343 | vcpu->arch.tsc_catchup = 1; | 2657 | vcpu->arch.tsc_catchup = 1; |
2344 | } | 2658 | } |
2345 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2659 | /* |
2660 | * On a host with synchronized TSC, there is no need to update | ||
2661 | * kvmclock on vcpu->cpu migration | ||
2662 | */ | ||
2663 | if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) | ||
2664 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
2346 | if (vcpu->cpu != cpu) | 2665 | if (vcpu->cpu != cpu) |
2347 | kvm_migrate_timers(vcpu); | 2666 | kvm_migrate_timers(vcpu); |
2348 | vcpu->cpu = cpu; | 2667 | vcpu->cpu = cpu; |
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2691 | if (!vcpu->arch.apic) | 3010 | if (!vcpu->arch.apic) |
2692 | goto out; | 3011 | goto out; |
2693 | u.lapic = memdup_user(argp, sizeof(*u.lapic)); | 3012 | u.lapic = memdup_user(argp, sizeof(*u.lapic)); |
2694 | if (IS_ERR(u.lapic)) { | 3013 | if (IS_ERR(u.lapic)) |
2695 | r = PTR_ERR(u.lapic); | 3014 | return PTR_ERR(u.lapic); |
2696 | goto out; | ||
2697 | } | ||
2698 | 3015 | ||
2699 | r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); | 3016 | r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); |
2700 | if (r) | ||
2701 | goto out; | ||
2702 | r = 0; | ||
2703 | break; | 3017 | break; |
2704 | } | 3018 | } |
2705 | case KVM_INTERRUPT: { | 3019 | case KVM_INTERRUPT: { |
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2709 | if (copy_from_user(&irq, argp, sizeof irq)) | 3023 | if (copy_from_user(&irq, argp, sizeof irq)) |
2710 | goto out; | 3024 | goto out; |
2711 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | 3025 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); |
2712 | if (r) | ||
2713 | goto out; | ||
2714 | r = 0; | ||
2715 | break; | 3026 | break; |
2716 | } | 3027 | } |
2717 | case KVM_NMI: { | 3028 | case KVM_NMI: { |
2718 | r = kvm_vcpu_ioctl_nmi(vcpu); | 3029 | r = kvm_vcpu_ioctl_nmi(vcpu); |
2719 | if (r) | ||
2720 | goto out; | ||
2721 | r = 0; | ||
2722 | break; | 3030 | break; |
2723 | } | 3031 | } |
2724 | case KVM_SET_CPUID: { | 3032 | case KVM_SET_CPUID: { |
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2729 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | 3037 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) |
2730 | goto out; | 3038 | goto out; |
2731 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | 3039 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); |
2732 | if (r) | ||
2733 | goto out; | ||
2734 | break; | 3040 | break; |
2735 | } | 3041 | } |
2736 | case KVM_SET_CPUID2: { | 3042 | case KVM_SET_CPUID2: { |
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2742 | goto out; | 3048 | goto out; |
2743 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, | 3049 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, |
2744 | cpuid_arg->entries); | 3050 | cpuid_arg->entries); |
2745 | if (r) | ||
2746 | goto out; | ||
2747 | break; | 3051 | break; |
2748 | } | 3052 | } |
2749 | case KVM_GET_CPUID2: { | 3053 | case KVM_GET_CPUID2: { |
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2875 | } | 3179 | } |
2876 | case KVM_SET_XSAVE: { | 3180 | case KVM_SET_XSAVE: { |
2877 | u.xsave = memdup_user(argp, sizeof(*u.xsave)); | 3181 | u.xsave = memdup_user(argp, sizeof(*u.xsave)); |
2878 | if (IS_ERR(u.xsave)) { | 3182 | if (IS_ERR(u.xsave)) |
2879 | r = PTR_ERR(u.xsave); | 3183 | return PTR_ERR(u.xsave); |
2880 | goto out; | ||
2881 | } | ||
2882 | 3184 | ||
2883 | r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); | 3185 | r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); |
2884 | break; | 3186 | break; |
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2900 | } | 3202 | } |
2901 | case KVM_SET_XCRS: { | 3203 | case KVM_SET_XCRS: { |
2902 | u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); | 3204 | u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); |
2903 | if (IS_ERR(u.xcrs)) { | 3205 | if (IS_ERR(u.xcrs)) |
2904 | r = PTR_ERR(u.xcrs); | 3206 | return PTR_ERR(u.xcrs); |
2905 | goto out; | ||
2906 | } | ||
2907 | 3207 | ||
2908 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); | 3208 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); |
2909 | break; | 3209 | break; |
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | |||
2951 | int ret; | 3251 | int ret; |
2952 | 3252 | ||
2953 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) | 3253 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) |
2954 | return -1; | 3254 | return -EINVAL; |
2955 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); | 3255 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); |
2956 | return ret; | 3256 | return ret; |
2957 | } | 3257 | } |
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3212 | switch (ioctl) { | 3512 | switch (ioctl) { |
3213 | case KVM_SET_TSS_ADDR: | 3513 | case KVM_SET_TSS_ADDR: |
3214 | r = kvm_vm_ioctl_set_tss_addr(kvm, arg); | 3514 | r = kvm_vm_ioctl_set_tss_addr(kvm, arg); |
3215 | if (r < 0) | ||
3216 | goto out; | ||
3217 | break; | 3515 | break; |
3218 | case KVM_SET_IDENTITY_MAP_ADDR: { | 3516 | case KVM_SET_IDENTITY_MAP_ADDR: { |
3219 | u64 ident_addr; | 3517 | u64 ident_addr; |
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3222 | if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) | 3520 | if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) |
3223 | goto out; | 3521 | goto out; |
3224 | r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); | 3522 | r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); |
3225 | if (r < 0) | ||
3226 | goto out; | ||
3227 | break; | 3523 | break; |
3228 | } | 3524 | } |
3229 | case KVM_SET_NR_MMU_PAGES: | 3525 | case KVM_SET_NR_MMU_PAGES: |
3230 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); | 3526 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); |
3231 | if (r) | ||
3232 | goto out; | ||
3233 | break; | 3527 | break; |
3234 | case KVM_GET_NR_MMU_PAGES: | 3528 | case KVM_GET_NR_MMU_PAGES: |
3235 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | 3529 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); |
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3320 | r = 0; | 3614 | r = 0; |
3321 | get_irqchip_out: | 3615 | get_irqchip_out: |
3322 | kfree(chip); | 3616 | kfree(chip); |
3323 | if (r) | ||
3324 | goto out; | ||
3325 | break; | 3617 | break; |
3326 | } | 3618 | } |
3327 | case KVM_SET_IRQCHIP: { | 3619 | case KVM_SET_IRQCHIP: { |
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3343 | r = 0; | 3635 | r = 0; |
3344 | set_irqchip_out: | 3636 | set_irqchip_out: |
3345 | kfree(chip); | 3637 | kfree(chip); |
3346 | if (r) | ||
3347 | goto out; | ||
3348 | break; | 3638 | break; |
3349 | } | 3639 | } |
3350 | case KVM_GET_PIT: { | 3640 | case KVM_GET_PIT: { |
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3371 | if (!kvm->arch.vpit) | 3661 | if (!kvm->arch.vpit) |
3372 | goto out; | 3662 | goto out; |
3373 | r = kvm_vm_ioctl_set_pit(kvm, &u.ps); | 3663 | r = kvm_vm_ioctl_set_pit(kvm, &u.ps); |
3374 | if (r) | ||
3375 | goto out; | ||
3376 | r = 0; | ||
3377 | break; | 3664 | break; |
3378 | } | 3665 | } |
3379 | case KVM_GET_PIT2: { | 3666 | case KVM_GET_PIT2: { |
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3397 | if (!kvm->arch.vpit) | 3684 | if (!kvm->arch.vpit) |
3398 | goto out; | 3685 | goto out; |
3399 | r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); | 3686 | r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); |
3400 | if (r) | ||
3401 | goto out; | ||
3402 | r = 0; | ||
3403 | break; | 3687 | break; |
3404 | } | 3688 | } |
3405 | case KVM_REINJECT_CONTROL: { | 3689 | case KVM_REINJECT_CONTROL: { |
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3408 | if (copy_from_user(&control, argp, sizeof(control))) | 3692 | if (copy_from_user(&control, argp, sizeof(control))) |
3409 | goto out; | 3693 | goto out; |
3410 | r = kvm_vm_ioctl_reinject(kvm, &control); | 3694 | r = kvm_vm_ioctl_reinject(kvm, &control); |
3411 | if (r) | ||
3412 | goto out; | ||
3413 | r = 0; | ||
3414 | break; | 3695 | break; |
3415 | } | 3696 | } |
3416 | case KVM_XEN_HVM_CONFIG: { | 3697 | case KVM_XEN_HVM_CONFIG: { |
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, | |||
4273 | static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, | 4554 | static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, |
4274 | u32 msr_index, u64 data) | 4555 | u32 msr_index, u64 data) |
4275 | { | 4556 | { |
4276 | return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); | 4557 | struct msr_data msr; |
4558 | |||
4559 | msr.data = data; | ||
4560 | msr.index = msr_index; | ||
4561 | msr.host_initiated = false; | ||
4562 | return kvm_set_msr(emul_to_vcpu(ctxt), &msr); | ||
4277 | } | 4563 | } |
4278 | 4564 | ||
4279 | static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, | 4565 | static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, |
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4495 | * instruction -> ... | 4781 | * instruction -> ... |
4496 | */ | 4782 | */ |
4497 | pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); | 4783 | pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); |
4498 | if (!is_error_pfn(pfn)) { | 4784 | if (!is_error_noslot_pfn(pfn)) { |
4499 | kvm_release_pfn_clean(pfn); | 4785 | kvm_release_pfn_clean(pfn); |
4500 | return true; | 4786 | return true; |
4501 | } | 4787 | } |
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void) | |||
4881 | kvm_mmu_set_mmio_spte_mask(mask); | 5167 | kvm_mmu_set_mmio_spte_mask(mask); |
4882 | } | 5168 | } |
4883 | 5169 | ||
5170 | #ifdef CONFIG_X86_64 | ||
5171 | static void pvclock_gtod_update_fn(struct work_struct *work) | ||
5172 | { | ||
5173 | struct kvm *kvm; | ||
5174 | |||
5175 | struct kvm_vcpu *vcpu; | ||
5176 | int i; | ||
5177 | |||
5178 | raw_spin_lock(&kvm_lock); | ||
5179 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
5180 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5181 | set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); | ||
5182 | atomic_set(&kvm_guest_has_master_clock, 0); | ||
5183 | raw_spin_unlock(&kvm_lock); | ||
5184 | } | ||
5185 | |||
5186 | static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); | ||
5187 | |||
5188 | /* | ||
5189 | * Notification about pvclock gtod data update. | ||
5190 | */ | ||
5191 | static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, | ||
5192 | void *priv) | ||
5193 | { | ||
5194 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; | ||
5195 | struct timekeeper *tk = priv; | ||
5196 | |||
5197 | update_pvclock_gtod(tk); | ||
5198 | |||
5199 | /* disable master clock if host does not trust, or does not | ||
5200 | * use, TSC clocksource | ||
5201 | */ | ||
5202 | if (gtod->clock.vclock_mode != VCLOCK_TSC && | ||
5203 | atomic_read(&kvm_guest_has_master_clock) != 0) | ||
5204 | queue_work(system_long_wq, &pvclock_gtod_work); | ||
5205 | |||
5206 | return 0; | ||
5207 | } | ||
5208 | |||
5209 | static struct notifier_block pvclock_gtod_notifier = { | ||
5210 | .notifier_call = pvclock_gtod_notify, | ||
5211 | }; | ||
5212 | #endif | ||
5213 | |||
4884 | int kvm_arch_init(void *opaque) | 5214 | int kvm_arch_init(void *opaque) |
4885 | { | 5215 | { |
4886 | int r; | 5216 | int r; |
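The notifier added above runs on every timekeeping update, so it only inspects the new vclock mode and defers the heavy "kick every VM and vCPU" pass to system_long_wq. The same shape in isolation, as a hedged kernel-style sketch with invented names; registration against a notifier chain is omitted.

#include <linux/workqueue.h>
#include <linux/notifier.h>

static void my_expensive_update(struct work_struct *work)
{
        /* walk global lists, take locks, request per-vCPU updates */
}
static DECLARE_WORK(my_update_work, my_expensive_update);

static int my_notify(struct notifier_block *nb, unsigned long action, void *data)
{
        /* cheap check only; push the real work out of the hot path */
        queue_work(system_long_wq, &my_update_work);
        return NOTIFY_DONE;
}

static struct notifier_block my_nb = {
        .notifier_call = my_notify,   /* registered elsewhere */
};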
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque) | |||
4922 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | 5252 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); |
4923 | 5253 | ||
4924 | kvm_lapic_init(); | 5254 | kvm_lapic_init(); |
5255 | #ifdef CONFIG_X86_64 | ||
5256 | pvclock_gtod_register_notifier(&pvclock_gtod_notifier); | ||
5257 | #endif | ||
5258 | |||
4925 | return 0; | 5259 | return 0; |
4926 | 5260 | ||
4927 | out: | 5261 | out: |
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void) | |||
4936 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, | 5270 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, |
4937 | CPUFREQ_TRANSITION_NOTIFIER); | 5271 | CPUFREQ_TRANSITION_NOTIFIER); |
4938 | unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); | 5272 | unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); |
5273 | #ifdef CONFIG_X86_64 | ||
5274 | pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); | ||
5275 | #endif | ||
4939 | kvm_x86_ops = NULL; | 5276 | kvm_x86_ops = NULL; |
4940 | kvm_mmu_module_exit(); | 5277 | kvm_mmu_module_exit(); |
4941 | } | 5278 | } |
@@ -5059,7 +5396,7 @@ out: | |||
5059 | } | 5396 | } |
5060 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | 5397 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); |
5061 | 5398 | ||
5062 | int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) | 5399 | static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) |
5063 | { | 5400 | { |
5064 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 5401 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
5065 | char instruction[3]; | 5402 | char instruction[3]; |
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu) | |||
5235 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5572 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5236 | } | 5573 | } |
5237 | 5574 | ||
5575 | static void kvm_gen_update_masterclock(struct kvm *kvm) | ||
5576 | { | ||
5577 | #ifdef CONFIG_X86_64 | ||
5578 | int i; | ||
5579 | struct kvm_vcpu *vcpu; | ||
5580 | struct kvm_arch *ka = &kvm->arch; | ||
5581 | |||
5582 | spin_lock(&ka->pvclock_gtod_sync_lock); | ||
5583 | kvm_make_mclock_inprogress_request(kvm); | ||
5584 | /* no guest entries from this point */ | ||
5585 | pvclock_update_vm_gtod_copy(kvm); | ||
5586 | |||
5587 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5588 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
5589 | |||
5590 | /* guest entries allowed */ | ||
5591 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5592 | clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); | ||
5593 | |||
5594 | spin_unlock(&ka->pvclock_gtod_sync_lock); | ||
5595 | #endif | ||
5596 | } | ||
5597 | |||
5238 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | 5598 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
5239 | { | 5599 | { |
5240 | int r; | 5600 | int r; |
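kvm_gen_update_masterclock() above drives vCPUs with request bits: MCLOCK_INPROGRESS holds guest entries off while the master <kernel_ns, tsc> copy is rewritten, then CLOCK_UPDATE makes each vCPU refresh its kvmclock area at the top of vcpu_enter_guest(). A simplified userspace analogy of that request/check protocol follows; names and bit numbers are invented for the example, and this is not KVM code.

#include <stdatomic.h>
#include <stdio.h>

#define REQ_CLOCK_UPDATE      0UL
#define REQ_MCLOCK_INPROGRESS 1UL

struct toy_vcpu {
        atomic_ulong requests;
};

static void make_request(struct toy_vcpu *v, unsigned long req)
{
        atomic_fetch_or(&v->requests, 1UL << req);
}

/* test-and-clear, like kvm_check_request() */
static int check_request(struct toy_vcpu *v, unsigned long req)
{
        unsigned long mask = 1UL << req;

        if (!(atomic_load(&v->requests) & mask))
                return 0;
        return (atomic_fetch_and(&v->requests, ~mask) & mask) != 0;
}

int main(void)
{
        struct toy_vcpu vcpu[4];

        for (int i = 0; i < 4; i++)
                atomic_init(&vcpu[i].requests, 0);

        /* coordinator: hold off entries, refresh the master copy,
         * then ask every vcpu to reload its clock */
        for (int i = 0; i < 4; i++)
                make_request(&vcpu[i], REQ_MCLOCK_INPROGRESS);
        /* ... rewrite the shared <kernel_ns, tsc> tuple here ... */
        for (int i = 0; i < 4; i++) {
                make_request(&vcpu[i], REQ_CLOCK_UPDATE);
                atomic_fetch_and(&vcpu[i].requests,
                                 ~(1UL << REQ_MCLOCK_INPROGRESS));
        }

        /* each vcpu, just before "entering the guest" */
        for (int i = 0; i < 4; i++)
                if (check_request(&vcpu[i], REQ_CLOCK_UPDATE))
                        printf("vcpu %d refreshes its kvmclock area\n", i);
        return 0;
}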
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5247 | kvm_mmu_unload(vcpu); | 5607 | kvm_mmu_unload(vcpu); |
5248 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) | 5608 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) |
5249 | __kvm_migrate_timers(vcpu); | 5609 | __kvm_migrate_timers(vcpu); |
5610 | if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) | ||
5611 | kvm_gen_update_masterclock(vcpu->kvm); | ||
5250 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { | 5612 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { |
5251 | r = kvm_guest_time_update(vcpu); | 5613 | r = kvm_guest_time_update(vcpu); |
5252 | if (unlikely(r)) | 5614 | if (unlikely(r)) |
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5362 | if (hw_breakpoint_active()) | 5724 | if (hw_breakpoint_active()) |
5363 | hw_breakpoint_restore(); | 5725 | hw_breakpoint_restore(); |
5364 | 5726 | ||
5365 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 5727 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, |
5728 | native_read_tsc()); | ||
5366 | 5729 | ||
5367 | vcpu->mode = OUTSIDE_GUEST_MODE; | 5730 | vcpu->mode = OUTSIDE_GUEST_MODE; |
5368 | smp_wmb(); | 5731 | smp_wmb(); |
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5419 | pr_debug("vcpu %d received sipi with vector # %x\n", | 5782 | pr_debug("vcpu %d received sipi with vector # %x\n", |
5420 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | 5783 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
5421 | kvm_lapic_reset(vcpu); | 5784 | kvm_lapic_reset(vcpu); |
5422 | r = kvm_arch_vcpu_reset(vcpu); | 5785 | r = kvm_vcpu_reset(vcpu); |
5423 | if (r) | 5786 | if (r) |
5424 | return r; | 5787 | return r; |
5425 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5788 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6047 | r = vcpu_load(vcpu); | 6410 | r = vcpu_load(vcpu); |
6048 | if (r) | 6411 | if (r) |
6049 | return r; | 6412 | return r; |
6050 | r = kvm_arch_vcpu_reset(vcpu); | 6413 | r = kvm_vcpu_reset(vcpu); |
6051 | if (r == 0) | 6414 | if (r == 0) |
6052 | r = kvm_mmu_setup(vcpu); | 6415 | r = kvm_mmu_setup(vcpu); |
6053 | vcpu_put(vcpu); | 6416 | vcpu_put(vcpu); |
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6055 | return r; | 6418 | return r; |
6056 | } | 6419 | } |
6057 | 6420 | ||
6421 | int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) | ||
6422 | { | ||
6423 | int r; | ||
6424 | struct msr_data msr; | ||
6425 | |||
6426 | r = vcpu_load(vcpu); | ||
6427 | if (r) | ||
6428 | return r; | ||
6429 | msr.data = 0x0; | ||
6430 | msr.index = MSR_IA32_TSC; | ||
6431 | msr.host_initiated = true; | ||
6432 | kvm_write_tsc(vcpu, &msr); | ||
6433 | vcpu_put(vcpu); | ||
6434 | |||
6435 | return r; | ||
6436 | } | ||
6437 | |||
6058 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 6438 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
6059 | { | 6439 | { |
6060 | int r; | 6440 | int r; |
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | |||
6069 | kvm_x86_ops->vcpu_free(vcpu); | 6449 | kvm_x86_ops->vcpu_free(vcpu); |
6070 | } | 6450 | } |
6071 | 6451 | ||
6072 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | 6452 | static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) |
6073 | { | 6453 | { |
6074 | atomic_set(&vcpu->arch.nmi_queued, 0); | 6454 | atomic_set(&vcpu->arch.nmi_queued, 0); |
6075 | vcpu->arch.nmi_pending = 0; | 6455 | vcpu->arch.nmi_pending = 0; |
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
6092 | 6472 | ||
6093 | kvm_pmu_reset(vcpu); | 6473 | kvm_pmu_reset(vcpu); |
6094 | 6474 | ||
6475 | memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); | ||
6476 | vcpu->arch.regs_avail = ~0; | ||
6477 | vcpu->arch.regs_dirty = ~0; | ||
6478 | |||
6095 | return kvm_x86_ops->vcpu_reset(vcpu); | 6479 | return kvm_x86_ops->vcpu_reset(vcpu); |
6096 | } | 6480 | } |
6097 | 6481 | ||
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage) | |||
6168 | kvm_for_each_vcpu(i, vcpu, kvm) { | 6552 | kvm_for_each_vcpu(i, vcpu, kvm) { |
6169 | vcpu->arch.tsc_offset_adjustment += delta_cyc; | 6553 | vcpu->arch.tsc_offset_adjustment += delta_cyc; |
6170 | vcpu->arch.last_host_tsc = local_tsc; | 6554 | vcpu->arch.last_host_tsc = local_tsc; |
6555 | set_bit(KVM_REQ_MASTERCLOCK_UPDATE, | ||
6556 | &vcpu->requests); | ||
6171 | } | 6557 | } |
6172 | 6558 | ||
6173 | /* | 6559 | /* |
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6258 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) | 6644 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) |
6259 | goto fail_free_mce_banks; | 6645 | goto fail_free_mce_banks; |
6260 | 6646 | ||
6647 | r = fx_init(vcpu); | ||
6648 | if (r) | ||
6649 | goto fail_free_wbinvd_dirty_mask; | ||
6650 | |||
6651 | vcpu->arch.ia32_tsc_adjust_msr = 0x0; | ||
6261 | kvm_async_pf_hash_reset(vcpu); | 6652 | kvm_async_pf_hash_reset(vcpu); |
6262 | kvm_pmu_init(vcpu); | 6653 | kvm_pmu_init(vcpu); |
6263 | 6654 | ||
6264 | return 0; | 6655 | return 0; |
6656 | fail_free_wbinvd_dirty_mask: | ||
6657 | free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); | ||
6265 | fail_free_mce_banks: | 6658 | fail_free_mce_banks: |
6266 | kfree(vcpu->arch.mce_banks); | 6659 | kfree(vcpu->arch.mce_banks); |
6267 | fail_free_lapic: | 6660 | fail_free_lapic: |
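The fx_init() failure handling added above extends the usual goto-unwind ladder: each label releases exactly what was acquired before the failing step, and later steps jump to the deepest applicable label. A generic, self-contained sketch of the idiom, with invented names:

#include <stdlib.h>

struct ctx { void *a, *b, *c; };

static int ctx_init(struct ctx *ctx)
{
        ctx->a = malloc(64);
        if (!ctx->a)
                goto fail;
        ctx->b = malloc(64);
        if (!ctx->b)
                goto fail_free_a;
        ctx->c = malloc(64);
        if (!ctx->c)
                goto fail_free_b;
        return 0;

fail_free_b:
        free(ctx->b);
fail_free_a:
        free(ctx->a);
fail:
        return -1;
}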
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | |||
6305 | 6698 | ||
6306 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); | 6699 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); |
6307 | mutex_init(&kvm->arch.apic_map_lock); | 6700 | mutex_init(&kvm->arch.apic_map_lock); |
6701 | spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); | ||
6702 | |||
6703 | pvclock_update_vm_gtod_copy(kvm); | ||
6308 | 6704 | ||
6309 | return 0; | 6705 | return 0; |
6310 | } | 6706 | } |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2b5219c12ac8..e224f7a671b6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | |||
112 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 112 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
113 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); | 113 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); |
114 | 114 | ||
115 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); | 115 | void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); |
116 | 116 | ||
117 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | 117 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, |
118 | gva_t addr, void *val, unsigned int bytes, | 118 | gva_t addr, void *val, unsigned int bytes, |
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 4df6c373421a..205ad328aa52 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/hpet.h> | 22 | #include <asm/hpet.h> |
23 | #include <asm/unistd.h> | 23 | #include <asm/unistd.h> |
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | #include <asm/pvclock.h> | ||
25 | 26 | ||
26 | #define gtod (&VVAR(vsyscall_gtod_data)) | 27 | #define gtod (&VVAR(vsyscall_gtod_data)) |
27 | 28 | ||
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void) | |||
62 | return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); | 63 | return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); |
63 | } | 64 | } |
64 | 65 | ||
66 | #ifdef CONFIG_PARAVIRT_CLOCK | ||
67 | |||
68 | static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) | ||
69 | { | ||
70 | const struct pvclock_vsyscall_time_info *pvti_base; | ||
71 | int idx = cpu / (PAGE_SIZE/PVTI_SIZE); | ||
72 | int offset = cpu % (PAGE_SIZE/PVTI_SIZE); | ||
73 | |||
74 | BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); | ||
75 | |||
76 | pvti_base = (struct pvclock_vsyscall_time_info *) | ||
77 | __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); | ||
78 | |||
79 | return &pvti_base[offset]; | ||
80 | } | ||
81 | |||
82 | static notrace cycle_t vread_pvclock(int *mode) | ||
83 | { | ||
84 | const struct pvclock_vsyscall_time_info *pvti; | ||
85 | cycle_t ret; | ||
86 | u64 last; | ||
87 | u32 version; | ||
88 | u32 migrate_count; | ||
89 | u8 flags; | ||
90 | unsigned cpu, cpu1; | ||
91 | |||
92 | |||
93 | /* | ||
94 | * When looping to get a consistent (time-info, tsc) pair, we | ||
95 | * also need to deal with the possibility we can switch vcpus, | ||
96 | * so make sure we always re-fetch time-info for the current vcpu. | ||
97 | */ | ||
98 | do { | ||
99 | cpu = __getcpu() & VGETCPU_CPU_MASK; | ||
100 | /* TODO: We can put vcpu id into higher bits of pvti.version. | ||
101 | * This will save a couple of cycles by getting rid of | ||
102 | * __getcpu() calls (Gleb). | ||
103 | */ | ||
104 | |||
105 | pvti = get_pvti(cpu); | ||
106 | |||
107 | migrate_count = pvti->migrate_count; | ||
108 | |||
109 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); | ||
110 | |||
111 | /* | ||
112 | * Test we're still on the cpu as well as the version. | ||
113 | * We could have been migrated just after the first | ||
114 | * vgetcpu but before fetching the version, so we | ||
115 | * wouldn't notice a version change. | ||
116 | */ | ||
117 | cpu1 = __getcpu() & VGETCPU_CPU_MASK; | ||
118 | } while (unlikely(cpu != cpu1 || | ||
119 | (pvti->pvti.version & 1) || | ||
120 | pvti->pvti.version != version || | ||
121 | pvti->migrate_count != migrate_count)); | ||
122 | |||
123 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) | ||
124 | *mode = VCLOCK_NONE; | ||
125 | |||
126 | /* refer to tsc.c read_tsc() comment for rationale */ | ||
127 | last = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
128 | |||
129 | if (likely(ret >= last)) | ||
130 | return ret; | ||
131 | |||
132 | return last; | ||
133 | } | ||
134 | #endif | ||
135 | |||
65 | notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) | 136 | notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) |
66 | { | 137 | { |
67 | long ret; | 138 | long ret; |
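get_pvti() above packs one fixed-size time-info slot per vCPU into fixmap pages and locates a slot with simple division and remainder. A quick demonstration of that index math, assuming a 4 KiB page and a 64-byte slot (the slot size is an assumption of this example):

#include <stdio.h>

#define PAGE_SIZE 4096
#define PVTI_SIZE 64    /* assumed sizeof(struct pvclock_vsyscall_time_info) */

int main(void)
{
        int per_page = PAGE_SIZE / PVTI_SIZE;   /* 64 slots per fixmap page */

        for (int cpu = 0; cpu < 200; cpu += 70) {
                int idx = cpu / per_page;       /* which fixmap page */
                int off = cpu % per_page;       /* slot within that page */
                printf("cpu %3d -> page %d, slot %2d\n", cpu, idx, off);
        }
        return 0;
}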
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) | |||
80 | } | 151 | } |
81 | 152 | ||
82 | 153 | ||
83 | notrace static inline u64 vgetsns(void) | 154 | notrace static inline u64 vgetsns(int *mode) |
84 | { | 155 | { |
85 | long v; | 156 | long v; |
86 | cycles_t cycles; | 157 | cycles_t cycles; |
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void) | |||
88 | cycles = vread_tsc(); | 159 | cycles = vread_tsc(); |
89 | else if (gtod->clock.vclock_mode == VCLOCK_HPET) | 160 | else if (gtod->clock.vclock_mode == VCLOCK_HPET) |
90 | cycles = vread_hpet(); | 161 | cycles = vread_hpet(); |
162 | #ifdef CONFIG_PARAVIRT_CLOCK | ||
163 | else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) | ||
164 | cycles = vread_pvclock(mode); | ||
165 | #endif | ||
91 | else | 166 | else |
92 | return 0; | 167 | return 0; |
93 | v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; | 168 | v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; |
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) | |||
107 | mode = gtod->clock.vclock_mode; | 182 | mode = gtod->clock.vclock_mode; |
108 | ts->tv_sec = gtod->wall_time_sec; | 183 | ts->tv_sec = gtod->wall_time_sec; |
109 | ns = gtod->wall_time_snsec; | 184 | ns = gtod->wall_time_snsec; |
110 | ns += vgetsns(); | 185 | ns += vgetsns(&mode); |
111 | ns >>= gtod->clock.shift; | 186 | ns >>= gtod->clock.shift; |
112 | } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); | 187 | } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); |
113 | 188 | ||
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts) | |||
127 | mode = gtod->clock.vclock_mode; | 202 | mode = gtod->clock.vclock_mode; |
128 | ts->tv_sec = gtod->monotonic_time_sec; | 203 | ts->tv_sec = gtod->monotonic_time_sec; |
129 | ns = gtod->monotonic_time_snsec; | 204 | ns = gtod->monotonic_time_snsec; |
130 | ns += vgetsns(); | 205 | ns += vgetsns(&mode); |
131 | ns >>= gtod->clock.shift; | 206 | ns >>= gtod->clock.shift; |
132 | } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); | 207 | } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); |
133 | timespec_add_ns(ts, ns); | 208 | timespec_add_ns(ts, ns); |
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 5463ad558573..2f94b039e55b 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c | |||
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) | |||
17 | { | 17 | { |
18 | unsigned int p; | 18 | unsigned int p; |
19 | 19 | ||
20 | if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { | 20 | p = __getcpu(); |
21 | /* Load per CPU data from RDTSCP */ | 21 | |
22 | native_read_tscp(&p); | ||
23 | } else { | ||
24 | /* Load per CPU data from GDT */ | ||
25 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | ||
26 | } | ||
27 | if (cpu) | 22 | if (cpu) |
28 | *cpu = p & 0xfff; | 23 | *cpu = p & VGETCPU_CPU_MASK; |
29 | if (node) | 24 | if (node) |
30 | *node = p >> 12; | 25 | *node = p >> 12; |
31 | return 0; | 26 | return 0; |
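None of the vDSO changes above alter the userspace ABI; they only select a new fast path when the host exports VCLOCK_PVCLOCK. A small exerciser using only standard calls, which goes through __vdso_clock_gettime and __vdso_getcpu where available and checks that the clock never steps backwards:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <time.h>

static long long ts_ns(const struct timespec *ts)
{
        return ts->tv_sec * 1000000000LL + ts->tv_nsec;
}

int main(void)
{
        struct timespec ts;
        long long prev = 0;

        for (int i = 0; i < 1000000; i++) {
                clock_gettime(CLOCK_MONOTONIC, &ts);
                long long now = ts_ns(&ts);
                if (now < prev) {
                        printf("clock went backwards at iteration %d\n", i);
                        return 1;
                }
                prev = now;
        }
        printf("monotonic across 1000000 reads, last on cpu %d\n",
               sched_getcpu());
        return 0;
}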