Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig  7
-rw-r--r--  arch/x86/Makefile.um  4
-rw-r--r--  arch/x86/boot/Makefile  6
-rw-r--r--  arch/x86/include/asm/debugreg.h  67
-rw-r--r--  arch/x86/include/asm/kgdb.h  10
-rw-r--r--  arch/x86/include/asm/kvm.h  4
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h  3
-rw-r--r--  arch/x86/include/asm/kvm_host.h  63
-rw-r--r--  arch/x86/include/asm/paravirt.h  1
-rw-r--r--  arch/x86/include/asm/perf_event.h  1
-rw-r--r--  arch/x86/include/asm/processor.h  63
-rw-r--r--  arch/x86/include/asm/tsc.h  4
-rw-r--r--  arch/x86/include/asm/x86_init.h  6
-rw-r--r--  arch/x86/kernel/cpu/common.c  1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c  4
-rw-r--r--  arch/x86/kernel/entry_32.S  17
-rw-r--r--  arch/x86/kernel/irqinit.c  6
-rw-r--r--  arch/x86/kernel/kgdb.c  6
-rw-r--r--  arch/x86/kernel/kvmclock.c  15
-rw-r--r--  arch/x86/kernel/paravirt.c  1
-rw-r--r--  arch/x86/kernel/pci-dma.c  5
-rw-r--r--  arch/x86/kernel/smpboot.c  1
-rw-r--r--  arch/x86/kernel/tsc.c  4
-rw-r--r--  arch/x86/kernel/x86_init.c  5
-rw-r--r--  arch/x86/kvm/cpuid.c  2
-rw-r--r--  arch/x86/kvm/cpuid.h  8
-rw-r--r--  arch/x86/kvm/emulate.c  112
-rw-r--r--  arch/x86/kvm/i8259.c  1
-rw-r--r--  arch/x86/kvm/lapic.c  4
-rw-r--r--  arch/x86/kvm/mmu.c  85
-rw-r--r--  arch/x86/kvm/mmu_audit.c  4
-rw-r--r--  arch/x86/kvm/pmu.c  10
-rw-r--r--  arch/x86/kvm/svm.c  119
-rw-r--r--  arch/x86/kvm/vmx.c  53
-rw-r--r--  arch/x86/kvm/x86.c  403
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c  1
-rw-r--r--  arch/x86/pci/acpi.c  7
-rw-r--r--  arch/x86/pci/fixup.c  12
-rw-r--r--  arch/x86/pci/i386.c  85
-rw-r--r--  arch/x86/pci/mrst.c  40
-rw-r--r--  arch/x86/platform/ce4100/falconfalls.dts  7
-rw-r--r--  arch/x86/platform/geode/Makefile  1
-rw-r--r--  arch/x86/platform/geode/geos.c  128
-rw-r--r--  arch/x86/power/cpu.c  4
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl  2
-rw-r--r--  arch/x86/um/Kconfig  4
-rw-r--r--  arch/x86/um/asm/processor.h  10
-rw-r--r--  arch/x86/um/asm/processor_32.h  10
-rw-r--r--  arch/x86/um/asm/processor_64.h  10
-rw-r--r--  arch/x86/um/bugs_32.c  4
-rw-r--r--  arch/x86/um/mem_32.c  8
-rw-r--r--  arch/x86/um/vdso/vma.c  3
-rw-r--r--  arch/x86/vdso/vdso32-setup.c  17
-rw-r--r--  arch/x86/vdso/vma.c  3
-rw-r--r--  arch/x86/xen/setup.c  2
-rw-r--r--  arch/x86/xen/smp.c  6
56 files changed, 1079 insertions, 390 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 90195235596..3ad653de710 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2125,6 +2125,13 @@ config NET5501
 	---help---
 	  This option enables system support for the Soekris Engineering net5501.
 
+config GEOS
+	bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
+	select GPIOLIB
+	depends on DMI
+	---help---
+	  This option enables system support for the Traverse Technologies GEOS.
+
 endif # X86_32
 
 config AMD_NB
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 36ddec6a41c..4be406abeef 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -8,15 +8,11 @@ ELF_ARCH := i386
 ELF_FORMAT := elf32-i386
 CHECKFLAGS += -D__i386__
 
-ifeq ("$(origin SUBARCH)", "command line")
-ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)")
 KBUILD_CFLAGS += $(call cc-option,-m32)
 KBUILD_AFLAGS += $(call cc-option,-m32)
 LINK-y += $(call cc-option,-m32)
 
 export LDFLAGS
-endif
-endif
 
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
 include $(srctree)/arch/x86/Makefile_32.cpu
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3e02148bb77..5a747dd884d 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -37,9 +37,9 @@ setup-y += video-bios.o
 targets += $(setup-y)
 hostprogs-y := mkcpustr tools/build
 
-HOSTCFLAGS_mkcpustr.o := -I$(srctree)/arch/$(SRCARCH)/include
-HOST_EXTRACFLAGS += -I$(objtree)/include -I$(srctree)/tools/include \
-	-include $(srctree)/include/linux/kconfig.h
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(LINUXINCLUDE) \
+		    -D__EXPORTED_HEADERS__
+
 $(obj)/cpu.o: $(obj)/cpustr.h
 
 quiet_cmd_cpustr = CPUSTR $@
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b903d5ea394..2d91580bf22 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -78,8 +78,75 @@
  */
 #ifdef __KERNEL__
 
+#include <linux/bug.h>
+
 DECLARE_PER_CPU(unsigned long, cpu_dr7);
 
+#ifndef CONFIG_PARAVIRT
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+	(var) = native_get_debugreg(register)
+#define set_debugreg(value, register)				\
+	native_set_debugreg(register, value)
+#endif
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+	unsigned long val = 0;	/* Damn you, gcc! */
+
+	switch (regno) {
+	case 0:
+		asm("mov %%db0, %0" :"=r" (val));
+		break;
+	case 1:
+		asm("mov %%db1, %0" :"=r" (val));
+		break;
+	case 2:
+		asm("mov %%db2, %0" :"=r" (val));
+		break;
+	case 3:
+		asm("mov %%db3, %0" :"=r" (val));
+		break;
+	case 6:
+		asm("mov %%db6, %0" :"=r" (val));
+		break;
+	case 7:
+		asm("mov %%db7, %0" :"=r" (val));
+		break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+	switch (regno) {
+	case 0:
+		asm("mov %0, %%db0" ::"r" (value));
+		break;
+	case 1:
+		asm("mov %0, %%db1" ::"r" (value));
+		break;
+	case 2:
+		asm("mov %0, %%db2" ::"r" (value));
+		break;
+	case 3:
+		asm("mov %0, %%db3" ::"r" (value));
+		break;
+	case 6:
+		asm("mov %0, %%db6" ::"r" (value));
+		break;
+	case 7:
+		asm("mov %0, %%db7" ::"r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
 static inline void hw_breakpoint_disable(void)
 {
 	/* Zero the control register for HW Breakpoint */
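Note: the accessor pair itself is unchanged; it only moves here from processor.h (see the hunk for that header below) so that <asm/debugreg.h> is self-contained. A minimal, hypothetical usage sketch of the macros on a !CONFIG_PARAVIRT build:

	unsigned long dr7;

	get_debugreg(dr7, 7);		/* expands to: dr7 = native_get_debugreg(7) */
	set_debugreg(dr7 | 0x1, 7);	/* re-arm breakpoint 0 via the L0 enable bit */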
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 77e95f54570..332f98c9111 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -64,11 +64,15 @@ enum regnames {
 	GDB_PS,			/* 17 */
 	GDB_CS,			/* 18 */
 	GDB_SS,			/* 19 */
+	GDB_DS,			/* 20 */
+	GDB_ES,			/* 21 */
+	GDB_FS,			/* 22 */
+	GDB_GS,			/* 23 */
 };
 #define GDB_ORIG_AX		57
-#define DBG_MAX_REG_NUM		20
-/* 17 64 bit regs and 3 32 bit regs */
-#define NUMREGBYTES		((17 * 8) + (3 * 4))
+#define DBG_MAX_REG_NUM		24
+/* 17 64 bit regs and 5 32 bit regs */
+#define NUMREGBYTES		((17 * 8) + (5 * 4))
 #endif /* ! CONFIG_X86_32 */
 
 static inline void arch_kgdb_breakpoint(void)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4d8dcbdfc12..e7d1c194d27 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -321,4 +321,8 @@ struct kvm_xcrs {
 	__u64 padding[16];
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7b9cfc4878a..c222e1a1b12 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -176,6 +176,7 @@ struct x86_emulate_ops {
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
 	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+	void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
@@ -388,7 +389,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_INTERCEPTED 2
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code);
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52d6640a5ca..e216ba066e7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -29,7 +29,7 @@
 #include <asm/msr-index.h>
 
 #define KVM_MAX_VCPUS 254
-#define KVM_SOFT_MAX_VCPUS 64
+#define KVM_SOFT_MAX_VCPUS 160
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache {
 	void *objects[KVM_NR_MEM_OBJS];
 };
 
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
-	u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
-	struct hlist_node link;
-};
-
 /*
  * kvm_mmu_page_role, below, is defined as:
  *
@@ -427,12 +420,16 @@ struct kvm_vcpu_arch {
 
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
-	u64 last_tsc_nsec;
-	u64 last_tsc_write;
-	u32 virtual_tsc_khz;
+	u64 last_host_tsc;
+	u64 tsc_offset_adjustment;
+	u64 this_tsc_nsec;
+	u64 this_tsc_write;
+	u8 this_tsc_generation;
 	bool tsc_catchup;
-	u32 tsc_catchup_mult;
-	s8 tsc_catchup_shift;
+	bool tsc_always_catchup;
+	s8 virtual_tsc_shift;
+	u32 virtual_tsc_mult;
+	u32 virtual_tsc_khz;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -478,6 +475,21 @@ struct kvm_vcpu_arch {
 		u32 id;
 		bool send_user_only;
 	} apf;
+
+	/* OSVW MSRs (AMD only) */
+	struct {
+		u64 length;
+		u64 status;
+	} osvw;
+};
+
+struct kvm_lpage_info {
+	unsigned long rmap_pde;
+	int write_count;
+};
+
+struct kvm_arch_memory_slot {
+	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
 struct kvm_arch {
@@ -511,8 +523,12 @@ struct kvm_arch {
 	s64 kvmclock_offset;
 	raw_spinlock_t tsc_write_lock;
 	u64 last_tsc_nsec;
-	u64 last_tsc_offset;
 	u64 last_tsc_write;
+	u32 last_tsc_khz;
+	u64 cur_tsc_nsec;
+	u64 cur_tsc_write;
+	u64 cur_tsc_offset;
+	u8 cur_tsc_generation;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -644,7 +660,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
-	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
@@ -652,7 +668,7 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
@@ -674,6 +690,17 @@ struct kvm_arch_async_pf {
 
 extern struct kvm_x86_ops *kvm_x86_ops;
 
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+					   s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
+}
+
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
@@ -741,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code);
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
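Note: the new bool host flag on adjust_tsc_offset lets a backend scale host-initiated adjustments by its guest/host TSC ratio while applying guest-initiated ones raw. A hedged sketch of the intended call-site split (the exact sites live in the x86.c hunks of this series; locals here are illustrative):

	/* guest wrote IA32_TSC: apply the delta unscaled */
	adjust_tsc_offset_guest(vcpu, data - offset);

	/* host TSC jumped (e.g. across suspend): backend may rescale */
	adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);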
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c0180fd372d..aa0f9130836 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -10,6 +10,7 @@
 #include <asm/paravirt_types.h>
 
 #ifndef __ASSEMBLY__
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/cpumask.h>
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e8fb2c7a5f4..2291895b183 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -23,6 +23,7 @@
 #define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
 #define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
 #define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL		(1ULL << 19)
 #define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
 #define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
 #define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 78e30ea492b..a19542c1685 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -484,61 +484,6 @@ struct thread_struct {
 	unsigned		io_bitmap_max;
 };
 
-static inline unsigned long native_get_debugreg(int regno)
-{
-	unsigned long val = 0;	/* Damn you, gcc! */
-
-	switch (regno) {
-	case 0:
-		asm("mov %%db0, %0" :"=r" (val));
-		break;
-	case 1:
-		asm("mov %%db1, %0" :"=r" (val));
-		break;
-	case 2:
-		asm("mov %%db2, %0" :"=r" (val));
-		break;
-	case 3:
-		asm("mov %%db3, %0" :"=r" (val));
-		break;
-	case 6:
-		asm("mov %%db6, %0" :"=r" (val));
-		break;
-	case 7:
-		asm("mov %%db7, %0" :"=r" (val));
-		break;
-	default:
-		BUG();
-	}
-	return val;
-}
-
-static inline void native_set_debugreg(int regno, unsigned long value)
-{
-	switch (regno) {
-	case 0:
-		asm("mov %0, %%db0" ::"r" (value));
-		break;
-	case 1:
-		asm("mov %0, %%db1" ::"r" (value));
-		break;
-	case 2:
-		asm("mov %0, %%db2" ::"r" (value));
-		break;
-	case 3:
-		asm("mov %0, %%db3" ::"r" (value));
-		break;
-	case 6:
-		asm("mov %0, %%db6" ::"r" (value));
-		break;
-	case 7:
-		asm("mov %0, %%db7" ::"r" (value));
-		break;
-	default:
-		BUG();
-	}
-}
-
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
@@ -584,14 +529,6 @@ static inline void native_swapgs(void)
 #define __cpuid			native_cpuid
 #define paravirt_enabled()	0
 
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-	(var) = native_get_debugreg(register)
-#define set_debugreg(value, register)				\
-	native_set_debugreg(register, value)
-
 static inline void load_sp0(struct tss_struct *tss,
 			    struct thread_struct *thread)
 {
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 15d99153a96..c91e8b9d588 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
 
 extern int notsc_setup(char *);
-extern void save_sched_clock_state(void);
-extern void restore_sched_clock_state(void);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
 
 #endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 517d4767ffd..baaca8defec 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -145,9 +145,11 @@ struct x86_init_ops {
 /**
  * struct x86_cpuinit_ops - platform specific cpu hotplug setups
  * @setup_percpu_clockev:	set up the per cpu clock event device
+ * @early_percpu_clock_init:	early init of the per cpu clock event device
  */
 struct x86_cpuinit_ops {
 	void (*setup_percpu_clockev)(void);
+	void (*early_percpu_clock_init)(void);
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 
@@ -160,6 +162,8 @@ struct x86_cpuinit_ops {
  * @is_untracked_pat_range	exclude from PAT logic
  * @nmi_init			enable NMI on cpus
  * @i8042_detect		pre-detect if i8042 controller exists
+ * @save_sched_clock_state:	save state for sched_clock() on suspend
+ * @restore_sched_clock_state:	restore state for sched_clock() on resume
  */
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
@@ -171,6 +175,8 @@ struct x86_platform_ops {
 	void (*nmi_init)(void);
 	unsigned char (*get_nmi_reason)(void);
 	int (*i8042_detect)(void);
+	void (*save_sched_clock_state)(void);
+	void (*restore_sched_clock_state)(void);
 };
 
 struct pci_dev;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ade9c794ed9..e49477444ff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <linux/topology.h>
 #include <linux/cpumask.h>
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0a18d16cb58..fa2900c0e39 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	/* Prefer fixed purpose counters */
 	if (x86_pmu.num_counters_fixed) {
 		idx = X86_PMC_IDX_FIXED;
-		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 			if (!__test_and_set_bit(idx, sched->state.used))
 				goto done;
 		}
 	}
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
-	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
 		if (!__test_and_set_bit(idx, sched->state.used))
 			goto done;
 	}
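Note: for_each_set_bit_from() is the renamed for_each_set_bit_cont(); it iterates set bits starting from the current value of the iterator rather than from bit 0. A standalone illustration (values made up for the example):

	unsigned long mask = 0x2c;	/* bits 2, 3 and 5 set */
	int idx = 3;

	for_each_set_bit_from(idx, &mask, BITS_PER_LONG)
		pr_info("bit %d is set\n", idx);	/* visits 3, then 5 */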
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e68f04..7b784f4ef1e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -98,12 +98,6 @@
 #endif
 .endm
 
-#ifdef CONFIG_VM86
-#define resume_userspace_sig	check_userspace
-#else
-#define resume_userspace_sig	resume_userspace
-#endif
-
 /*
  * User gs save/restore
  *
@@ -327,10 +321,19 @@ ret_from_exception:
 	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
-check_userspace:
+resume_userspace_sig:
+#ifdef CONFIG_VM86
 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
 	movb PT_CS(%esp), %al
 	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+	/*
+	 * We can be coming here from a syscall done in the kernel space,
+	 * e.g. a failed kernel_execve().
+	 */
+	movl PT_CS(%esp), %eax
+	andl $SEGMENT_RPL_MASK, %eax
+#endif
 	cmpl $USER_RPL, %eax
 	jb resume_kernel		# not returning to v8086 or userspace
 
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 99b85b423bb..6d5fc8cfd5d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -305,10 +305,10 @@ void __init native_init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+	i = FIRST_EXTERNAL_VECTOR;
+	for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
 		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-		if (!test_bit(i, used_vectors))
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+		set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
 	}
 
 	if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4425a12ece4..db6720edfdd 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -66,8 +66,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
66 { "ss", 4, offsetof(struct pt_regs, ss) }, 66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) }, 67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) }, 68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else 69#else
72 { "ax", 8, offsetof(struct pt_regs, ax) }, 70 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) }, 71 { "bx", 8, offsetof(struct pt_regs, bx) },
@@ -89,7 +87,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
89 { "flags", 4, offsetof(struct pt_regs, flags) }, 87 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) }, 88 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) }, 89 { "ss", 4, offsetof(struct pt_regs, ss) },
90 { "ds", 4, -1 },
91 { "es", 4, -1 },
92#endif 92#endif
93 { "fs", 4, -1 },
94 { "gs", 4, -1 },
93}; 95};
94 96
95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) 97int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b2..f8492da65bf 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
 	return ret;
 }
 
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+	kvm_register_clock("primary cpu clock, resume");
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 static void __cpuinit kvm_setup_secondary_clock(void)
 {
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 	 * we shouldn't fail.
 	 */
 	WARN_ON(kvm_register_clock("secondary cpu clock"));
-	/* ok, done with our trickery, call native */
-	setup_secondary_APIC_clock();
 }
 #endif
 
151 158
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-	x86_cpuinit.setup_percpu_clockev =
+	x86_cpuinit.early_percpu_clock_init =
 		kvm_setup_secondary_clock;
 #endif
+	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown = kvm_shutdown;
 #ifdef CONFIG_KEXEC
 	machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 2b26485f0c1..ab137605e69 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
+#include <asm/debugreg.h>
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21e..28e5e06fcba 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init);
 
 static __devinit void via_no_dac(struct pci_dev *dev)
 {
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+	if (forbid_dac == 0) {
 		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
 		forbid_dac = 1;
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+				PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
 #endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e578a79a309..5104a2b685c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
 	 * most necessary things.
 	 */
 	cpu_init();
+	x86_cpuinit.early_percpu_clock_init();
 	preempt_disable();
 	smp_callin();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 183c5925a9f..899a03f2d18 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 static unsigned long long cyc2ns_suspend;
 
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
 {
 	if (!sched_clock_stable)
 		return;
@@ -646,7 +646,7 @@ void save_sched_clock_state(void)
  * that sched_clock() continues from the point where it was left off during
  * suspend.
  */
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
 {
 	unsigned long long offset;
 	unsigned long flags;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc67..e9f265fd79a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
 };
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+	.early_percpu_clock_init	= x86_init_noop,
 	.setup_percpu_clockev		= setup_secondary_APIC_clock,
 	.fixup_cpu_id			= x86_default_fixup_cpu_id,
 };
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
 	.get_nmi_reason			= default_get_nmi_reason,
-	.i8042_detect			= default_i8042_detect
+	.i8042_detect			= default_i8042_detect,
+	.save_sched_clock_state		= tsc_save_sched_clock_state,
+	.restore_sched_clock_state	= tsc_restore_sched_clock_state,
 };
 
 EXPORT_SYMBOL_GPL(x86_platform);
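Note: with x86_init_noop and the tsc_* defaults wired up here, a platform only overrides the hooks it needs. A hypothetical override, following the same pattern the kvmclock.c hunk above uses (all myplat_* names are illustrative):

	static void myplat_save_sched_clock_state(void)
	{
		/* stash whatever the platform clocksource needs across suspend */
	}

	static void myplat_restore_sched_clock_state(void)
	{
		/* reprogram the clock on resume */
	}

	x86_platform.save_sched_clock_state = myplat_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = myplat_restore_sched_clock_state;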
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca..9fed5bedaad 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 	/* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6..26d1fb437eb 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
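Note: a hedged sketch of how this predicate is meant to gate the OSVW MSRs added to kvm_vcpu_arch above (the handler shape is illustrative; the real checks land in the svm.c/x86.c hunks of this series, which are not shown here):

	case MSR_AMD64_OSVW_ID_LENGTH:
		if (!guest_cpuid_has_osvw(vcpu))
			return 1;	/* feature not exposed to the guest */
		*data = vcpu->arch.osvw.length;
		break;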
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962..83756223f8a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
 #define OpDS              23ull  /* DS */
 #define OpFS              24ull  /* FS */
 #define OpGS              25ull  /* GS */
+#define OpMem8            26ull  /* 8-bit zero extended memory operand */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask            ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc      (OpAcc << SrcShift)
 #define SrcImmU16   (OpImmU16 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
+#define SrcMem8     (OpMem8 << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-				    struct operand *op,
-				    int inhibit_bytereg)
+				    struct operand *op)
 {
 	unsigned reg = ctxt->modrm_reg;
 	int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+	if (ctxt->d & ByteOp) {
 		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 1;
 }
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+				     u16 index, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	ulong addr;
+
+	ctxt->ops->get_idt(ctxt, &dt);
+
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, index << 3 | 0x2);
+
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, struct desc_ptr *dt)
 {
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		seg_desc.type = 3;
 		seg_desc.p = 1;
 		seg_desc.s = 1;
+		if (ctxt->mode == X86EMUL_MODE_VM86)
+			seg_desc.dpl = 3;
 		goto load;
 	}
 
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = ecx = 0;
+	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
 
+	/*
+	 * Not recognized on AMD in compat mode (but is recognized in legacy
+	 * mode).
+	 */
+	if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+	    && !vendor_intel(ctxt))
+		return emulate_ud(ctxt);
+
 	/* XXX	sysenter/sysexit have not been tested in 64bit mode.
 	 * Therefore, we inject an #UD.
 	 */
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
+
+	/* General purpose registers */
 	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
 	/*
+	 * If we're switching between Protected Mode and VM86, we need to make
+	 * sure to update the mode before loading the segment descriptors so
+	 * that the selectors are interpreted correctly.
+	 *
+	 * Need to get rflags to the vcpu struct immediately because it
+	 * influences the CPL which is checked at least when loading the segment
+	 * descriptors and when pushing an error code to the new kernel stack.
+	 *
+	 * TODO Introduce a separate ctxt->ops->set_cpl callback
+	 */
+	if (ctxt->eflags & X86_EFLAGS_VM)
+		ctxt->mode = X86EMUL_MODE_VM86;
+	else
+		ctxt->mode = X86EMUL_MODE_PROT32;
+
+	ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
+	/*
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   u16 tss_selector, int reason,
+				   u16 tss_selector, int idt_index, int reason,
 				   bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	/* FIXME: check that next_tss_desc is tss */
 
-	if (reason != TASK_SWITCH_IRET) {
-		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt) > next_tss_desc.dpl)
-			return emulate_gp(ctxt, 0);
+	/*
+	 * Check privileges. The three cases are task switch caused by...
+	 *
+	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
+	 * 2. Exception/IRQ/iret: No check is performed
+	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 */
+	if (reason == TASK_SWITCH_GATE) {
+		if (idt_index != -1) {
+			/* Software interrupts */
+			struct desc_struct task_gate_desc;
+			int dpl;
+
+			ret = read_interrupt_descriptor(ctxt, idt_index,
+							&task_gate_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+
+			dpl = task_gate_desc.dpl;
+			if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+				return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+		}
+	} else if (reason != TASK_SWITCH_IRET) {
+		int dpl = next_tss_desc.dpl;
+		if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+			return emulate_gp(ctxt, tss_selector);
 	}
 
+
 	desc_limit = desc_limit_scaled(&next_tss_desc);
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code)
 {
 	int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	ctxt->_eip = ctxt->eip;
 	ctxt->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 	switch (d) {
 	case OpReg:
-		decode_register_operand(ctxt, op,
-			 op == &ctxt->dst &&
-			 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+		decode_register_operand(ctxt, op);
 		break;
 	case OpImmUByte:
 		rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 	case OpImm:
 		rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 		break;
+	case OpMem8:
+		ctxt->memop.bytes = 1;
+		goto mem_common;
 	case OpMem16:
 		ctxt->memop.bytes = 2;
 		goto mem_common;
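Note: the new idt_index argument threads the interrupt vector down so the privilege check above can be made against the task gate's DPL; -1 means there is no IDT descriptor to consult. A hedged sketch of the calling convention implied by the hunks above (locals are illustrative):

	/* software INT n through a task gate: pass the vector */
	rc = emulator_task_switch(ctxt, tss_selector, idt_index,
				  TASK_SWITCH_GATE, has_error_code, error_code);

	/* hardware interrupt or IRET: nothing to check in the IDT */
	rc = emulator_task_switch(ctxt, tss_selector, -1,
				  TASK_SWITCH_IRET, false, 0);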
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1e..81cf4fa4a2b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		if (val & 0x10) {
 			s->init4 = val & 1;
 			s->last_irr = 0;
+			s->irr &= s->elcr;
 			s->imr = 0;
 			s->priority_add = 0;
 			s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc6927bc..858432287ab 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_INIT:
-		if (level) {
+		if (!trig_mode || level) {
 			result = 1;
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 	u64 ns = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
-	unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 	unsigned long flags;
 
 	if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda..4cb16426884 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 	unsigned long idx;
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-	return &slot->lpage_info[level - 2][idx];
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
+	return &slot->arch.lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
 	struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	struct kvm_memory_slot *slot;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return __gfn_to_rmap(kvm, gfn, level, slot);
+	return __gfn_to_rmap(gfn, level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 	return pte_list_next(rmapp, spte);
 }
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(kvm, rmapp, NULL);
+	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
-		spte = rmap_next(kvm, rmapp, NULL);
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		spte = rmap_next(rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
 			BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 				spte = NULL;
 				write_protected = 1;
 			}
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	u64 *spte;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+	while ((spte = rmap_next(rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!is_shadow_present_pte(*spte));
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, spte);
-			spte = rmap_next(kvm, rmapp, NULL);
+			spte = rmap_next(rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~shadow_accessed_mask;
 			mmu_spte_clear_track_bits(spte);
 			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 	if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		int _young;
 		u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	return young;
 }
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		u64 _spte = *spte;
 		BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			break;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
 	unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx)		\
-	for (idx = find_first_bit(bitmap, 512);		\
-	     idx < 512;					\
-	     idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 			 int idx)
 {
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 	int i, ret, nr_unsync_leaf = 0;
 
-	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
 		drop_spte(vcpu->kvm, sptep);
+		--vcpu->kvm->stat.lpages;
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context,
-				  int level)
+				  struct kvm_mmu *context)
 {
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
-	switch (level) {
+	switch (context->root_level) {
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3256 int level) 3250 int level)
3257{ 3251{
3258 context->nx = is_nx(vcpu); 3252 context->nx = is_nx(vcpu);
3253 context->root_level = level;
3259 3254
3260 reset_rsvds_bits_mask(vcpu, context, level); 3255 reset_rsvds_bits_mask(vcpu, context);
3261 3256
3262 ASSERT(is_pae(vcpu)); 3257 ASSERT(is_pae(vcpu));
3263 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3267 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
3268 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
3269 context->free = paging_free; 3264 context->free = paging_free;
3270 context->root_level = level;
3271 context->shadow_root_level = level; 3265 context->shadow_root_level = level;
3272 context->root_hpa = INVALID_PAGE; 3266 context->root_hpa = INVALID_PAGE;
3273 context->direct_map = false; 3267 context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3284 struct kvm_mmu *context) 3278 struct kvm_mmu *context)
3285{ 3279{
3286 context->nx = false; 3280 context->nx = false;
3281 context->root_level = PT32_ROOT_LEVEL;
3287 3282
3288 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); 3283 reset_rsvds_bits_mask(vcpu, context);
3289 3284
3290 context->new_cr3 = paging_new_cr3; 3285 context->new_cr3 = paging_new_cr3;
3291 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3294 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
3295 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
3296 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
3297 context->root_level = PT32_ROOT_LEVEL;
3298 context->shadow_root_level = PT32E_ROOT_LEVEL; 3292 context->shadow_root_level = PT32E_ROOT_LEVEL;
3299 context->root_hpa = INVALID_PAGE; 3293 context->root_hpa = INVALID_PAGE;
3300 context->direct_map = false; 3294 context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3325 context->get_cr3 = get_cr3; 3319 context->get_cr3 = get_cr3;
3326 context->get_pdptr = kvm_pdptr_read; 3320 context->get_pdptr = kvm_pdptr_read;
3327 context->inject_page_fault = kvm_inject_page_fault; 3321 context->inject_page_fault = kvm_inject_page_fault;
3328 context->nx = is_nx(vcpu);
3329 3322
3330 if (!is_paging(vcpu)) { 3323 if (!is_paging(vcpu)) {
3331 context->nx = false; 3324 context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3333 context->root_level = 0; 3326 context->root_level = 0;
3334 } else if (is_long_mode(vcpu)) { 3327 } else if (is_long_mode(vcpu)) {
3335 context->nx = is_nx(vcpu); 3328 context->nx = is_nx(vcpu);
3336 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
3337 context->gva_to_gpa = paging64_gva_to_gpa;
3338 context->root_level = PT64_ROOT_LEVEL; 3329 context->root_level = PT64_ROOT_LEVEL;
3330 reset_rsvds_bits_mask(vcpu, context);
3331 context->gva_to_gpa = paging64_gva_to_gpa;
3339 } else if (is_pae(vcpu)) { 3332 } else if (is_pae(vcpu)) {
3340 context->nx = is_nx(vcpu); 3333 context->nx = is_nx(vcpu);
3341 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
3342 context->gva_to_gpa = paging64_gva_to_gpa;
3343 context->root_level = PT32E_ROOT_LEVEL; 3334 context->root_level = PT32E_ROOT_LEVEL;
3335 reset_rsvds_bits_mask(vcpu, context);
3336 context->gva_to_gpa = paging64_gva_to_gpa;
3344 } else { 3337 } else {
3345 context->nx = false; 3338 context->nx = false;
3346 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
3347 context->gva_to_gpa = paging32_gva_to_gpa;
3348 context->root_level = PT32_ROOT_LEVEL; 3339 context->root_level = PT32_ROOT_LEVEL;
3340 reset_rsvds_bits_mask(vcpu, context);
3341 context->gva_to_gpa = paging32_gva_to_gpa;
3349 } 3342 }
3350 3343
3351 return 0; 3344 return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3408 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; 3401 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3409 } else if (is_long_mode(vcpu)) { 3402 } else if (is_long_mode(vcpu)) {
3410 g_context->nx = is_nx(vcpu); 3403 g_context->nx = is_nx(vcpu);
3411 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3412 g_context->root_level = PT64_ROOT_LEVEL; 3404 g_context->root_level = PT64_ROOT_LEVEL;
3405 reset_rsvds_bits_mask(vcpu, g_context);
3413 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 3406 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3414 } else if (is_pae(vcpu)) { 3407 } else if (is_pae(vcpu)) {
3415 g_context->nx = is_nx(vcpu); 3408 g_context->nx = is_nx(vcpu);
3416 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3417 g_context->root_level = PT32E_ROOT_LEVEL; 3409 g_context->root_level = PT32E_ROOT_LEVEL;
3410 reset_rsvds_bits_mask(vcpu, g_context);
3418 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 3411 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3419 } else { 3412 } else {
3420 g_context->nx = false; 3413 g_context->nx = false;
3421 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3422 g_context->root_level = PT32_ROOT_LEVEL; 3414 g_context->root_level = PT32_ROOT_LEVEL;
3415 reset_rsvds_bits_mask(vcpu, g_context);
3423 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3416 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3424 } 3417 }
3425 3418
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3555 * If we're seeing too many writes to a page, it may no longer be a page table, 3548 * If we're seeing too many writes to a page, it may no longer be a page table,
3556 * or we may be forking, in which case it is better to unmap the page. 3549 * or we may be forking, in which case it is better to unmap the page.
3557 */ 3550 */
3558static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) 3551static bool detect_write_flooding(struct kvm_mmu_page *sp)
3559{ 3552{
3560 /* 3553 /*
3561 * Skip write-flooding detected for the sp whose level is 1, because 3554 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3664 3657
3665 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; 3658 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3666 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 3659 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3667 spte = get_written_sptes(sp, gpa, &npte);
3668
3669 if (detect_write_misaligned(sp, gpa, bytes) || 3660 if (detect_write_misaligned(sp, gpa, bytes) ||
3670 detect_write_flooding(sp, spte)) { 3661 detect_write_flooding(sp)) {
3671 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3662 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3672 &invalid_list); 3663 &invalid_list);
3673 ++vcpu->kvm->stat.mmu_flooded; 3664 ++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd3467..715da5a19a5 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
200 slot = gfn_to_memslot(kvm, sp->gfn); 200 slot = gfn_to_memslot(kvm, sp->gfn);
201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; 201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
202 202
203 spte = rmap_next(kvm, rmapp, NULL); 203 spte = rmap_next(rmapp, NULL);
204 while (spte) { 204 while (spte) {
205 if (is_writable_pte(*spte)) 205 if (is_writable_pte(*spte))
206 audit_printk(kvm, "shadow page has writable " 206 audit_printk(kvm, "shadow page has writable "
207 "mappings: gfn %llx role %x\n", 207 "mappings: gfn %llx role %x\n",
208 sp->gfn, sp->role.word); 208 sp->gfn, sp->role.word);
209 spte = rmap_next(kvm, rmapp, spte); 209 spte = rmap_next(rmapp, spte);
210 } 210 }
211} 211}
212 212
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f39..a73f0c10481 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, 33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, 35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
36 [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
36}; 37};
37 38
38/* mapping between fixed pmc index and arch_events array */ 39/* mapping between fixed pmc index and arch_events array */
39int fixed_pmc_events[] = {1, 0, 2}; 40int fixed_pmc_events[] = {1, 0, 7};
40 41
41static bool pmc_is_gp(struct kvm_pmc *pmc) 42static bool pmc_is_gp(struct kvm_pmc *pmc)
42{ 43{
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
210 unsigned config, type = PERF_TYPE_RAW; 211 unsigned config, type = PERF_TYPE_RAW;
211 u8 event_select, unit_mask; 212 u8 event_select, unit_mask;
212 213
214 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
215 printk_once("kvm pmu: pin control bit is ignored\n");
216
213 pmc->eventsel = eventsel; 217 pmc->eventsel = eventsel;
214 218
215 stop_counter(pmc); 219 stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
220 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; 224 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
221 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 225 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
222 226
223 if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | 227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
224 ARCH_PERFMON_EVENTSEL_INV | 228 ARCH_PERFMON_EVENTSEL_INV |
225 ARCH_PERFMON_EVENTSEL_CMASK))) { 229 ARCH_PERFMON_EVENTSEL_CMASK))) {
226 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
413 struct kvm_pmc *counters; 417 struct kvm_pmc *counters;
414 u64 ctr; 418 u64 ctr;
415 419
416 pmc &= (3u << 30) - 1; 420 pmc &= ~(3u << 30);
417 if (!fixed && pmc >= pmu->nr_arch_gp_counters) 421 if (!fixed && pmc >= pmu->nr_arch_gp_counters)
418 return 1; 422 return 1;
419 if (fixed && pmc >= pmu->nr_arch_fixed_counters) 423 if (fixed && pmc >= pmu->nr_arch_fixed_counters)
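The mask fix above matters because the RDPMC index in ECX uses bit 30 to select the fixed-counter class: the old expression (3u << 30) - 1 (0xbfffffff) cleared bit 30 but left bit 31 set, while ~(3u << 30) (0x3fffffff) clears both class bits and leaves the bare counter index. A standalone sketch of the difference (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* an RDPMC request with both high bits set, counter index 1 */
    uint32_t pmc = (3u << 30) | 1;

    uint32_t old_way = pmc & ((3u << 30) - 1); /* 0x80000001: bit 31 survives */
    uint32_t new_way = pmc & ~(3u << 30);      /* 0x00000001: clean index */

    printf("old mask -> %#x, new mask -> %#x\n", old_way, new_way);
    /* with bit 31 left set, the bounds check against the counter count
     * would reject an otherwise valid index */
    return 0;
}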
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214711c..e334389e1c7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
111#define MSRPM_OFFSETS 16 111#define MSRPM_OFFSETS 16
112static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; 112static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
113 113
114/*
 115 * Set osvw_len to a higher value when updated Revision Guides
 116 * are published and we know what the new status bits are.
117 */
118static uint64_t osvw_len = 4, osvw_status;
119
114struct vcpu_svm { 120struct vcpu_svm {
115 struct kvm_vcpu vcpu; 121 struct kvm_vcpu vcpu;
116 struct vmcb *vmcb; 122 struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
177#else 183#else
178static bool npt_enabled; 184static bool npt_enabled;
179#endif 185#endif
180static int npt = 1;
181 186
187/* allow nested paging (virtualized MMU) for all guests */
188static int npt = true;
182module_param(npt, int, S_IRUGO); 189module_param(npt, int, S_IRUGO);
183 190
184static int nested = 1; 191/* allow nested virtualization in KVM/SVM */
192static int nested = true;
185module_param(nested, int, S_IRUGO); 193module_param(nested, int, S_IRUGO);
186 194
187static void svm_flush_tlb(struct kvm_vcpu *vcpu); 195static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
557 erratum_383_found = true; 565 erratum_383_found = true;
558} 566}
559 567
568static void svm_init_osvw(struct kvm_vcpu *vcpu)
569{
570 /*
571 * Guests should see errata 400 and 415 as fixed (assuming that
572 * HLT and IO instructions are intercepted).
573 */
574 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
575 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
576
577 /*
578 * By increasing VCPU's osvw.length to 3 we are telling the guest that
579 * all osvw.status bits inside that length, including bit 0 (which is
580 * reserved for erratum 298), are valid. However, if host processor's
581 * osvw_len is 0 then osvw_status[0] carries no information. We need to
582 * be conservative here and therefore we tell the guest that erratum 298
583 * is present (because we really don't know).
584 */
585 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
586 vcpu->arch.osvw.status |= 1;
587}
588
560static int has_svm(void) 589static int has_svm(void)
561{ 590{
562 const char *msg; 591 const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
623 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; 652 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
624 } 653 }
625 654
655
656 /*
657 * Get OSVW bits.
658 *
659 * Note that it is possible to have a system with mixed processor
660 * revisions and therefore different OSVW bits. If bits are not the same
661 * on different processors then choose the worst case (i.e. if erratum
662 * is present on one processor and not on another then assume that the
663 * erratum is present everywhere).
664 */
665 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
666 uint64_t len, status = 0;
667 int err;
668
669 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
670 if (!err)
671 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
672 &err);
673
674 if (err)
675 osvw_status = osvw_len = 0;
676 else {
677 if (len < osvw_len)
678 osvw_len = len;
679 osvw_status |= status;
680 osvw_status &= (1ULL << osvw_len) - 1;
681 }
682 } else
683 osvw_status = osvw_len = 0;
684
626 svm_init_erratum_383(); 685 svm_init_erratum_383();
627 686
628 amd_pmu_enable_virt(); 687 amd_pmu_enable_virt();
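Because a host can mix processor revisions, the block above merges the per-CPU OSVW data pessimistically: the valid length shrinks to the shortest seen, and status bits OR together so an erratum reported anywhere is treated as present everywhere. A standalone sketch of that merge with invented per-CPU values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* invented per-CPU (length, status) pairs as read from the OSVW MSRs */
    struct { uint64_t len, status; } cpu[] = {
        { 4, 0x2 },   /* erratum #1 present, 4 valid bits */
        { 3, 0x4 },   /* erratum #2 present, only 3 valid bits */
    };
    uint64_t osvw_len = 4, osvw_status = 0;

    for (unsigned int i = 0; i < 2; i++) {
        if (cpu[i].len < osvw_len)
            osvw_len = cpu[i].len;        /* trust only the common prefix */
        osvw_status |= cpu[i].status;     /* present anywhere => present */
        osvw_status &= (1ULL << osvw_len) - 1;
    }
    printf("merged: len=%llu status=%#llx\n",
           (unsigned long long)osvw_len, (unsigned long long)osvw_status);
    /* prints len=3 status=0x6: both errata reported as present everywhere */
    return 0;
}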
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
910 return _tsc; 969 return _tsc;
911} 970}
912 971
913static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 972static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
914{ 973{
915 struct vcpu_svm *svm = to_svm(vcpu); 974 struct vcpu_svm *svm = to_svm(vcpu);
916 u64 ratio; 975 u64 ratio;
917 u64 khz; 976 u64 khz;
918 977
919 /* TSC scaling supported? */ 978 /* Guest TSC same frequency as host TSC? */
920 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) 979 if (!scale) {
980 svm->tsc_ratio = TSC_RATIO_DEFAULT;
921 return; 981 return;
982 }
922 983
923 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ 984 /* TSC scaling supported? */
924 if (user_tsc_khz == 0) { 985 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
925 vcpu->arch.virtual_tsc_khz = 0; 986 if (user_tsc_khz > tsc_khz) {
926 svm->tsc_ratio = TSC_RATIO_DEFAULT; 987 vcpu->arch.tsc_catchup = 1;
988 vcpu->arch.tsc_always_catchup = 1;
989 } else
990 WARN(1, "user requested TSC rate below hardware speed\n");
927 return; 991 return;
928 } 992 }
929 993
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
938 user_tsc_khz); 1002 user_tsc_khz);
939 return; 1003 return;
940 } 1004 }
941 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
942 svm->tsc_ratio = ratio; 1005 svm->tsc_ratio = ratio;
943} 1006}
944 1007
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
958 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1021 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
959} 1022}
960 1023
961static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1024static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
962{ 1025{
963 struct vcpu_svm *svm = to_svm(vcpu); 1026 struct vcpu_svm *svm = to_svm(vcpu);
964 1027
1028 WARN_ON(adjustment < 0);
1029 if (host)
1030 adjustment = svm_scale_tsc(vcpu, adjustment);
1031
965 svm->vmcb->control.tsc_offset += adjustment; 1032 svm->vmcb->control.tsc_offset += adjustment;
966 if (is_guest_mode(vcpu)) 1033 if (is_guest_mode(vcpu))
967 svm->nested.hsave->control.tsc_offset += adjustment; 1034 svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1191 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1258 if (kvm_vcpu_is_bsp(&svm->vcpu))
1192 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1259 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1193 1260
1261 svm_init_osvw(&svm->vcpu);
1262
1194 return &svm->vcpu; 1263 return &svm->vcpu;
1195 1264
1196free_page4: 1265free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1268 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1337 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1269} 1338}
1270 1339
1340static void svm_update_cpl(struct kvm_vcpu *vcpu)
1341{
1342 struct vcpu_svm *svm = to_svm(vcpu);
1343 int cpl;
1344
1345 if (!is_protmode(vcpu))
1346 cpl = 0;
1347 else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
1348 cpl = 3;
1349 else
1350 cpl = svm->vmcb->save.cs.selector & 0x3;
1351
1352 svm->vmcb->save.cpl = cpl;
1353}
1354
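svm_update_cpl centralizes the CPL rules the old svm_set_segment open-coded: real mode runs at CPL 0, vm86 at CPL 3, and protected mode takes the RPL bits of the CS selector. A minimal standalone sketch of the same decision (names invented for illustration):

#include <stdio.h>

#define X86_EFLAGS_VM (1u << 17)   /* virtual-8086 mode flag */

static int compute_cpl(int protmode, unsigned long rflags,
                       unsigned short cs_selector)
{
    if (!protmode)
        return 0;                  /* real mode: always CPL 0 */
    if (rflags & X86_EFLAGS_VM)
        return 3;                  /* vm86 code runs at CPL 3 */
    return cs_selector & 0x3;      /* otherwise: RPL bits of CS */
}

int main(void)
{
    printf("%d\n", compute_cpl(1, 0, 0x1b));   /* user CS 0x1b -> CPL 3 */
    return 0;
}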
1271static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1355static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1272{ 1356{
1273 return to_svm(vcpu)->vmcb->save.rflags; 1357 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1275 1359
1276static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1360static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1277{ 1361{
1362 unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
1363
1278 to_svm(vcpu)->vmcb->save.rflags = rflags; 1364 to_svm(vcpu)->vmcb->save.rflags = rflags;
1365 if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
1366 svm_update_cpl(vcpu);
1279} 1367}
1280 1368
1281static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1369static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1543 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1631 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1544 } 1632 }
1545 if (seg == VCPU_SREG_CS) 1633 if (seg == VCPU_SREG_CS)
1546 svm->vmcb->save.cpl 1634 svm_update_cpl(vcpu);
1547 = (svm->vmcb->save.cs.attrib
1548 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1549 1635
1550 mark_dirty(svm->vmcb, VMCB_SEG); 1636 mark_dirty(svm->vmcb, VMCB_SEG);
1551} 1637}
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
2735 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2821 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2736 skip_emulated_instruction(&svm->vcpu); 2822 skip_emulated_instruction(&svm->vcpu);
2737 2823
2738 if (kvm_task_switch(&svm->vcpu, tss_selector, reason, 2824 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2825 int_vec = -1;
2826
2827 if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2739 has_error_code, error_code) == EMULATE_FAIL) { 2828 has_error_code, error_code) == EMULATE_FAIL) {
2740 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2829 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2741 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2830 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f643b..280751c8472 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
70static bool __read_mostly vmm_exclusive = 1; 70static bool __read_mostly vmm_exclusive = 1;
71module_param(vmm_exclusive, bool, S_IRUGO); 71module_param(vmm_exclusive, bool, S_IRUGO);
72 72
73static bool __read_mostly yield_on_hlt = 1;
74module_param(yield_on_hlt, bool, S_IRUGO);
75
76static bool __read_mostly fasteoi = 1; 73static bool __read_mostly fasteoi = 1;
77module_param(fasteoi, bool, S_IRUGO); 74module_param(fasteoi, bool, S_IRUGO);
78 75
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1655 vmx_set_interrupt_shadow(vcpu, 0); 1652 vmx_set_interrupt_shadow(vcpu, 0);
1656} 1653}
1657 1654
1658static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1659{
1660 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1661 * explicitly skip the instruction because if the HLT state is set, then
1662 * the instruction is already executing and RIP has already been
1663 * advanced. */
1664 if (!yield_on_hlt &&
1665 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1666 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1667}
1668
1669/* 1655/*
1670 * KVM wants to inject page-faults which it got to the guest. This function 1656 * KVM wants to inject page-faults which it got to the guest. This function
1671 * checks whether in a nested guest, we need to inject them to L1 or L2. 1657 * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
1678 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1664 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1679 1665
1680 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 1666 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1681 if (!(vmcs12->exception_bitmap & PF_VECTOR)) 1667 if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
1682 return 0; 1668 return 0;
1683 1669
1684 nested_vmx_vmexit(vcpu); 1670 nested_vmx_vmexit(vcpu);
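The one-liner above fixes a classic bit-test bug: exception_bitmap is indexed by vector number, so page faults (vector 14) live at bit 14, and testing `bitmap & PF_VECTOR` checks the wrong bits entirely. A standalone sketch (not part of the patch):

#include <stdio.h>

#define PF_VECTOR 14

int main(void)
{
    unsigned int bitmap = 1u << PF_VECTOR;   /* L1 intercepts only #PF */

    printf("buggy test: %d\n", !!(bitmap & PF_VECTOR));          /* 0 */
    printf("fixed test: %d\n", !!(bitmap & (1u << PF_VECTOR)));  /* 1 */
    return 0;
}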
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1718 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1704 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1719 1705
1720 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1706 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1721 vmx_clear_hlt(vcpu);
1722} 1707}
1723 1708
1724static bool vmx_rdtscp_supported(void) 1709static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
1817} 1802}
1818 1803
1819/* 1804/*
1820 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ 1805 * Engage any workarounds for mis-matched TSC rates. Currently limited to
1821 * ioctl. In this case the call-back should update internal vmx state to make 1806 * software catchup for faster rates on slower CPUs.
1822 * the changes effective.
1823 */ 1807 */
1824static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 1808static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1825{ 1809{
1826 /* Nothing to do here */ 1810 if (!scale)
1811 return;
1812
1813 if (user_tsc_khz > tsc_khz) {
1814 vcpu->arch.tsc_catchup = 1;
1815 vcpu->arch.tsc_always_catchup = 1;
1816 } else
1817 WARN(1, "user requested TSC rate below hardware speed\n");
1827} 1818}
1828 1819
1829/* 1820/*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1850 } 1841 }
1851} 1842}
1852 1843
1853static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1844static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1854{ 1845{
1855 u64 offset = vmcs_read64(TSC_OFFSET); 1846 u64 offset = vmcs_read64(TSC_OFFSET);
1856 vmcs_write64(TSC_OFFSET, offset + adjustment); 1847 vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2219 msr = find_msr_entry(vmx, msr_index); 2210 msr = find_msr_entry(vmx, msr_index);
2220 if (msr) { 2211 if (msr) {
2221 msr->data = data; 2212 msr->data = data;
2213 if (msr - vmx->guest_msrs < vmx->save_nmsrs)
2214 kvm_set_shared_msr(msr->index, msr->data,
2215 msr->mask);
2222 break; 2216 break;
2223 } 2217 }
2224 ret = kvm_set_msr_common(vcpu, msr_index, data); 2218 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2399 &_pin_based_exec_control) < 0) 2393 &_pin_based_exec_control) < 0)
2400 return -EIO; 2394 return -EIO;
2401 2395
2402 min = 2396 min = CPU_BASED_HLT_EXITING |
2403#ifdef CONFIG_X86_64 2397#ifdef CONFIG_X86_64
2404 CPU_BASED_CR8_LOAD_EXITING | 2398 CPU_BASED_CR8_LOAD_EXITING |
2405 CPU_BASED_CR8_STORE_EXITING | 2399 CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2414 CPU_BASED_INVLPG_EXITING | 2408 CPU_BASED_INVLPG_EXITING |
2415 CPU_BASED_RDPMC_EXITING; 2409 CPU_BASED_RDPMC_EXITING;
2416 2410
2417 if (yield_on_hlt)
2418 min |= CPU_BASED_HLT_EXITING;
2419
2420 opt = CPU_BASED_TPR_SHADOW | 2411 opt = CPU_BASED_TPR_SHADOW |
2421 CPU_BASED_USE_MSR_BITMAPS | 2412 CPU_BASED_USE_MSR_BITMAPS |
2422 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2413 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4003 } else 3994 } else
4004 intr |= INTR_TYPE_EXT_INTR; 3995 intr |= INTR_TYPE_EXT_INTR;
4005 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 3996 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4006 vmx_clear_hlt(vcpu);
4007} 3997}
4008 3998
4009static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 3999static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4035 } 4025 }
4036 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4026 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4037 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4027 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4038 vmx_clear_hlt(vcpu);
4039} 4028}
4040 4029
4041static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4030static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
4672 bool has_error_code = false; 4661 bool has_error_code = false;
4673 u32 error_code = 0; 4662 u32 error_code = 0;
4674 u16 tss_selector; 4663 u16 tss_selector;
4675 int reason, type, idt_v; 4664 int reason, type, idt_v, idt_index;
4676 4665
4677 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 4666 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
4667 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
4678 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 4668 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
4679 4669
4680 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4670 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
4712 type != INTR_TYPE_NMI_INTR)) 4702 type != INTR_TYPE_NMI_INTR))
4713 skip_emulated_instruction(vcpu); 4703 skip_emulated_instruction(vcpu);
4714 4704
4715 if (kvm_task_switch(vcpu, tss_selector, reason, 4705 if (kvm_task_switch(vcpu, tss_selector,
4716 has_error_code, error_code) == EMULATE_FAIL) { 4706 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
4707 has_error_code, error_code) == EMULATE_FAIL) {
4717 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4708 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4718 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4709 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4719 vcpu->run->internal.ndata = 0; 4710 vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f844..4044ce0bf7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
97u32 kvm_max_guest_tsc_khz; 97u32 kvm_max_guest_tsc_khz;
98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
99 99
100/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
101static u32 tsc_tolerance_ppm = 250;
102module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
103
100#define KVM_NR_SHARED_MSRS 16 104#define KVM_NR_SHARED_MSRS 16
101 105
102struct kvm_shared_msrs_global { 106struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
969static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 973static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
970unsigned long max_tsc_khz; 974unsigned long max_tsc_khz;
971 975
972static inline int kvm_tsc_changes_freq(void) 976static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
973{ 977{
974 int cpu = get_cpu(); 978 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
975 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 979 vcpu->arch.virtual_tsc_shift);
976 cpufreq_quick_get(cpu) != 0;
977 put_cpu();
978 return ret;
979} 980}
980 981
981u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) 982static u32 adjust_tsc_khz(u32 khz, s32 ppm)
982{ 983{
983 if (vcpu->arch.virtual_tsc_khz) 984 u64 v = (u64)khz * (1000000 + ppm);
984 return vcpu->arch.virtual_tsc_khz; 985 do_div(v, 1000000);
985 else 986 return v;
986 return __this_cpu_read(cpu_tsc_khz);
987} 987}
988 988
989static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 989static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
990{ 990{
991 u64 ret; 991 u32 thresh_lo, thresh_hi;
992 992 int use_scaling = 0;
993 WARN_ON(preemptible());
994 if (kvm_tsc_changes_freq())
995 printk_once(KERN_WARNING
996 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
997 ret = nsec * vcpu_tsc_khz(vcpu);
998 do_div(ret, USEC_PER_SEC);
999 return ret;
1000}
1001 993
1002static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1003{
1004 /* Compute a scale to convert nanoseconds in TSC cycles */ 994 /* Compute a scale to convert nanoseconds in TSC cycles */
1005 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 995 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1006 &vcpu->arch.tsc_catchup_shift, 996 &vcpu->arch.virtual_tsc_shift,
1007 &vcpu->arch.tsc_catchup_mult); 997 &vcpu->arch.virtual_tsc_mult);
998 vcpu->arch.virtual_tsc_khz = this_tsc_khz;
999
1000 /*
1001 * Compute the variation in TSC rate which is acceptable
1002 * within the range of tolerance and decide if the
1003 * rate being applied is within that bounds of the hardware
1004 * rate. If so, no scaling or compensation need be done.
1005 */
1006 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1007 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1008 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1009 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1010 use_scaling = 1;
1011 }
1012 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1008} 1013}
1009 1014
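kvm_set_tsc_khz only engages scaling or catchup when the requested rate falls outside a +/- tsc_tolerance_ppm window around the host rate. With the default 250 ppm and a 2 GHz host, for example, the window is [1999500, 2000500] kHz. A standalone sketch of the threshold arithmetic (the helper mirrors the patch's adjust_tsc_khz; the host rate is invented):

#include <stdint.h>
#include <stdio.h>

static uint32_t adjust_tsc_khz(uint32_t khz, int32_t ppm)
{
    uint64_t v = (uint64_t)khz * (1000000 + ppm);
    return (uint32_t)(v / 1000000);
}

int main(void)
{
    uint32_t host_khz = 2000000;   /* 2 GHz host TSC */
    uint32_t lo = adjust_tsc_khz(host_khz, -250);
    uint32_t hi = adjust_tsc_khz(host_khz, 250);

    printf("no-scaling window: [%u, %u] kHz\n", lo, hi);
    /* prints [1999500, 2000500]; rates outside need scaling or catchup */
    return 0;
}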
1010static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1015static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1011{ 1016{
1012 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1017 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1013 vcpu->arch.tsc_catchup_mult, 1018 vcpu->arch.virtual_tsc_mult,
1014 vcpu->arch.tsc_catchup_shift); 1019 vcpu->arch.virtual_tsc_shift);
1015 tsc += vcpu->arch.last_tsc_write; 1020 tsc += vcpu->arch.this_tsc_write;
1016 return tsc; 1021 return tsc;
1017} 1022}
1018 1023
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021 struct kvm *kvm = vcpu->kvm; 1026 struct kvm *kvm = vcpu->kvm;
1022 u64 offset, ns, elapsed; 1027 u64 offset, ns, elapsed;
1023 unsigned long flags; 1028 unsigned long flags;
1024 s64 sdiff; 1029 s64 usdiff;
1025 1030
1026 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1031 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1027 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1032 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1028 ns = get_kernel_ns(); 1033 ns = get_kernel_ns();
1029 elapsed = ns - kvm->arch.last_tsc_nsec; 1034 elapsed = ns - kvm->arch.last_tsc_nsec;
1030 sdiff = data - kvm->arch.last_tsc_write; 1035
1031 if (sdiff < 0) 1036 /* n.b - signed multiplication and division required */
1032 sdiff = -sdiff; 1037 usdiff = data - kvm->arch.last_tsc_write;
1038#ifdef CONFIG_X86_64
1039 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1040#else
1041 /* do_div() only does unsigned */
1042 asm("idivl %2; xor %%edx, %%edx"
1043 : "=A"(usdiff)
1044 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1045#endif
1046 do_div(elapsed, 1000);
1047 usdiff -= elapsed;
1048 if (usdiff < 0)
1049 usdiff = -usdiff;
1033 1050
1034 /* 1051 /*
1035 * Special case: close write to TSC within 5 seconds of 1052 * Special case: TSC write with a small delta (1 second) of virtual
1036 * another CPU is interpreted as an attempt to synchronize 1053 * cycle time against real time is interpreted as an attempt to
1037 * The 5 seconds is to accommodate host load / swapping as 1054 * synchronize the CPU.
1038 * well as any reset of TSC during the boot process. 1055 *
1039 * 1056 * For a reliable TSC, we can match TSC offsets, and for an unstable
1040 * In that case, for a reliable TSC, we can match TSC offsets, 1057 * TSC, we add elapsed time in this computation. We could let the
1041 * or make a best guest using elapsed value. 1058 * compensation code attempt to catch up if we fall behind, but
1042 */ 1059 * it's better to try to match offsets from the beginning.
1043 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && 1060 */
1044 elapsed < 5ULL * NSEC_PER_SEC) { 1061 if (usdiff < USEC_PER_SEC &&
1062 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1045 if (!check_tsc_unstable()) { 1063 if (!check_tsc_unstable()) {
1046 offset = kvm->arch.last_tsc_offset; 1064 offset = kvm->arch.cur_tsc_offset;
1047 pr_debug("kvm: matched tsc offset for %llu\n", data); 1065 pr_debug("kvm: matched tsc offset for %llu\n", data);
1048 } else { 1066 } else {
1049 u64 delta = nsec_to_cycles(vcpu, elapsed); 1067 u64 delta = nsec_to_cycles(vcpu, elapsed);
1050 offset += delta; 1068 data += delta;
1069 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1051 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1070 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1052 } 1071 }
1053 ns = kvm->arch.last_tsc_nsec; 1072 } else {
1073 /*
1074 * We split periods of matched TSC writes into generations.
1075 * For each generation, we track the original measured
1076 * nanosecond time, offset, and write, so if TSCs are in
1077 * sync, we can match exact offset, and if not, we can match
 1078 * exact software computation in compute_guest_tsc()
1079 *
1080 * These values are tracked in kvm->arch.cur_xxx variables.
1081 */
1082 kvm->arch.cur_tsc_generation++;
1083 kvm->arch.cur_tsc_nsec = ns;
1084 kvm->arch.cur_tsc_write = data;
1085 kvm->arch.cur_tsc_offset = offset;
1086 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1087 kvm->arch.cur_tsc_generation, data);
1054 } 1088 }
1089
1090 /*
 1091 * We also track the most recently recorded KHZ, write, and time to
1092 * allow the matching interval to be extended at each write.
1093 */
1055 kvm->arch.last_tsc_nsec = ns; 1094 kvm->arch.last_tsc_nsec = ns;
1056 kvm->arch.last_tsc_write = data; 1095 kvm->arch.last_tsc_write = data;
1057 kvm->arch.last_tsc_offset = offset; 1096 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1058 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1059 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1060 1097
1061 /* Reset of TSC must disable overshoot protection below */ 1098 /* Reset of TSC must disable overshoot protection below */
1062 vcpu->arch.hv_clock.tsc_timestamp = 0; 1099 vcpu->arch.hv_clock.tsc_timestamp = 0;
1063 vcpu->arch.last_tsc_write = data; 1100 vcpu->arch.last_guest_tsc = data;
1064 vcpu->arch.last_tsc_nsec = ns; 1101
1102 /* Keep track of which generation this VCPU has synchronized to */
1103 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1104 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1105 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1106
1107 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1108 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1065} 1109}
1110
1066EXPORT_SYMBOL_GPL(kvm_write_tsc); 1111EXPORT_SYMBOL_GPL(kvm_write_tsc);
1067 1112
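The heart of the new kvm_write_tsc heuristic above is converting the cycle delta between writes into microseconds of virtual time and comparing it against elapsed wall time; only writes within one second are treated as a guest or SMP synchronization attempt. A standalone sketch with invented numbers (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t virtual_tsc_khz = 2000000;  /* guest runs at 2 GHz */
    int64_t cycle_delta = 1000000;       /* new write minus last write */
    int64_t elapsed_us = 450;            /* wall time since last write */

    /* cycles -> microseconds of virtual time: cycles * 1000 / khz */
    int64_t usdiff = cycle_delta * 1000 / virtual_tsc_khz;
    usdiff -= elapsed_us;
    if (usdiff < 0)
        usdiff = -usdiff;

    /* a write within one second of virtual time is treated as a sync
     * attempt, so the existing offset (or a computed catchup) is reused */
    printf("%s\n", usdiff < 1000000 ? "matched generation"
                                    : "new tsc generation");
    return 0;
}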
1068static int kvm_guest_time_update(struct kvm_vcpu *v) 1113static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1078 local_irq_save(flags); 1123 local_irq_save(flags);
1079 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); 1124 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1080 kernel_ns = get_kernel_ns(); 1125 kernel_ns = get_kernel_ns();
1081 this_tsc_khz = vcpu_tsc_khz(v); 1126 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1082 if (unlikely(this_tsc_khz == 0)) { 1127 if (unlikely(this_tsc_khz == 0)) {
1083 local_irq_restore(flags); 1128 local_irq_restore(flags);
1084 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1129 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1098 if (vcpu->tsc_catchup) { 1143 if (vcpu->tsc_catchup) {
1099 u64 tsc = compute_guest_tsc(v, kernel_ns); 1144 u64 tsc = compute_guest_tsc(v, kernel_ns);
1100 if (tsc > tsc_timestamp) { 1145 if (tsc > tsc_timestamp) {
1101 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); 1146 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1102 tsc_timestamp = tsc; 1147 tsc_timestamp = tsc;
1103 } 1148 }
1104 } 1149 }
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1130 * observed by the guest and ensure the new system time is greater. 1175 * observed by the guest and ensure the new system time is greater.
1131 */ 1176 */
1132 max_kernel_ns = 0; 1177 max_kernel_ns = 0;
1133 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { 1178 if (vcpu->hv_clock.tsc_timestamp) {
1134 max_kernel_ns = vcpu->last_guest_tsc - 1179 max_kernel_ns = vcpu->last_guest_tsc -
1135 vcpu->hv_clock.tsc_timestamp; 1180 vcpu->hv_clock.tsc_timestamp;
1136 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1181 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1504 case MSR_K7_HWCR: 1549 case MSR_K7_HWCR:
1505 data &= ~(u64)0x40; /* ignore flush filter disable */ 1550 data &= ~(u64)0x40; /* ignore flush filter disable */
1506 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */
1507 if (data != 0) { 1553 if (data != 0) {
1508 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1509 data); 1555 data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1676 */ 1722 */
1677 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1678 break; 1724 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu))
1727 return 1;
1728 vcpu->arch.osvw.length = data;
1729 break;
1730 case MSR_AMD64_OSVW_STATUS:
1731 if (!guest_cpuid_has_osvw(vcpu))
1732 return 1;
1733 vcpu->arch.osvw.status = data;
1734 break;
1679 default: 1735 default:
1680 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1736 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1681 return xen_hvm_config(vcpu, data); 1737 return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1960 */ 2016 */
1961 data = 0xbe702111; 2017 data = 0xbe702111;
1962 break; 2018 break;
2019 case MSR_AMD64_OSVW_ID_LENGTH:
2020 if (!guest_cpuid_has_osvw(vcpu))
2021 return 1;
2022 data = vcpu->arch.osvw.length;
2023 break;
2024 case MSR_AMD64_OSVW_STATUS:
2025 if (!guest_cpuid_has_osvw(vcpu))
2026 return 1;
2027 data = vcpu->arch.osvw.status;
2028 break;
1963 default: 2029 default:
1964 if (kvm_pmu_msr(vcpu, msr)) 2030 if (kvm_pmu_msr(vcpu, msr))
1965 return kvm_pmu_get_msr(vcpu, msr, pdata); 2031 return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2080 case KVM_CAP_XSAVE: 2146 case KVM_CAP_XSAVE:
2081 case KVM_CAP_ASYNC_PF: 2147 case KVM_CAP_ASYNC_PF:
2082 case KVM_CAP_GET_TSC_KHZ: 2148 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3:
2083 r = 1; 2150 r = 1;
2084 break; 2151 break;
2085 case KVM_CAP_COALESCED_MMIO: 2152 case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2214 } 2281 }
2215 2282
2216 kvm_x86_ops->vcpu_load(vcpu, cpu); 2283 kvm_x86_ops->vcpu_load(vcpu, cpu);
2217 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2218 /* Make sure TSC doesn't go backwards */
2219 s64 tsc_delta;
2220 u64 tsc;
2221 2284
2222 tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2285 /* Apply any externally detected TSC adjustments (due to suspend) */
2223 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : 2286 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2224 tsc - vcpu->arch.last_guest_tsc; 2287 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2288 vcpu->arch.tsc_offset_adjustment = 0;
2289 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2290 }
2225 2291
2292 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2293 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2294 native_read_tsc() - vcpu->arch.last_host_tsc;
2226 if (tsc_delta < 0) 2295 if (tsc_delta < 0)
2227 mark_tsc_unstable("KVM discovered backwards TSC"); 2296 mark_tsc_unstable("KVM discovered backwards TSC");
2228 if (check_tsc_unstable()) { 2297 if (check_tsc_unstable()) {
2229 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2298 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2299 vcpu->arch.last_guest_tsc);
2300 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2230 vcpu->arch.tsc_catchup = 1; 2301 vcpu->arch.tsc_catchup = 1;
2231 } 2302 }
2232 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2303 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2243{ 2314{
2244 kvm_x86_ops->vcpu_put(vcpu); 2315 kvm_x86_ops->vcpu_put(vcpu);
2245 kvm_put_guest_fpu(vcpu); 2316 kvm_put_guest_fpu(vcpu);
2246 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2317 vcpu->arch.last_host_tsc = native_read_tsc();
2247} 2318}
2248 2319
2249static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2320static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2785 u32 user_tsc_khz; 2856 u32 user_tsc_khz;
2786 2857
2787 r = -EINVAL; 2858 r = -EINVAL;
2788 if (!kvm_has_tsc_control)
2789 break;
2790
2791 user_tsc_khz = (u32)arg; 2859 user_tsc_khz = (u32)arg;
2792 2860
2793 if (user_tsc_khz >= kvm_max_guest_tsc_khz) 2861 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
2794 goto out; 2862 goto out;
2795 2863
2796 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); 2864 if (user_tsc_khz == 0)
2865 user_tsc_khz = tsc_khz;
2866
2867 kvm_set_tsc_khz(vcpu, user_tsc_khz);
2797 2868
2798 r = 0; 2869 r = 0;
2799 goto out; 2870 goto out;
2800 } 2871 }
2801 case KVM_GET_TSC_KHZ: { 2872 case KVM_GET_TSC_KHZ: {
2802 r = -EIO; 2873 r = vcpu->arch.virtual_tsc_khz;
2803 if (check_tsc_unstable())
2804 goto out;
2805
2806 r = vcpu_tsc_khz(vcpu);
2807
2808 goto out; 2874 goto out;
2809 } 2875 }
2810 default: 2876 default:
@@ -2815,6 +2881,11 @@ out:
2815 return r; 2881 return r;
2816} 2882}
2817 2883
2884int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
2885{
2886 return VM_FAULT_SIGBUS;
2887}
2888
2818static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2889static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2819{ 2890{
2820 int ret; 2891 int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
2998 unsigned long *dirty_bitmap, 3069 unsigned long *dirty_bitmap,
2999 unsigned long nr_dirty_pages) 3070 unsigned long nr_dirty_pages)
3000{ 3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3001 /* Not many dirty pages compared to # of shadow pages. */ 3074 /* Not many dirty pages compared to # of shadow pages. */
3002 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { 3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3003 unsigned long gfn_offset; 3076 unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
3005 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { 3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3006 unsigned long gfn = memslot->base_gfn + gfn_offset; 3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3007 3080
3008 spin_lock(&kvm->mmu_lock);
3009 kvm_mmu_rmap_write_protect(kvm, gfn, memslot); 3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3010 spin_unlock(&kvm->mmu_lock);
3011 } 3082 }
3012 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
3013 } else { 3084 } else
3014 spin_lock(&kvm->mmu_lock);
3015 kvm_mmu_slot_remove_write_access(kvm, memslot->id); 3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3016 spin_unlock(&kvm->mmu_lock); 3086
3017 } 3087 spin_unlock(&kvm->mmu_lock);
3018} 3088}
3019 3089
3020/* 3090/*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
3133 r = -EEXIST; 3203 r = -EEXIST;
3134 if (kvm->arch.vpic) 3204 if (kvm->arch.vpic)
3135 goto create_irqchip_unlock; 3205 goto create_irqchip_unlock;
3206 r = -EINVAL;
3207 if (atomic_read(&kvm->online_vcpus))
3208 goto create_irqchip_unlock;
3136 r = -ENOMEM; 3209 r = -ENOMEM;
3137 vpic = kvm_create_pic(kvm); 3210 vpic = kvm_create_pic(kvm);
3138 if (vpic) { 3211 if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4063 return res; 4136 return res;
4064} 4137}
4065 4138
4139static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4140{
4141 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4142}
4143
4066static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4144static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4067{ 4145{
4068 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4146 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
4244 .set_idt = emulator_set_idt, 4322 .set_idt = emulator_set_idt,
4245 .get_cr = emulator_get_cr, 4323 .get_cr = emulator_get_cr,
4246 .set_cr = emulator_set_cr, 4324 .set_cr = emulator_set_cr,
4325 .set_rflags = emulator_set_rflags,
4247 .cpl = emulator_get_cpl, 4326 .cpl = emulator_get_cpl,
4248 .get_dr = emulator_get_dr, 4327 .get_dr = emulator_get_dr,
4249 .set_dr = emulator_set_dr, 4328 .set_dr = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5288 profile_hit(KVM_PROFILING, (void *)rip); 5367 profile_hit(KVM_PROFILING, (void *)rip);
5289 } 5368 }
5290 5369
5370 if (unlikely(vcpu->arch.tsc_always_catchup))
5371 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5291 5372
5292 kvm_lapic_sync_from_vapic(vcpu); 5373 kvm_lapic_sync_from_vapic(vcpu);
5293 5374
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5587 return 0; 5668 return 0;
5588} 5669}
5589 5670
5590int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5671int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
5591 bool has_error_code, u32 error_code) 5672 int reason, bool has_error_code, u32 error_code)
5592{ 5673{
5593 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 5674 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5594 int ret; 5675 int ret;
5595 5676
5596 init_emulate_ctxt(vcpu); 5677 init_emulate_ctxt(vcpu);
5597 5678
5598 ret = emulator_task_switch(ctxt, tss_selector, reason, 5679 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
5599 has_error_code, error_code); 5680 has_error_code, error_code);
5600 5681
5601 if (ret) 5682 if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
5928 struct kvm *kvm; 6009 struct kvm *kvm;
5929 struct kvm_vcpu *vcpu; 6010 struct kvm_vcpu *vcpu;
5930 int i; 6011 int i;
6012 int ret;
6013 u64 local_tsc;
6014 u64 max_tsc = 0;
6015 bool stable, backwards_tsc = false;
5931 6016
5932 kvm_shared_msr_cpu_online(); 6017 kvm_shared_msr_cpu_online();
5933 list_for_each_entry(kvm, &vm_list, vm_list) 6018 ret = kvm_x86_ops->hardware_enable(garbage);
5934 kvm_for_each_vcpu(i, vcpu, kvm) 6019 if (ret != 0)
5935 if (vcpu->cpu == smp_processor_id()) 6020 return ret;
5936 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 6021
5937 return kvm_x86_ops->hardware_enable(garbage); 6022 local_tsc = native_read_tsc();
6023 stable = !check_tsc_unstable();
6024 list_for_each_entry(kvm, &vm_list, vm_list) {
6025 kvm_for_each_vcpu(i, vcpu, kvm) {
6026 if (!stable && vcpu->cpu == smp_processor_id())
6027 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6028 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6029 backwards_tsc = true;
6030 if (vcpu->arch.last_host_tsc > max_tsc)
6031 max_tsc = vcpu->arch.last_host_tsc;
6032 }
6033 }
6034 }
6035
6036 /*
6037 * Sometimes, even reliable TSCs go backwards. This happens on
6038 * platforms that reset TSC during suspend or hibernate actions, but
6039 * maintain synchronization. We must compensate. Fortunately, we can
6040 * detect that condition here, which happens early in CPU bringup,
6041 * before any KVM threads can be running. Unfortunately, we can't
6042 * bring the TSCs fully up to date with real time, as we aren't yet far
6043 * enough into CPU bringup that we know how much real time has actually
6044 * elapsed; our helper function, get_kernel_ns() will be using boot
6045 * variables that haven't been updated yet.
6046 *
6047 * So we simply find the maximum observed TSC above, then record the
6048 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
6049 * the adjustment will be applied. Note that we accumulate
6050 * adjustments, in case multiple suspend cycles happen before some VCPU
6051 * gets a chance to run again. In the event that no KVM threads get a
6052 * chance to run, we will miss the entire elapsed period, as we'll have
6053 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
 6054 * lose cycle time. This isn't too big a deal, since the loss will be
6055 * uniform across all VCPUs (not to mention the scenario is extremely
6056 * unlikely). It is possible that a second hibernate recovery happens
6057 * much faster than a first, causing the observed TSC here to be
6058 * smaller; this would require additional padding adjustment, which is
6059 * why we set last_host_tsc to the local tsc observed here.
6060 *
6061 * N.B. - this code below runs only on platforms with reliable TSC,
6062 * as that is the only way backwards_tsc is set above. Also note
6063 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6064 * have the same delta_cyc adjustment applied if backwards_tsc
6065 * is detected. Note further, this adjustment is only done once,
6066 * as we reset last_host_tsc on all VCPUs to stop this from being
6067 * called multiple times (one for each physical CPU bringup).
6068 *
 6069 * Platforms with unreliable TSCs don't have to deal with this, they
6070 * will be compensated by the logic in vcpu_load, which sets the TSC to
6071 * catchup mode. This will catchup all VCPUs to real time, but cannot
6072 * guarantee that they stay in perfect synchronization.
6073 */
6074 if (backwards_tsc) {
6075 u64 delta_cyc = max_tsc - local_tsc;
6076 list_for_each_entry(kvm, &vm_list, vm_list) {
6077 kvm_for_each_vcpu(i, vcpu, kvm) {
6078 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6079 vcpu->arch.last_host_tsc = local_tsc;
6080 }
6081
6082 /*
 6083 * We have to disable TSC offset matching: if you were
 6084 * booting a VM while issuing an S4 host suspend,
 6085 * you may have some problems. Solving this issue is
6086 * left as an exercise to the reader.
6087 */
6088 kvm->arch.last_tsc_nsec = 0;
6089 kvm->arch.last_tsc_write = 0;
6090 }
6091
6092 }
6093 return 0;
5938} 6094}
5939 6095
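On hosts whose otherwise-stable TSC resets across suspend, the code above pads every VCPU forward by the gap between the largest TSC seen before sleeping and the freshly reset local TSC. A toy sketch of that bookkeeping with invented values (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t local_tsc = 1000;      /* TSC just after resume (near zero) */
    uint64_t last_host_tsc[] = { 900000, 950000 };  /* per-VCPU snapshots */
    uint64_t max_tsc = 0;

    for (int i = 0; i < 2; i++)
        if (last_host_tsc[i] > max_tsc)
            max_tsc = last_host_tsc[i];

    uint64_t delta_cyc = max_tsc - local_tsc;  /* 949000 cycles of padding */
    for (int i = 0; i < 2; i++)
        printf("vcpu%d: tsc_offset_adjustment += %llu\n", i,
               (unsigned long long)delta_cyc);
    return 0;
}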
5940void kvm_arch_hardware_disable(void *garbage) 6096void kvm_arch_hardware_disable(void *garbage)
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
5958 kvm_x86_ops->check_processor_compatibility(rtn); 6114 kvm_x86_ops->check_processor_compatibility(rtn);
5959} 6115}
5960 6116
6117bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6118{
6119 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6120}
6121
5961int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6122int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5962{ 6123{
5963 struct page *page; 6124 struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5980 } 6141 }
5981 vcpu->arch.pio_data = page_address(page); 6142 vcpu->arch.pio_data = page_address(page);
5982 6143
5983 kvm_init_tsc_catchup(vcpu, max_tsc_khz); 6144 kvm_set_tsc_khz(vcpu, max_tsc_khz);
5984 6145
5985 r = kvm_mmu_create(vcpu); 6146 r = kvm_mmu_create(vcpu);
5986 if (r < 0) 6147 if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6032 free_page((unsigned long)vcpu->arch.pio_data); 6193 free_page((unsigned long)vcpu->arch.pio_data);
6033} 6194}
6034 6195
6035int kvm_arch_init_vm(struct kvm *kvm) 6196int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6036{ 6197{
6198 if (type)
6199 return -EINVAL;
6200
6037 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6201 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6038 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6202 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6039 6203
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
6093 put_page(kvm->arch.ept_identity_pagetable); 6257 put_page(kvm->arch.ept_identity_pagetable);
6094} 6258}
6095 6259
6260void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6261 struct kvm_memory_slot *dont)
6262{
6263 int i;
6264
6265 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6266 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6267 vfree(free->arch.lpage_info[i]);
6268 free->arch.lpage_info[i] = NULL;
6269 }
6270 }
6271}
6272
6273int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6274{
6275 int i;
6276
6277 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6278 unsigned long ugfn;
6279 int lpages;
6280 int level = i + 2;
6281
6282 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6283 slot->base_gfn, level) + 1;
6284
6285 slot->arch.lpage_info[i] =
6286 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6287 if (!slot->arch.lpage_info[i])
6288 goto out_free;
6289
6290 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6291 slot->arch.lpage_info[i][0].write_count = 1;
6292 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6293 slot->arch.lpage_info[i][lpages - 1].write_count = 1;
6294 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6295 /*
6296 * If the gfn and userspace address are not aligned wrt each
6297 * other, or if explicitly asked to, disable large page
6298 * support for this slot
6299 */
6300 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
6301 !kvm_largepages_enabled()) {
6302 unsigned long j;
6303
6304 for (j = 0; j < lpages; ++j)
6305 slot->arch.lpage_info[i][j].write_count = 1;
6306 }
6307 }
6308
6309 return 0;
6310
6311out_free:
6312 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6313 vfree(slot->arch.lpage_info[i]);
6314 slot->arch.lpage_info[i] = NULL;
6315 }
6316 return -ENOMEM;
6317}
6318
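kvm_arch_create_memslot above sizes one lpage_info array per large-page level; the index math works on gfns shifted down by 9 bits per level above 4 KB. A standalone sketch of the sizing for a 6 MB slot that starts at an unaligned gfn (the helper mirrors the kernel's gfn_to_index; the slot geometry is invented):

#include <stdint.h>
#include <stdio.h>

/* 9 page-table index bits per level: level 2 = 2 MB, level 3 = 1 GB */
static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
    unsigned int shift = (level - 1) * 9;
    return (gfn >> shift) - (base_gfn >> shift);
}

int main(void)
{
    uint64_t base_gfn = 0x100;   /* slot starts at 1 MB, not 2 MB aligned */
    uint64_t npages = 1536;      /* 6 MB worth of 4 KB pages */
    int level = 2;               /* 2 MB large pages */

    uint64_t lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;
    printf("level %d needs %llu lpage_info entries\n",
           level, (unsigned long long)lpages);
    /* prints 4; the unaligned head frame gets write_count = 1 so it is
     * never mapped with a large page */
    return 0;
}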
6096int kvm_arch_prepare_memory_region(struct kvm *kvm, 6319int kvm_arch_prepare_memory_region(struct kvm *kvm,
6097 struct kvm_memory_slot *memslot, 6320 struct kvm_memory_slot *memslot,
6098 struct kvm_memory_slot old, 6321 struct kvm_memory_slot old,
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 036efbea8b2..aef7140c006 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
1#include <linux/bug.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2 3
3#include "opcode.h" 4#include "opcode.h"
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 49a5cb55429..ed2835e148b 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -416,7 +416,12 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
416 kfree(sd); 416 kfree(sd);
417 } else { 417 } else {
418 get_current_resources(device, busnum, domain, &resources); 418 get_current_resources(device, busnum, domain, &resources);
419 if (list_empty(&resources)) 419
 420 /*
 421 * _CRS with no apertures is normal, so only fall back to
 422 * defaults or native bridge info if we're ignoring _CRS.
 423 */
 424 if (!pci_use_crs)
420 x86_pci_root_bus_resources(busnum, &resources); 425 x86_pci_root_bus_resources(busnum, &resources);
421 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, 426 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
422 &resources); 427 &resources);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6dd89555fbf..d0e6e403b4f 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -164,11 +164,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_
164 */ 164 */
165static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) 165static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev)
166{ 166{
167 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && 167 if ((dev->device & 0xff00) == 0x2400)
168 (dev->device & 0xff00) == 0x2400)
169 dev->transparent = 1; 168 dev->transparent = 1;
170} 169}
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge); 170DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
171 PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge);
172 172
173/* 173/*
174 * Fixup for C1 Halt Disconnect problem on nForce2 systems. 174 * Fixup for C1 Halt Disconnect problem on nForce2 systems.
@@ -322,9 +322,6 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
322 struct pci_bus *bus; 322 struct pci_bus *bus;
323 u16 config; 323 u16 config;
324 324
325 if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
326 return;
327
328 /* Is VGA routed to us? */ 325 /* Is VGA routed to us? */
329 bus = pdev->bus; 326 bus = pdev->bus;
330 while (bus) { 327 while (bus) {
@@ -353,7 +350,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
353 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); 350 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
354 } 351 }
355} 352}
356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); 353DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
354 PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
357 355
358 356
359static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { 357static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
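Both fixup.c hunks above replace an open-coded class check at the top of the hook with the new DECLARE_PCI_FIXUP_CLASS_* registration macros, so the PCI core filters on (dev->class >> class_shift) == class before the hook ever runs. A sketch of the same pattern, assuming those macros; the device choice and hook body are illustrative:

#include <linux/pci.h>

static void __devinit quirk_report_vga(struct pci_dev *dev)
{
	dev_info(&dev->dev, "VGA-class device seen by fixup\n");
}
/* shift 8 drops the prog-if byte, matching class + subclass only */
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
				PCI_CLASS_DISPLAY_VGA, 8, quirk_report_vga);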
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 91821a1a0c3..831971e731f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -39,6 +39,87 @@
39#include <asm/io_apic.h> 39#include <asm/io_apic.h>
40 40
41 41
42/*
43 * This list of dynamic mappings is for temporarily maintaining
44 * original BIOS BAR addresses for possible reinstatement.
45 */
46struct pcibios_fwaddrmap {
47 struct list_head list;
48 struct pci_dev *dev;
49 resource_size_t fw_addr[DEVICE_COUNT_RESOURCE];
50};
51
52static LIST_HEAD(pcibios_fwaddrmappings);
53static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock);
54
 55/* Must be called with 'pcibios_fwaddrmap_lock' held. */
56static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
57{
58 struct pcibios_fwaddrmap *map;
59
60 WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock));
61
62 list_for_each_entry(map, &pcibios_fwaddrmappings, list)
63 if (map->dev == dev)
64 return map;
65
66 return NULL;
67}
68
69static void
70pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr)
71{
72 unsigned long flags;
73 struct pcibios_fwaddrmap *map;
74
75 spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
76 map = pcibios_fwaddrmap_lookup(dev);
77 if (!map) {
78 spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
79 map = kzalloc(sizeof(*map), GFP_KERNEL);
80 if (!map)
81 return;
82
83 map->dev = pci_dev_get(dev);
84 map->fw_addr[idx] = fw_addr;
85 INIT_LIST_HEAD(&map->list);
86
87 spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
88 list_add_tail(&map->list, &pcibios_fwaddrmappings);
89 } else
90 map->fw_addr[idx] = fw_addr;
91 spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
92}
93
94resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
95{
96 unsigned long flags;
97 struct pcibios_fwaddrmap *map;
98 resource_size_t fw_addr = 0;
99
100 spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
101 map = pcibios_fwaddrmap_lookup(dev);
102 if (map)
103 fw_addr = map->fw_addr[idx];
104 spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
105
106 return fw_addr;
107}
108
109static void pcibios_fw_addr_list_del(void)
110{
111 unsigned long flags;
112 struct pcibios_fwaddrmap *entry, *next;
113
114 spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
115 list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) {
116 list_del(&entry->list);
117 pci_dev_put(entry->dev);
118 kfree(entry);
119 }
120 spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
121}
122
42static int 123static int
43skip_isa_ioresource_align(struct pci_dev *dev) { 124skip_isa_ioresource_align(struct pci_dev *dev) {
44 125
@@ -182,7 +263,8 @@ static void __init pcibios_allocate_resources(int pass)
182 idx, r, disabled, pass); 263 idx, r, disabled, pass);
183 if (pci_claim_resource(dev, idx) < 0) { 264 if (pci_claim_resource(dev, idx) < 0) {
184 /* We'll assign a new address later */ 265 /* We'll assign a new address later */
185 dev->fw_addr[idx] = r->start; 266 pcibios_save_fw_addr(dev,
267 idx, r->start);
186 r->end -= r->start; 268 r->end -= r->start;
187 r->start = 0; 269 r->start = 0;
188 } 270 }
@@ -228,6 +310,7 @@ static int __init pcibios_assign_resources(void)
228 } 310 }
229 311
230 pci_assign_unassigned_resources(); 312 pci_assign_unassigned_resources();
313 pcibios_fw_addr_list_del();
231 314
232 return 0; 315 return 0;
233} 316}
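The i386.c hunk moves the saved BIOS BAR addresses out of struct pci_dev and into a global list, so the per-device fw_addr[] array can go away. Note how pcibios_save_fw_addr() drops the spinlock around kzalloc(GFP_KERNEL), which may sleep and so must not run with a spinlock held and interrupts off. A condensed sketch of that unlock-allocate-relock shape, with illustrative names:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct addr_entry {
	struct list_head list;
	void *key;
};

static LIST_HEAD(addr_list);
static DEFINE_SPINLOCK(addr_lock);

static void save_entry(void *key)
{
	struct addr_entry *e;
	unsigned long flags;

	spin_lock_irqsave(&addr_lock, flags);
	/* ...lookup omitted; assume 'key' is not yet present... */
	spin_unlock_irqrestore(&addr_lock, flags);

	e = kzalloc(sizeof(*e), GFP_KERNEL);	/* may sleep: no lock held */
	if (!e)
		return;
	e->key = key;

	spin_lock_irqsave(&addr_lock, flags);
	list_add_tail(&e->list, &addr_list);
	spin_unlock_irqrestore(&addr_lock, flags);
}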
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index cb29191cee5..140942f66b3 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -43,6 +43,8 @@
43#define PCI_FIXED_BAR_4_SIZE 0x14 43#define PCI_FIXED_BAR_4_SIZE 0x14
44#define PCI_FIXED_BAR_5_SIZE 0x1c 44#define PCI_FIXED_BAR_5_SIZE 0x1c
45 45
46static int pci_soc_mode = 0;
47
46/** 48/**
47 * fixed_bar_cap - return the offset of the fixed BAR cap if found 49 * fixed_bar_cap - return the offset of the fixed BAR cap if found
48 * @bus: PCI bus 50 * @bus: PCI bus
@@ -148,7 +150,9 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
148 */ 150 */
149 if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) 151 if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
150 return 0; 152 return 0;
151 if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) 153 if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
154 || devfn == PCI_DEVFN(0, 0)
155 || devfn == PCI_DEVFN(3, 0)))
152 return 1; 156 return 1;
153 return 0; /* langwell on others */ 157 return 0; /* langwell on others */
154} 158}
@@ -231,14 +235,43 @@ struct pci_ops pci_mrst_ops = {
231 */ 235 */
232int __init pci_mrst_init(void) 236int __init pci_mrst_init(void)
233{ 237{
234 printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); 238 printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n");
235 pci_mmcfg_late_init(); 239 pci_mmcfg_late_init();
236 pcibios_enable_irq = mrst_pci_irq_enable; 240 pcibios_enable_irq = mrst_pci_irq_enable;
237 pci_root_ops = pci_mrst_ops; 241 pci_root_ops = pci_mrst_ops;
242 pci_soc_mode = 1;
238 /* Continue with standard init */ 243 /* Continue with standard init */
239 return 1; 244 return 1;
240} 245}
241 246
247/* Langwell devices are not true PCI devices; they are not subject to the
248 * 10 ms D3-to-D0 delay required by the PCI spec.
249 */
250static void __devinit pci_d3delay_fixup(struct pci_dev *dev)
251{
252 /* PCI fixups are effectively decided at compile time. If we have a dual
253 * SoC/non-SoC kernel we don't want to mangle D3 on non-SoC devices. */
254 if (!pci_soc_mode)
255 return;
256 /* True PCI devices in Lincroft should allow type 1 access; the rest
257 * are Langwell fake PCI devices.
258 */
259 if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
260 return;
261 dev->d3_delay = 0;
262}
263DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
264
265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
266{
267 pci_set_power_state(dev, PCI_D3cold);
268}
269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
271DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev);
272DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev);
273DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev);
274
242/* 275/*
243 * Langwell devices reside at fixed offsets, don't try to move them. 276 * Langwell devices reside at fixed offsets, don't try to move them.
244 */ 277 */
@@ -248,6 +281,9 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
248 u32 size; 281 u32 size;
249 int i; 282 int i;
250 283
284 if (!pci_soc_mode)
285 return;
286
251 /* Must have extended configuration space */ 287 /* Must have extended configuration space */
252 if (dev->cfg_size < PCIE_CAP_OFFSET + 4) 288 if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
253 return; 289 return;
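DECLARE_PCI_FIXUP_* registrations are compiled in unconditionally, so mrst.c gates each hook on the pci_soc_mode flag set in pci_mrst_init(): on a dual SoC/non-SoC kernel the fixups are present but stay inert off-platform. A sketch of the runtime-gated-fixup shape, with illustrative names:

#include <linux/pci.h>

static int my_platform_active;	/* set during platform detection */

static void __devinit my_gated_fixup(struct pci_dev *dev)
{
	if (!my_platform_active)
		return;		/* compiled in, inert elsewhere */
	dev->d3_delay = 0;	/* example action, as in the hunk above */
}
DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, my_gated_fixup);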
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index e70be38ce03..ce874f872cc 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -208,16 +208,19 @@
208 interrupts = <14 1>; 208 interrupts = <14 1>;
209 }; 209 };
210 210
211 gpio@b,1 { 211 pcigpio: gpio@b,1 {
212 #gpio-cells = <2>;
213 #interrupt-cells = <2>;
212 compatible = "pci8086,2e67.2", 214 compatible = "pci8086,2e67.2",
213 "pci8086,2e67", 215 "pci8086,2e67",
214 "pciclassff0000", 216 "pciclassff0000",
215 "pciclassff00"; 217 "pciclassff00";
216 218
217 #gpio-cells = <2>;
218 reg = <0x15900 0x0 0x0 0x0 0x0>; 219 reg = <0x15900 0x0 0x0 0x0 0x0>;
219 interrupts = <15 1>; 220 interrupts = <15 1>;
221 interrupt-controller;
220 gpio-controller; 222 gpio-controller;
223 intel,muxctl = <0>;
221 }; 224 };
222 225
223 i2c-controller@b,2 { 226 i2c-controller@b,2 {
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
index 246b788847f..5b51194f4c8 100644
--- a/arch/x86/platform/geode/Makefile
+++ b/arch/x86/platform/geode/Makefile
@@ -1,2 +1,3 @@
1obj-$(CONFIG_ALIX) += alix.o 1obj-$(CONFIG_ALIX) += alix.o
2obj-$(CONFIG_NET5501) += net5501.o 2obj-$(CONFIG_NET5501) += net5501.o
3obj-$(CONFIG_GEOS) += geos.o
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
new file mode 100644
index 00000000000..c2e6d53558b
--- /dev/null
+++ b/arch/x86/platform/geode/geos.c
@@ -0,0 +1,128 @@
1/*
2 * System Specific setup for Traverse Technologies GEOS.
3 * At the moment this means setup of GPIO control of LEDs.
4 *
5 * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
6 * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
7 * and Philip Prindeville <philipp@redfish-solutions.com>
8 *
9 * TODO: There are large similarities with leds-net5501.c
10 * by Alessandro Zummo <a.zummo@towertech.it>
 11 * In the future leds-net5501.c should be migrated over to platform code.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation.
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/io.h>
21#include <linux/string.h>
22#include <linux/module.h>
23#include <linux/leds.h>
24#include <linux/platform_device.h>
25#include <linux/gpio.h>
26#include <linux/input.h>
27#include <linux/gpio_keys.h>
28#include <linux/dmi.h>
29
30#include <asm/geode.h>
31
32static struct gpio_keys_button geos_gpio_buttons[] = {
33 {
34 .code = KEY_RESTART,
35 .gpio = 3,
36 .active_low = 1,
37 .desc = "Reset button",
38 .type = EV_KEY,
39 .wakeup = 0,
40 .debounce_interval = 100,
41 .can_disable = 0,
42 }
43};
44static struct gpio_keys_platform_data geos_buttons_data = {
45 .buttons = geos_gpio_buttons,
46 .nbuttons = ARRAY_SIZE(geos_gpio_buttons),
47 .poll_interval = 20,
48};
49
50static struct platform_device geos_buttons_dev = {
51 .name = "gpio-keys-polled",
52 .id = 1,
53 .dev = {
54 .platform_data = &geos_buttons_data,
55 }
56};
57
58static struct gpio_led geos_leds[] = {
59 {
60 .name = "geos:1",
61 .gpio = 6,
62 .default_trigger = "default-on",
63 .active_low = 1,
64 },
65 {
66 .name = "geos:2",
67 .gpio = 25,
68 .default_trigger = "default-off",
69 .active_low = 1,
70 },
71 {
72 .name = "geos:3",
73 .gpio = 27,
74 .default_trigger = "default-off",
75 .active_low = 1,
76 },
77};
78
79static struct gpio_led_platform_data geos_leds_data = {
80 .num_leds = ARRAY_SIZE(geos_leds),
81 .leds = geos_leds,
82};
83
84static struct platform_device geos_leds_dev = {
85 .name = "leds-gpio",
86 .id = -1,
87 .dev.platform_data = &geos_leds_data,
88};
89
 90static struct platform_device *geos_devs[] __initdata = {
91 &geos_buttons_dev,
92 &geos_leds_dev,
93};
94
95static void __init register_geos(void)
96{
97 /* Setup LED control through leds-gpio driver */
98 platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
99}
100
101static int __init geos_init(void)
102{
103 const char *vendor, *product;
104
105 if (!is_geode())
106 return 0;
107
108 vendor = dmi_get_system_info(DMI_SYS_VENDOR);
109 if (!vendor || strcmp(vendor, "Traverse Technologies"))
110 return 0;
111
112 product = dmi_get_system_info(DMI_PRODUCT_NAME);
113 if (!product || strcmp(product, "Geos"))
114 return 0;
115
116 printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
117 KBUILD_MODNAME, vendor, product);
118
119 register_geos();
120
121 return 0;
122}
123
124module_init(geos_init);
125
126MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
127MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
128MODULE_LICENSE("GPL");
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 4889655ba78..47936830968 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -115,7 +115,7 @@ static void __save_processor_state(struct saved_context *ctxt)
115void save_processor_state(void) 115void save_processor_state(void)
116{ 116{
117 __save_processor_state(&saved_context); 117 __save_processor_state(&saved_context);
118 save_sched_clock_state(); 118 x86_platform.save_sched_clock_state();
119} 119}
120#ifdef CONFIG_X86_32 120#ifdef CONFIG_X86_32
121EXPORT_SYMBOL(save_processor_state); 121EXPORT_SYMBOL(save_processor_state);
@@ -231,8 +231,8 @@ static void __restore_processor_state(struct saved_context *ctxt)
231/* Needed by apm.c */ 231/* Needed by apm.c */
232void restore_processor_state(void) 232void restore_processor_state(void)
233{ 233{
234 x86_platform.restore_sched_clock_state();
234 __restore_processor_state(&saved_context); 235 __restore_processor_state(&saved_context);
235 restore_sched_clock_state();
236} 236}
237#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
238EXPORT_SYMBOL(restore_processor_state); 238EXPORT_SYMBOL(restore_processor_state);
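power/cpu.c now calls through x86_platform instead of the bare TSC helpers, so paravirt clocks can hook suspend/resume (the diffstat shows kvmclock.c doing exactly that). A sketch of how a platform would install its own handlers, assuming the new x86_platform_ops members added in this series; the example_* names are illustrative:

#include <linux/init.h>
#include <asm/x86_init.h>

static void example_save_sched_clock_state(void)
{
	/* snapshot whatever the platform's clocksource needs */
}

static void example_restore_sched_clock_state(void)
{
	/* reprogram the clocksource after resume */
}

static void __init example_platform_setup(void)
{
	x86_platform.save_sched_clock_state = example_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = example_restore_sched_clock_state;
}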
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index ce98e287c06..e7e67cc3c14 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -288,7 +288,7 @@
288279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend 288279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend
289280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive 289280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive
290281 i386 mq_notify sys_mq_notify compat_sys_mq_notify 290281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
291282 i386 mq_getsetaddr sys_mq_getsetattr compat_sys_mq_getsetattr 291282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr
292283 i386 kexec_load sys_kexec_load compat_sys_kexec_load 292283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
293284 i386 waitid sys_waitid compat_sys_waitid 293284 i386 waitid sys_waitid compat_sys_waitid
294# 285 sys_setaltroot 294# 285 sys_setaltroot
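The one-line table fix matters beyond cosmetics: the name column of syscall_32.tbl feeds the generated unistd headers, so the "mq_getsetaddr" typo would have produced a misspelled user-visible __NR_* constant. A userspace check, assuming headers generated from the fixed table:

#include <stdio.h>
#include <sys/syscall.h>

int main(void)
{
	printf("__NR_mq_getsetattr = %d\n", (int)__NR_mq_getsetattr);
	return 0;
}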
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index b2b54d2edf5..9926e11a772 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -15,8 +15,8 @@ config UML_X86
15 select GENERIC_FIND_FIRST_BIT 15 select GENERIC_FIND_FIRST_BIT
16 16
17config 64BIT 17config 64BIT
18 bool 18 bool "64-bit kernel" if SUBARCH = "x86"
19 default SUBARCH = "x86_64" 19 default SUBARCH != "i386"
20 20
21config X86_32 21config X86_32
22 def_bool !64BIT 22 def_bool !64BIT
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 2c32df6fe23..04f82e020f2 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -17,6 +17,16 @@
17#define ARCH_IS_STACKGROW(address) \ 17#define ARCH_IS_STACKGROW(address) \
18 (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs)) 18 (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs))
19 19
20#include <asm/user.h>
21
22/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
23static inline void rep_nop(void)
24{
25 __asm__ __volatile__("rep;nop": : :"memory");
26}
27
28#define cpu_relax() rep_nop()
29
20#include <asm/processor-generic.h> 30#include <asm/processor-generic.h>
21 31
22#endif 32#endif
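The UML header changes consolidate rep_nop()/cpu_relax() into the shared processor.h instead of duplicating them in the 32- and 64-bit subarch headers. cpu_relax() is meant for busy-wait loops like the sketch below (ACCESS_ONCE was the idiom of this era; 'flag' is illustrative):

#include <linux/compiler.h>
#include <asm/processor.h>

static void wait_for_flag(int *flag)
{
	while (!ACCESS_ONCE(*flag))
		cpu_relax();	/* rep;nop eases the pipeline and SMT sibling */
}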
diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h
index 018f732704d..6c6689e574c 100644
--- a/arch/x86/um/asm/processor_32.h
+++ b/arch/x86/um/asm/processor_32.h
@@ -45,16 +45,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
45 memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array)); 45 memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array));
46} 46}
47 47
48#include <asm/user.h>
49
50/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
51static inline void rep_nop(void)
52{
53 __asm__ __volatile__("rep;nop": : :"memory");
54}
55
56#define cpu_relax() rep_nop()
57
58/* 48/*
59 * Default implementation of macro that returns current 49 * Default implementation of macro that returns current
60 * instruction pointer ("program counter"). Stolen 50 * instruction pointer ("program counter"). Stolen
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
index 61de92d916c..4b02a8455bd 100644
--- a/arch/x86/um/asm/processor_64.h
+++ b/arch/x86/um/asm/processor_64.h
@@ -14,14 +14,6 @@ struct arch_thread {
14 struct faultinfo faultinfo; 14 struct faultinfo faultinfo;
15}; 15};
16 16
17/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
18static inline void rep_nop(void)
19{
20 __asm__ __volatile__("rep;nop": : :"memory");
21}
22
23#define cpu_relax() rep_nop()
24
25#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \ 17#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \
26 .debugregs_seq = 0, \ 18 .debugregs_seq = 0, \
27 .fs = 0, \ 19 .fs = 0, \
@@ -37,8 +29,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
37 to->fs = from->fs; 29 to->fs = from->fs;
38} 30}
39 31
40#include <asm/user.h>
41
42#define current_text_addr() \ 32#define current_text_addr() \
43 ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) 33 ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; })
44 34
diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c
index a1fba5fb9db..17d88cf2c6c 100644
--- a/arch/x86/um/bugs_32.c
+++ b/arch/x86/um/bugs_32.c
@@ -13,8 +13,6 @@
13static int host_has_cmov = 1; 13static int host_has_cmov = 1;
14static jmp_buf cmov_test_return; 14static jmp_buf cmov_test_return;
15 15
16#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID]))
17
18static void cmov_sigill_test_handler(int sig) 16static void cmov_sigill_test_handler(int sig)
19{ 17{
20 host_has_cmov = 0; 18 host_has_cmov = 0;
@@ -51,7 +49,7 @@ void arch_examine_signal(int sig, struct uml_pt_regs *regs)
51 * This is testing for a cmov (0x0f 0x4x) instruction causing a 49 * This is testing for a cmov (0x0f 0x4x) instruction causing a
52 * SIGILL in init. 50 * SIGILL in init.
53 */ 51 */
54 if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) 52 if ((sig != SIGILL) || (get_current_pid() != 1))
55 return; 53 return;
56 54
57 if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { 55 if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) {
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index 639900a6fde..f40281e5d6a 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -23,14 +23,6 @@ static int __init gate_vma_init(void)
23 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 23 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
24 gate_vma.vm_page_prot = __P101; 24 gate_vma.vm_page_prot = __P101;
25 25
26 /*
27 * Make sure the vDSO gets into every core dump.
28 * Dumping its contents makes post-mortem fully interpretable later
29 * without matching up the same kernel and hardware config to see
30 * what PC values meant.
31 */
32 gate_vma.vm_flags |= VM_ALWAYSDUMP;
33
34 return 0; 26 return 0;
35} 27}
36__initcall(gate_vma_init); 28__initcall(gate_vma_init);
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 91f4ec9a0a5..af91901babb 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -64,8 +64,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
64 64
65 err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, 65 err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
66 VM_READ|VM_EXEC| 66 VM_READ|VM_EXEC|
67 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 67 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
68 VM_ALWAYSDUMP,
69 vdsop); 68 vdsop);
70 69
71 up_write(&mm->mmap_sem); 70 up_write(&mm->mmap_sem);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 468d591dde3..a944020fa85 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -250,13 +250,7 @@ static int __init gate_vma_init(void)
250 gate_vma.vm_end = FIXADDR_USER_END; 250 gate_vma.vm_end = FIXADDR_USER_END;
251 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 251 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
252 gate_vma.vm_page_prot = __P101; 252 gate_vma.vm_page_prot = __P101;
253 /* 253
254 * Make sure the vDSO gets into every core dump.
255 * Dumping its contents makes post-mortem fully interpretable later
256 * without matching up the same kernel and hardware config to see
257 * what PC values meant.
258 */
259 gate_vma.vm_flags |= VM_ALWAYSDUMP;
260 return 0; 254 return 0;
261} 255}
262 256
@@ -343,17 +337,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
343 if (compat_uses_vma || !compat) { 337 if (compat_uses_vma || !compat) {
344 /* 338 /*
345 * MAYWRITE to allow gdb to COW and set breakpoints 339 * MAYWRITE to allow gdb to COW and set breakpoints
346 *
347 * Make sure the vDSO gets into every core dump.
348 * Dumping its contents makes post-mortem fully
349 * interpretable later without matching up the same
350 * kernel and hardware config to see what PC values
351 * meant.
352 */ 340 */
353 ret = install_special_mapping(mm, addr, PAGE_SIZE, 341 ret = install_special_mapping(mm, addr, PAGE_SIZE,
354 VM_READ|VM_EXEC| 342 VM_READ|VM_EXEC|
355 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 343 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
356 VM_ALWAYSDUMP,
357 vdso32_pages); 344 vdso32_pages);
358 345
359 if (ret) 346 if (ret)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c35b7..17e18279649 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -124,8 +124,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
124 124
125 ret = install_special_mapping(mm, addr, vdso_size, 125 ret = install_special_mapping(mm, addr, vdso_size,
126 VM_READ|VM_EXEC| 126 VM_READ|VM_EXEC|
127 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 127 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
128 VM_ALWAYSDUMP,
129 vdso_pages); 128 vdso_pages);
130 if (ret) { 129 if (ret) {
131 current->mm->context.vdso = NULL; 130 current->mm->context.vdso = NULL;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 12366238d07..1ba8dff2675 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h> 11#include <linux/memblock.h>
12#include <linux/cpuidle.h> 12#include <linux/cpuidle.h>
13#include <linux/cpufreq.h>
13 14
14#include <asm/elf.h> 15#include <asm/elf.h>
15#include <asm/vdso.h> 16#include <asm/vdso.h>
@@ -420,6 +421,7 @@ void __init xen_arch_setup(void)
420 boot_cpu_data.hlt_works_ok = 1; 421 boot_cpu_data.hlt_works_ok = 1;
421#endif 422#endif
422 disable_cpuidle(); 423 disable_cpuidle();
424 disable_cpufreq();
423 WARN_ON(set_pm_idle_to_default()); 425 WARN_ON(set_pm_idle_to_default());
424 fiddle_vdso(); 426 fiddle_vdso();
425} 427}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 315d8fa0c8f..02900e8ce26 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -75,8 +75,14 @@ static void __cpuinit cpu_bringup(void)
75 75
76 xen_setup_cpu_clockevents(); 76 xen_setup_cpu_clockevents();
77 77
78 notify_cpu_starting(cpu);
79
80 ipi_call_lock();
78 set_cpu_online(cpu, true); 81 set_cpu_online(cpu, true);
82 ipi_call_unlock();
83
79 this_cpu_write(cpu_state, CPU_ONLINE); 84 this_cpu_write(cpu_state, CPU_ONLINE);
85
80 wmb(); 86 wmb();
81 87
82 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
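The xen/smp.c hunk restores the ordering native bringup uses: CPU_STARTING notifiers run before the CPU is marked online, and the online marking is bracketed by ipi_call_lock() so it cannot race a concurrent smp_call_function(). A free-standing sketch of that sequence, assuming the ipi_call_lock()/ipi_call_unlock() helpers present in this era's tree:

#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>

static void __cpuinit example_mark_cpu_online(unsigned int cpu)
{
	notify_cpu_starting(cpu);	/* CPU_STARTING notifiers, still offline */

	ipi_call_lock();		/* exclude smp_call_function() senders */
	set_cpu_online(cpu, true);	/* IPIs may now target this CPU */
	ipi_call_unlock();
}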