path: root/arch/x86
author    Len Brown <len.brown@intel.com>  2010-08-15 01:06:31 -0400
committer Len Brown <len.brown@intel.com>  2010-08-15 01:06:31 -0400
commit    95ee46aa8698f2000647dfb362400fadbb5807cf (patch)
tree      e5a05c7297f997e191c73091934e42e3195c0e40 /arch/x86
parent    cfa806f059801dbe7e435745eb2e187c8bfe1e7f (diff)
parent    92fa5bd9a946b6e7aab6764e7312e4e3d9bed295 (diff)
Merge branch 'linus' into release
Conflicts:
	drivers/acpi/debug.c

Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 15
-rw-r--r--  arch/x86/boot/Makefile | 8
-rw-r--r--  arch/x86/boot/boot.h | 28
-rw-r--r--  arch/x86/boot/cmdline.c | 6
-rw-r--r--  arch/x86/boot/compressed/Makefile | 4
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 21
-rw-r--r--  arch/x86/boot/compressed/early_serial_console.c | 5
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 13
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 13
-rw-r--r--  arch/x86/boot/compressed/misc.c | 56
-rw-r--r--  arch/x86/boot/compressed/misc.h | 39
-rw-r--r--  arch/x86/boot/compressed/string.c | 2
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 6
-rw-r--r--  arch/x86/boot/ctype.h | 21
-rw-r--r--  arch/x86/boot/early_serial_console.c | 139
-rw-r--r--  arch/x86/boot/main.c | 9
-rw-r--r--  arch/x86/boot/printf.c | 4
-rw-r--r--  arch/x86/boot/string.c | 63
-rw-r--r--  arch/x86/boot/tty.c | 37
-rw-r--r--  arch/x86/configs/i386_defconfig | 1
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 3
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 23
-rw-r--r--  arch/x86/include/asm/acpi.h | 2
-rw-r--r--  arch/x86/include/asm/alternative.h | 7
-rw-r--r--  arch/x86/include/asm/apb_timer.h | 1
-rw-r--r--  arch/x86/include/asm/bootparam.h | 11
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 198
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 83
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 43
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 8
-rw-r--r--  arch/x86/include/asm/highmem.h | 2
-rw-r--r--  arch/x86/include/asm/hw_breakpoint.h | 2
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 1
-rw-r--r--  arch/x86/include/asm/i387.h | 26
-rw-r--r--  arch/x86/include/asm/intel_scu_ipc.h | 20
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 3
-rw-r--r--  arch/x86/include/asm/kdebug.h | 6
-rw-r--r--  arch/x86/include/asm/kgdb.h | 20
-rw-r--r--  arch/x86/include/asm/kvm.h | 22
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 30
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 70
-rw-r--r--  arch/x86/include/asm/local64.h | 1
-rw-r--r--  arch/x86/include/asm/mce.h | 4
-rw-r--r--  arch/x86/include/asm/mrst.h | 26
-rw-r--r--  arch/x86/include/asm/msr-index.h | 26
-rw-r--r--  arch/x86/include/asm/msr.h | 4
-rw-r--r--  arch/x86/include/asm/nmi.h | 2
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 31
-rw-r--r--  arch/x86/include/asm/page.h | 7
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 1
-rw-r--r--  arch/x86/include/asm/perf_event.h | 18
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 99
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 21
-rw-r--r--  arch/x86/include/asm/required-features.h | 2
-rw-r--r--  arch/x86/include/asm/rwsem.h | 21
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 1
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 49
-rw-r--r--  arch/x86/include/asm/sys_ia32.h | 15
-rw-r--r--  arch/x86/include/asm/syscalls.h | 2
-rw-r--r--  arch/x86/include/asm/system.h | 7
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 5
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 6
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 151
-rw-r--r--  arch/x86/include/asm/vmx.h | 5
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 6
-rw-r--r--  arch/x86/include/asm/xen/page.h | 8
-rw-r--r--  arch/x86/include/asm/xen/swiotlb-xen.h | 14
-rw-r--r--  arch/x86/include/asm/xsave.h | 40
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 2
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 8
-rw-r--r--  arch/x86/kernel/apb_timer.c | 37
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/Makefile | 7
-rw-r--r--  arch/x86/kernel/apic/apic.c | 4
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 107
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 4
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/common.c | 28
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 11
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 26
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 8
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 108
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 35
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 206
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 62
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 165
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 63
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 58
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 9
-rw-r--r--  arch/x86/kernel/crash.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack.h | 56
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 16
-rw-r--r--  arch/x86/kernel/entry_64.S | 13
-rw-r--r--  arch/x86/kernel/head32.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 5
-rw-r--r--  arch/x86/kernel/hpet.c | 15
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 51
-rw-r--r--  arch/x86/kernel/i387.c | 42
-rw-r--r--  arch/x86/kernel/kgdb.c | 189
-rw-r--r--  arch/x86/kernel/kprobes.c | 33
-rw-r--r--  arch/x86/kernel/mpparse.c | 16
-rw-r--r--  arch/x86/kernel/mrst.c | 105
-rw-r--r--  arch/x86/kernel/olpc.c | 20
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/pci-dma.c | 7
-rw-r--r--  arch/x86/kernel/process.c | 56
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 5
-rw-r--r--  arch/x86/kernel/setup.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 15
-rw-r--r--  arch/x86/kernel/stacktrace.c | 31
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 760
-rw-r--r--  arch/x86/kernel/traps.c | 7
-rw-r--r--  arch/x86/kernel/tsc.c | 5
-rw-r--r--  arch/x86/kernel/verify_cpu_64.S | 3
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/xsave.c | 195
-rw-r--r--  arch/x86/kvm/emulate.c | 749
-rw-r--r--  arch/x86/kvm/i8254.c | 146
-rw-r--r--  arch/x86/kvm/i8254.h | 4
-rw-r--r--  arch/x86/kvm/i8259.c | 48
-rw-r--r--  arch/x86/kvm/irq.c | 2
-rw-r--r--  arch/x86/kvm/irq.h | 4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 8
-rw-r--r--  arch/x86/kvm/lapic.c | 17
-rw-r--r--  arch/x86/kvm/mmu.c | 807
-rw-r--r--  arch/x86/kvm/mmutrace.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 252
-rw-r--r--  arch/x86/kvm/svm.c | 147
-rw-r--r--  arch/x86/kvm/timer.c | 16
-rw-r--r--  arch/x86/kvm/vmx.c | 261
-rw-r--r--  arch/x86/kvm/x86.c | 1176
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  arch/x86/lib/Makefile | 1
-rw-r--r--  arch/x86/lib/atomic64_386_32.S | 238
-rw-r--r--  arch/x86/lib/clear_page_64.S | 2
-rw-r--r--  arch/x86/lib/cmpxchg.c (renamed from arch/x86/kernel/cpu/cmpxchg.c) | 18
-rw-r--r--  arch/x86/lib/copy_page_64.S | 2
-rw-r--r--  arch/x86/lib/copy_user_64.S | 2
-rw-r--r--  arch/x86/lib/memcpy_64.S | 2
-rw-r--r--  arch/x86/lib/memset_64.S | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 32
-rw-r--r--  arch/x86/mm/fault.c | 4
-rw-r--r--  arch/x86/mm/highmem_32.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 2
-rw-r--r--  arch/x86/mm/ioremap.c | 14
-rw-r--r--  arch/x86/mm/kmmio.c | 16
-rw-r--r--  arch/x86/mm/pat.c | 2
-rw-r--r--  arch/x86/mm/pf_in.c | 30
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 22
-rw-r--r--  arch/x86/mm/tlb.c | 4
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 17
-rw-r--r--  arch/x86/pci/acpi.c | 9
-rw-r--r--  arch/x86/pci/common.c | 20
-rw-r--r--  arch/x86/pci/irq.c | 6
-rw-r--r--  arch/x86/pci/legacy.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 3
-rwxr-xr-x  arch/x86/vdso/checkundef.sh | 10
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 2
-rw-r--r--  arch/x86/vdso/vma.c | 3
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 201
-rw-r--r--  arch/x86/xen/mmu.c | 328
-rw-r--r--  arch/x86/xen/mmu.h | 1
-rw-r--r--  arch/x86/xen/pci-swiotlb-xen.c | 58
-rw-r--r--  arch/x86/xen/platform-pci-unplug.c | 137
-rw-r--r--  arch/x86/xen/setup.c | 72
-rw-r--r--  arch/x86/xen/smp.c | 2
-rw-r--r--  arch/x86/xen/suspend.c | 12
-rw-r--r--  arch/x86/xen/time.c | 96
-rw-r--r--  arch/x86/xen/xen-ops.h | 13
202 files changed, 6510 insertions, 3148 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593b4a66..a84fc34c8f77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
55 select HAVE_HW_BREAKPOINT 55 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS 56 select HAVE_MIXED_BREAKPOINTS_REGS
57 select PERF_EVENTS 57 select PERF_EVENTS
58 select HAVE_PERF_EVENTS_NMI
58 select ANON_INODES 59 select ANON_INODES
59 select HAVE_ARCH_KMEMCHECK 60 select HAVE_ARCH_KMEMCHECK
60 select HAVE_USER_RETURN_NOTIFIER 61 select HAVE_USER_RETURN_NOTIFIER
@@ -72,9 +73,6 @@ config ARCH_DEFCONFIG
72 default "arch/x86/configs/i386_defconfig" if X86_32 73 default "arch/x86/configs/i386_defconfig" if X86_32
73 default "arch/x86/configs/x86_64_defconfig" if X86_64 74 default "arch/x86/configs/x86_64_defconfig" if X86_64
74 75
75config GENERIC_TIME
76 def_bool y
77
78config GENERIC_CMOS_UPDATE 76config GENERIC_CMOS_UPDATE
79 def_bool y 77 def_bool y
80 78
@@ -2046,7 +2044,7 @@ config SCx200
2046 2044
2047config SCx200HR_TIMER 2045config SCx200HR_TIMER
2048 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" 2046 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
2049 depends on SCx200 && GENERIC_TIME 2047 depends on SCx200
2050 default y 2048 default y
2051 ---help--- 2049 ---help---
2052 This driver provides a clocksource built upon the on-chip 2050 This driver provides a clocksource built upon the on-chip
@@ -2062,6 +2060,15 @@ config OLPC
2062 Add support for detecting the unique features of the OLPC 2060 Add support for detecting the unique features of the OLPC
2063 XO hardware. 2061 XO hardware.
2064 2062
2063config OLPC_OPENFIRMWARE
2064 bool "Support for OLPC's Open Firmware"
2065 depends on !X86_64 && !X86_PAE
2066 default y if OLPC
2067 help
2068 This option adds support for the implementation of Open Firmware
2069 that is used on the OLPC XO-1 Children's Machine.
2070 If unsure, say N here.
2071
2065endif # X86_32 2072endif # X86_32
2066 2073
2067config K8_NB 2074config K8_NB
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd7..f7cb086b4add 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o 31setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
32setup-y += version.o 32setup-y += video-mode.o version.o
33setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
34 34
35# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f2..c7093bd9f2d3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -28,6 +28,7 @@
28#include "bitops.h" 28#include "bitops.h"
29#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h> 30#include <asm/processor-flags.h>
31#include "ctype.h"
31 32
32/* Useful macros */ 33/* Useful macros */
33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 34#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -37,6 +38,8 @@
37extern struct setup_header hdr; 38extern struct setup_header hdr;
38extern struct boot_params boot_params; 39extern struct boot_params boot_params;
39 40
41#define cpu_relax() asm volatile("rep; nop")
42
40/* Basic port I/O */ 43/* Basic port I/O */
41static inline void outb(u8 v, u16 port) 44static inline void outb(u8 v, u16 port)
42{ 45{
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
198 return diff; 201 return diff;
199} 202}
200 203
201static inline int isdigit(int ch)
202{
203 return (ch >= '0') && (ch <= '9');
204}
205
206/* Heap -- available for dynamic lists. */ 204/* Heap -- available for dynamic lists. */
207extern char _end[]; 205extern char _end[];
208extern char *HEAP; 206extern char *HEAP;
@@ -287,8 +285,18 @@ struct biosregs {
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); 285void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288 286
289/* cmdline.c */ 287/* cmdline.c */
290int cmdline_find_option(const char *option, char *buffer, int bufsize); 288int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
291int cmdline_find_option_bool(const char *option); 289int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
290static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
291{
292 return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
293}
294
295static inline int cmdline_find_option_bool(const char *option)
296{
297 return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
298}
299
292 300
293/* cpu.c, cpucheck.c */ 301/* cpu.c, cpucheck.c */
294struct cpu_features { 302struct cpu_features {
@@ -300,6 +308,10 @@ extern struct cpu_features cpu;
300int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 308int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
301int validate_cpu(void); 309int validate_cpu(void);
302 310
311/* early_serial_console.c */
312extern int early_serial_base;
313void console_init(void);
314
303/* edd.c */ 315/* edd.c */
304void query_edd(void); 316void query_edd(void);
305 317
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs);
329 341
330/* string.c */ 342/* string.c */
331int strcmp(const char *str1, const char *str2); 343int strcmp(const char *str1, const char *str2);
344int strncmp(const char *cs, const char *ct, size_t count);
332size_t strnlen(const char *s, size_t maxlen); 345size_t strnlen(const char *s, size_t maxlen);
333unsigned int atou(const char *s); 346unsigned int atou(const char *s);
347unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
334 348
335/* tty.c */ 349/* tty.c */
336void puts(const char *); 350void puts(const char *);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce0..6b3b6f708c04 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c)
27 * Returns the length of the argument (regardless of if it was 27 * Returns the length of the argument (regardless of if it was
28 * truncated to fit in the buffer), or -1 on not found. 28 * truncated to fit in the buffer), or -1 on not found.
29 */ 29 */
30int cmdline_find_option(const char *option, char *buffer, int bufsize) 30int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
31{ 31{
32 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
33 addr_t cptr; 32 addr_t cptr;
34 char c; 33 char c;
35 int len = -1; 34 int len = -1;
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
100 * Returns the position of that option (starts counting with 1) 99 * Returns the position of that option (starts counting with 1)
101 * or 0 on not found 100 * or 0 on not found
102 */ 101 */
103int cmdline_find_option_bool(const char *option) 102int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
104{ 103{
105 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
106 addr_t cptr; 104 addr_t cptr;
107 char c; 105 char c;
108 int pos = 0, wstart = 0; 106 int pos = 0, wstart = 0;
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fbb47daf2459..0c229551eead 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T
23 23
24hostprogs-y := mkpiggy 24hostprogs-y := mkpiggy
25 25
26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
27 $(call if_changed,ld) 27 $(call if_changed,ld)
28 @: 28 @:
29 29
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000000..cb62f786990d
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,21 @@
1#include "misc.h"
2
3static unsigned long fs;
4static inline void set_fs(unsigned long seg)
5{
6 fs = seg << 4; /* shift it back */
7}
8typedef unsigned long addr_t;
9static inline char rdfs8(addr_t addr)
10{
11 return *((char *)(fs + addr));
12}
13#include "../cmdline.c"
14int cmdline_find_option(const char *option, char *buffer, int bufsize)
15{
16 return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
17}
18int cmdline_find_option_bool(const char *option)
19{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000000..261e81fb9582
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
1#include "misc.h"
2
3int early_serial_base;
4
5#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70ffae2..67a655a39ce4 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -124,6 +124,19 @@ relocated:
124 rep stosl 124 rep stosl
125 125
126/* 126/*
127 * Adjust our own GOT
128 */
129 leal _got(%ebx), %edx
130 leal _egot(%ebx), %ecx
1311:
132 cmpl %ecx, %edx
133 jae 2f
134 addl %ebx, (%edx)
135 addl $4, %edx
136 jmp 1b
1372:
138
139/*
127 * Do the decompression, and jump to the new kernel.. 140 * Do the decompression, and jump to the new kernel..
128 */ 141 */
129 leal z_extract_offset_negative(%ebx), %ebp 142 leal z_extract_offset_negative(%ebx), %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc9c06a..52f85a196fa0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -280,6 +280,19 @@ relocated:
280 rep stosq 280 rep stosq
281 281
282/* 282/*
283 * Adjust our own GOT
284 */
285 leaq _got(%rip), %rdx
286 leaq _egot(%rip), %rcx
2871:
288 cmpq %rcx, %rdx
289 jae 2f
290 addq %rbx, (%rdx)
291 addq $8, %rdx
292 jmp 1b
2932:
294
295/*
283 * Do the decompression, and jump to the new kernel.. 296 * Do the decompression, and jump to the new kernel..
284 */ 297 */
285 pushq %rsi /* Save the real mode argument */ 298 pushq %rsi /* Save the real mode argument */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 51e240779a44..8f7bef8e9fff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -9,23 +9,7 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12/* 12#include "misc.h"
13 * we have to be careful, because no indirections are allowed here, and
14 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
15 * we just keep it from happening
16 */
17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32
19#define _ASM_X86_DESC_H 1
20#endif
21
22#include <linux/linkage.h>
23#include <linux/screen_info.h>
24#include <linux/elf.h>
25#include <linux/io.h>
26#include <asm/page.h>
27#include <asm/boot.h>
28#include <asm/bootparam.h>
29 13
30/* WARNING!! 14/* WARNING!!
31 * This code is compiled with -fPIC and it is relocated dynamically 15 * This code is compiled with -fPIC and it is relocated dynamically
@@ -123,15 +107,13 @@ static void error(char *m);
123/* 107/*
124 * This is set up by the setup-routine at boot-time 108 * This is set up by the setup-routine at boot-time
125 */ 109 */
126static struct boot_params *real_mode; /* Pointer to real-mode data */ 110struct boot_params *real_mode; /* Pointer to real-mode data */
127static int quiet; 111static int quiet;
112static int debug;
128 113
129void *memset(void *s, int c, size_t n); 114void *memset(void *s, int c, size_t n);
130void *memcpy(void *dest, const void *src, size_t n); 115void *memcpy(void *dest, const void *src, size_t n);
131 116
132static void __putstr(int, const char *);
133#define putstr(__x) __putstr(0, __x)
134
135#ifdef CONFIG_X86_64 117#ifdef CONFIG_X86_64
136#define memptr long 118#define memptr long
137#else 119#else
@@ -170,7 +152,21 @@ static void scroll(void)
170 vidmem[i] = ' '; 152 vidmem[i] = ' ';
171} 153}
172 154
173static void __putstr(int error, const char *s) 155#define XMTRDY 0x20
156
157#define TXR 0 /* Transmit register (WRITE) */
158#define LSR 5 /* Line Status */
159static void serial_putchar(int ch)
160{
161 unsigned timeout = 0xffff;
162
163 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
164 cpu_relax();
165
166 outb(ch, early_serial_base + TXR);
167}
168
169void __putstr(int error, const char *s)
174{ 170{
175 int x, y, pos; 171 int x, y, pos;
176 char c; 172 char c;
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s)
179 if (!error) 175 if (!error)
180 return; 176 return;
181#endif 177#endif
178 if (early_serial_base) {
179 const char *str = s;
180 while (*str) {
181 if (*str == '\n')
182 serial_putchar('\r');
183 serial_putchar(*str++);
184 }
185 }
182 186
183 if (real_mode->screen_info.orig_video_mode == 0 && 187 if (real_mode->screen_info.orig_video_mode == 0 &&
184 lines == 0 && cols == 0) 188 lines == 0 && cols == 0)
@@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
305{ 309{
306 real_mode = rmode; 310 real_mode = rmode;
307 311
308 if (real_mode->hdr.loadflags & QUIET_FLAG) 312 if (cmdline_find_option_bool("quiet"))
309 quiet = 1; 313 quiet = 1;
314 if (cmdline_find_option_bool("debug"))
315 debug = 1;
310 316
311 if (real_mode->screen_info.orig_video_mode == 7) { 317 if (real_mode->screen_info.orig_video_mode == 7) {
312 vidmem = (char *) 0xb0000; 318 vidmem = (char *) 0xb0000;
@@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
319 lines = real_mode->screen_info.orig_video_lines; 325 lines = real_mode->screen_info.orig_video_lines;
320 cols = real_mode->screen_info.orig_video_cols; 326 cols = real_mode->screen_info.orig_video_cols;
321 327
328 console_init();
329 if (debug)
330 putstr("early console in decompress_kernel\n");
331
322 free_mem_ptr = heap; /* Heap */ 332 free_mem_ptr = heap; /* Heap */
323 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 333 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
324 334
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000000..3f19c81a6203
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,39 @@
1#ifndef BOOT_COMPRESSED_MISC_H
2#define BOOT_COMPRESSED_MISC_H
3
4/*
5 * we have to be careful, because no indirections are allowed here, and
6 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
7 * we just keep it from happening
8 */
9#undef CONFIG_PARAVIRT
10#ifdef CONFIG_X86_32
11#define _ASM_X86_DESC_H 1
12#endif
13
14#include <linux/linkage.h>
15#include <linux/screen_info.h>
16#include <linux/elf.h>
17#include <linux/io.h>
18#include <asm/page.h>
19#include <asm/boot.h>
20#include <asm/bootparam.h>
21
22#define BOOT_BOOT_H
23#include "../ctype.h"
24
25/* misc.c */
26extern struct boot_params *real_mode; /* Pointer to real-mode data */
27void __putstr(int error, const char *s);
28#define putstr(__x) __putstr(0, __x)
29#define puts(__x) __putstr(0, __x)
30
31/* cmdline.c */
32int cmdline_find_option(const char *option, char *buffer, int bufsize);
33int cmdline_find_option_bool(const char *option);
34
35/* early_serial_console.c */
36extern int early_serial_base;
37void console_init(void);
38
39#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000000..19b3e693cd72
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,2 @@
1#include "misc.h"
2#include "../string.c"
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 5ddabceee124..34d047c98284 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -41,6 +41,12 @@ SECTIONS
41 *(.rodata.*) 41 *(.rodata.*)
42 _erodata = . ; 42 _erodata = . ;
43 } 43 }
44 .got : {
45 _got = .;
46 KEEP(*(.got.plt))
47 KEEP(*(.got))
48 _egot = .;
49 }
44 .data : { 50 .data : {
45 _data = . ; 51 _data = . ;
46 *(.data) 52 *(.data)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000000..25e13403193c
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
1#ifndef BOOT_ISDIGIT_H
2
3#define BOOT_ISDIGIT_H
4
5static inline int isdigit(int ch)
6{
7 return (ch >= '0') && (ch <= '9');
8}
9
10static inline int isxdigit(int ch)
11{
12 if (isdigit(ch))
13 return true;
14
15 if ((ch >= 'a') && (ch <= 'f'))
16 return true;
17
18 return (ch >= 'A') && (ch <= 'F');
19}
20
21#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000000..030f4b93e255
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,139 @@
1#include "boot.h"
2
3#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
4
5#define XMTRDY 0x20
6
7#define DLAB 0x80
8
9#define TXR 0 /* Transmit register (WRITE) */
10#define RXR 0 /* Receive register (READ) */
11#define IER 1 /* Interrupt Enable */
12#define IIR 2 /* Interrupt ID */
13#define FCR 2 /* FIFO control */
14#define LCR 3 /* Line control */
15#define MCR 4 /* Modem control */
16#define LSR 5 /* Line Status */
17#define MSR 6 /* Modem Status */
18#define DLL 0 /* Divisor Latch Low */
19#define DLH 1 /* Divisor latch High */
20
21#define DEFAULT_BAUD 9600
22
23static void early_serial_init(int port, int baud)
24{
25 unsigned char c;
26 unsigned divisor;
27
28 outb(0x3, port + LCR); /* 8n1 */
29 outb(0, port + IER); /* no interrupt */
30 outb(0, port + FCR); /* no fifo */
31 outb(0x3, port + MCR); /* DTR + RTS */
32
33 divisor = 115200 / baud;
34 c = inb(port + LCR);
35 outb(c | DLAB, port + LCR);
36 outb(divisor & 0xff, port + DLL);
37 outb((divisor >> 8) & 0xff, port + DLH);
38 outb(c & ~DLAB, port + LCR);
39
40 early_serial_base = port;
41}
42
43static void parse_earlyprintk(void)
44{
45 int baud = DEFAULT_BAUD;
46 char arg[32];
47 int pos = 0;
48 int port = 0;
49
50 if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
51 char *e;
52
53 if (!strncmp(arg, "serial", 6)) {
54 port = DEFAULT_SERIAL_PORT;
55 pos += 6;
56 }
57
58 if (arg[pos] == ',')
59 pos++;
60
61 if (!strncmp(arg, "ttyS", 4)) {
62 static const int bases[] = { 0x3f8, 0x2f8 };
63 int idx = 0;
64
65 if (!strncmp(arg + pos, "ttyS", 4))
66 pos += 4;
67
68 if (arg[pos++] == '1')
69 idx = 1;
70
71 port = bases[idx];
72 }
73
74 if (arg[pos] == ',')
75 pos++;
76
77 baud = simple_strtoull(arg + pos, &e, 0);
78 if (baud == 0 || arg + pos == e)
79 baud = DEFAULT_BAUD;
80 }
81
82 if (port)
83 early_serial_init(port, baud);
84}
85
86#define BASE_BAUD (1843200/16)
87static unsigned int probe_baud(int port)
88{
89 unsigned char lcr, dll, dlh;
90 unsigned int quot;
91
92 lcr = inb(port + LCR);
93 outb(lcr | DLAB, port + LCR);
94 dll = inb(port + DLL);
95 dlh = inb(port + DLH);
96 outb(lcr, port + LCR);
97 quot = (dlh << 8) | dll;
98
99 return BASE_BAUD / quot;
100}
101
102static void parse_console_uart8250(void)
103{
104 char optstr[64], *options;
105 int baud = DEFAULT_BAUD;
106 int port = 0;
107
108 /*
109 * console=uart8250,io,0x3f8,115200n8
110 * need to make sure it is last one console !
111 */
112 if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
113 return;
114
115 options = optstr;
116
117 if (!strncmp(options, "uart8250,io,", 12))
118 port = simple_strtoull(options + 12, &options, 0);
119 else if (!strncmp(options, "uart,io,", 8))
120 port = simple_strtoull(options + 8, &options, 0);
121 else
122 return;
123
124 if (options && (options[0] == ','))
125 baud = simple_strtoull(options + 1, &options, 0);
126 else
127 baud = probe_baud(port);
128
129 if (port)
130 early_serial_init(port, baud);
131}
132
133void console_init(void)
134{
135 parse_earlyprintk();
136
137 if (!early_serial_base)
138 parse_console_uart8250();
139}
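
For reference (not part of the patch itself), the two forms handled by parse_earlyprintk() and parse_console_uart8250() above correspond to boot command lines such as:

    earlyprintk=ttyS0,115200
    console=uart8250,io,0x3f8,115200n8

Either form ends up in early_serial_init(), which programs the UART and records the port in early_serial_base, so the putchar()/__putstr() paths in the setup and decompression code mirror their output to the serial line as well.
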
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895bd..40358c8905be 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -130,6 +130,11 @@ void main(void)
130 /* First, copy the boot header into the "zeropage" */ 130 /* First, copy the boot header into the "zeropage" */
131 copy_boot_params(); 131 copy_boot_params();
132 132
133 /* Initialize the early-boot console */
134 console_init();
135 if (cmdline_find_option_bool("debug"))
136 puts("early console in setup code\n");
137
133 /* End of heap check */ 138 /* End of heap check */
134 init_heap(); 139 init_heap();
135 140
@@ -168,10 +173,6 @@ void main(void)
168 /* Set the video mode */ 173 /* Set the video mode */
169 set_video(); 174 set_video();
170 175
171 /* Parse command line for 'quiet' and pass it to decompressor. */
172 if (cmdline_find_option_bool("quiet"))
173 boot_params.hdr.loadflags |= QUIET_FLAG;
174
175 /* Do the last things and invoke protected mode */ 176 /* Do the last things and invoke protected mode */
176 go_to_protected_mode(); 177 go_to_protected_mode();
177} 178}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbdddd..cdac91ca55d3 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s)
34#define SMALL 32 /* Must be 32 == 0x20 */ 34#define SMALL 32 /* Must be 32 == 0x20 */
35#define SPECIAL 64 /* 0x */ 35#define SPECIAL 64 /* 0x */
36 36
37#define do_div(n,base) ({ \ 37#define __do_div(n, base) ({ \
38int __res; \ 38int __res; \
39__res = ((unsigned long) n) % (unsigned) base; \ 39__res = ((unsigned long) n) % (unsigned) base; \
40n = ((unsigned long) n) / (unsigned) base; \ 40n = ((unsigned long) n) / (unsigned) base; \
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision,
83 tmp[i++] = '0'; 83 tmp[i++] = '0';
84 else 84 else
85 while (num != 0) 85 while (num != 0)
86 tmp[i++] = (digits[do_div(num, base)] | locase); 86 tmp[i++] = (digits[__do_div(num, base)] | locase);
87 if (i > precision) 87 if (i > precision)
88 precision = i; 88 precision = i;
89 size -= precision; 89 size -= precision;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2abf..3cbc4058dd26 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2)
30 return 0; 30 return 0;
31} 31}
32 32
33int strncmp(const char *cs, const char *ct, size_t count)
34{
35 unsigned char c1, c2;
36
37 while (count) {
38 c1 = *cs++;
39 c2 = *ct++;
40 if (c1 != c2)
41 return c1 < c2 ? -1 : 1;
42 if (!c1)
43 break;
44 count--;
45 }
46 return 0;
47}
48
33size_t strnlen(const char *s, size_t maxlen) 49size_t strnlen(const char *s, size_t maxlen)
34{ 50{
35 const char *es = s; 51 const char *es = s;
@@ -48,3 +64,50 @@ unsigned int atou(const char *s)
48 i = i * 10 + (*s++ - '0'); 64 i = i * 10 + (*s++ - '0');
49 return i; 65 return i;
50} 66}
67
68/* Works only for digits and letters, but small and fast */
69#define TOLOWER(x) ((x) | 0x20)
70
71static unsigned int simple_guess_base(const char *cp)
72{
73 if (cp[0] == '0') {
74 if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
75 return 16;
76 else
77 return 8;
78 } else {
79 return 10;
80 }
81}
82
83/**
84 * simple_strtoull - convert a string to an unsigned long long
85 * @cp: The start of the string
86 * @endp: A pointer to the end of the parsed string will be placed here
87 * @base: The number base to use
88 */
89
90unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
91{
92 unsigned long long result = 0;
93
94 if (!base)
95 base = simple_guess_base(cp);
96
97 if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
98 cp += 2;
99
100 while (isxdigit(*cp)) {
101 unsigned int value;
102
103 value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
104 if (value >= base)
105 break;
106 result = result * base + value;
107 cp++;
108 }
109 if (endp)
110 *endp = (char *)cp;
111
112 return result;
113}
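
A minimal sketch of how the base auto-detection in simple_strtoull() behaves when called with base 0 (the values below are illustrative, not taken from the patch):

    unsigned long long port, mode, baud;

    port = simple_strtoull("0x3f8", NULL, 0);   /* "0x" prefix -> base 16 -> 1016   */
    mode = simple_strtoull("0377", NULL, 0);    /* leading '0' -> base 8  -> 255    */
    baud = simple_strtoull("115200", NULL, 0);  /* no prefix   -> base 10 -> 115200 */

This is what lets the early serial console code accept either a hexadecimal or a decimal port number after "uart8250,io,".
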
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c7..def2451f46ae 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -10,23 +10,36 @@
10 * ----------------------------------------------------------------------- */ 10 * ----------------------------------------------------------------------- */
11 11
12/* 12/*
13 * Very simple screen I/O 13 * Very simple screen and serial I/O
14 * XXX: Probably should add very simple serial I/O?
15 */ 14 */
16 15
17#include "boot.h" 16#include "boot.h"
18 17
18int early_serial_base;
19
20#define XMTRDY 0x20
21
22#define TXR 0 /* Transmit register (WRITE) */
23#define LSR 5 /* Line Status */
24
19/* 25/*
20 * These functions are in .inittext so they can be used to signal 26 * These functions are in .inittext so they can be used to signal
21 * error during initialization. 27 * error during initialization.
22 */ 28 */
23 29
24void __attribute__((section(".inittext"))) putchar(int ch) 30static void __attribute__((section(".inittext"))) serial_putchar(int ch)
25{ 31{
26 struct biosregs ireg; 32 unsigned timeout = 0xffff;
27 33
28 if (ch == '\n') 34 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
29 putchar('\r'); /* \n -> \r\n */ 35 cpu_relax();
36
37 outb(ch, early_serial_base + TXR);
38}
39
40static void __attribute__((section(".inittext"))) bios_putchar(int ch)
41{
42 struct biosregs ireg;
30 43
31 initregs(&ireg); 44 initregs(&ireg);
32 ireg.bx = 0x0007; 45 ireg.bx = 0x0007;
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch)
36 intcall(0x10, &ireg, NULL); 49 intcall(0x10, &ireg, NULL);
37} 50}
38 51
52void __attribute__((section(".inittext"))) putchar(int ch)
53{
54 if (ch == '\n')
55 putchar('\r'); /* \n -> \r\n */
56
57 bios_putchar(ch);
58
59 if (early_serial_base != 0)
60 serial_putchar(ch);
61}
62
39void __attribute__((section(".inittext"))) puts(const char *str) 63void __attribute__((section(".inittext"))) puts(const char *str)
40{ 64{
41 while (*str) 65 while (*str)
@@ -112,3 +136,4 @@ int getchar_timeout(void)
112 136
113 return 0; /* Timeout! */ 137 return 0; /* Timeout! */
114} 138}
139
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad19654a..e3a32431ca1e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y
1471# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1472# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1473# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
1474# CONFIG_SENSORS_PKGTEMP is not set
1474# CONFIG_SENSORS_IT87 is not set 1475# CONFIG_SENSORS_IT87 is not set
1475# CONFIG_SENSORS_LM63 is not set 1476# CONFIG_SENSORS_LM63 is not set
1476# CONFIG_SENSORS_LM75 is not set 1477# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd847a4..4251f8372050 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y
1456# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1457# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1458# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
1459# CONFIG_SENSORS_PKGTEMP is not set
1459# CONFIG_SENSORS_IT87 is not set 1460# CONFIG_SENSORS_IT87 is not set
1460# CONFIG_SENSORS_LM63 is not set 1461# CONFIG_SENSORS_LM63 is not set
1461# CONFIG_SENSORS_LM75 is not set 1462# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa3..b86feabed69b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,7 @@ ia32_sys_call_table:
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg 844 .quad compat_sys_recvmmsg
845 .quad sys_fanotify_init
846 .quad sys32_fanotify_mark
847 .quad sys_prlimit64 /* 340 */
845ia32_syscall_end: 848ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88d..849813f398e7 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -51,7 +51,7 @@
51#define AA(__x) ((unsigned long)(__x)) 51#define AA(__x) ((unsigned long)(__x))
52 52
53 53
54asmlinkage long sys32_truncate64(char __user *filename, 54asmlinkage long sys32_truncate64(const char __user *filename,
55 unsigned long offset_low, 55 unsigned long offset_low,
56 unsigned long offset_high) 56 unsigned long offset_high)
57{ 57{
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
96 return 0; 96 return 0;
97} 97}
98 98
99asmlinkage long sys32_stat64(char __user *filename, 99asmlinkage long sys32_stat64(const char __user *filename,
100 struct stat64 __user *statbuf) 100 struct stat64 __user *statbuf)
101{ 101{
102 struct kstat stat; 102 struct kstat stat;
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename,
107 return ret; 107 return ret;
108} 108}
109 109
110asmlinkage long sys32_lstat64(char __user *filename, 110asmlinkage long sys32_lstat64(const char __user *filename,
111 struct stat64 __user *statbuf) 111 struct stat64 __user *statbuf)
112{ 112{
113 struct kstat stat; 113 struct kstat stat;
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
126 return ret; 126 return ret;
127} 127}
128 128
129asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, 129asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
130 struct stat64 __user *statbuf, int flag) 130 struct stat64 __user *statbuf, int flag)
131{ 131{
132 struct kstat stat; 132 struct kstat stat;
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
408 ((loff_t)AA(poshi) << 32) | AA(poslo)); 408 ((loff_t)AA(poshi) << 32) | AA(poslo));
409} 409}
410 410
411asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, 411asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
412 u32 poslo, u32 poshi) 412 u32 count, u32 poslo, u32 poshi)
413{ 413{
414 return sys_pwrite64(fd, ubuf, count, 414 return sys_pwrite64(fd, ubuf, count,
415 ((loff_t)AA(poshi) << 32) | AA(poslo)); 415 ((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
449 return ret; 449 return ret;
450} 450}
451 451
452asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 452asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
453 compat_uptr_t __user *envp, struct pt_regs *regs) 453 compat_uptr_t __user *envp, struct pt_regs *regs)
454{ 454{
455 long error; 455 long error;
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
546 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, 546 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
547 ((u64)len_hi << 32) | len_lo); 547 ((u64)len_hi << 32) | len_lo);
548} 548}
549
550asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
551 u32 mask_lo, u32 mask_hi,
552 int fd, const char __user *pathname)
553{
554 return sys_fanotify_mark(fanotify_fd, flags,
555 ((u64)mask_hi << 32) | mask_lo,
556 fd, pathname);
557}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968fc..92091de11113 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
134 boot_cpu_data.x86_model <= 0x05 && 134 boot_cpu_data.x86_model <= 0x05 &&
135 boot_cpu_data.x86_mask < 0x0A) 135 boot_cpu_data.x86_mask < 0x0A)
136 return 1; 136 return 1;
137 else if (boot_cpu_has(X86_FEATURE_AMDC1E)) 137 else if (c1e_detected)
138 return 1; 138 return 1;
139 else 139 else
140 return max_cstate; 140 return max_cstate;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 03b6bb5394a0..bc6abb7bc7ee 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -45,10 +45,9 @@
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 u8 *instr; /* original instruction */
47 u8 *replacement; 47 u8 *replacement;
48 u8 cpuid; /* cpuid bit set for replacement */ 48 u16 cpuid; /* cpuid bit set for replacement */
49 u8 instrlen; /* length of original instruction */ 49 u8 instrlen; /* length of original instruction */
50 u8 replacementlen; /* length of new instruction, <= instrlen */ 50 u8 replacementlen; /* length of new instruction, <= instrlen */
51 u8 pad1;
52#ifdef CONFIG_X86_64 51#ifdef CONFIG_X86_64
53 u32 pad2; 52 u32 pad2;
54#endif 53#endif
@@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
86 _ASM_ALIGN "\n" \ 85 _ASM_ALIGN "\n" \
87 _ASM_PTR "661b\n" /* label */ \ 86 _ASM_PTR "661b\n" /* label */ \
88 _ASM_PTR "663f\n" /* new instruction */ \ 87 _ASM_PTR "663f\n" /* new instruction */ \
89 " .byte " __stringify(feature) "\n" /* feature bit */ \ 88 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte 662b-661b\n" /* sourcelen */ \ 89 " .byte 662b-661b\n" /* sourcelen */ \
91 " .byte 664f-663f\n" /* replacementlen */ \ 90 " .byte 664f-663f\n" /* replacementlen */ \
91 ".previous\n" \
92 ".section .discard,\"aw\",@progbits\n" \
92 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ 93 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
93 ".previous\n" \ 94 ".previous\n" \
94 ".section .altinstr_replacement, \"ax\"\n" \ 95 ".section .altinstr_replacement, \"ax\"\n" \
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe570..a69b1ac9eaf8 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id; 57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59 58
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); 59extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); 60extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d83c716..8e6218550e77 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -70,6 +70,14 @@ struct sys_desc_table {
70 __u8 table[14]; 70 __u8 table[14];
71}; 71};
72 72
73/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
74struct olpc_ofw_header {
75 __u32 ofw_magic; /* OFW signature */
76 __u32 ofw_version;
77 __u32 cif_handler; /* callback into OFW */
78 __u32 irq_desc_table;
79} __attribute__((packed));
80
73struct efi_info { 81struct efi_info {
74 __u32 efi_loader_signature; 82 __u32 efi_loader_signature;
75 __u32 efi_systab; 83 __u32 efi_systab;
@@ -92,7 +100,8 @@ struct boot_params {
92 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 100 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
93 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 101 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
94 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 102 struct sys_desc_table sys_desc_table; /* 0x0a0 */
95 __u8 _pad4[144]; /* 0x0b0 */ 103 struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
104 __u8 _pad4[128]; /* 0x0c0 */
96 struct edid_info edid_info; /* 0x140 */ 105 struct edid_info edid_info; /* 0x140 */
97 struct efi_info efi_info; /* 0x1c0 */ 106 struct efi_info efi_info; /* 0x1c0 */
98 __u32 alt_mem_k; /* 0x1e0 */ 107 __u32 alt_mem_k; /* 0x1e0 */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8859e12dd3cf..284a6e8f7ce1 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -11,38 +11,42 @@
11extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
12 12
13/* 13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
15 * Note 2: xchg has side effect, so that attribute volatile is necessary, 15 * Since this is generally used to protect other memory information, we
16 * but generally the primitive is invalid, *ptr is output argument. --ANK 16 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
17 * information around.
17 */ 18 */
18
19struct __xchg_dummy {
20 unsigned long a[100];
21};
22#define __xg(x) ((struct __xchg_dummy *)(x))
23
24#define __xchg(x, ptr, size) \ 19#define __xchg(x, ptr, size) \
25({ \ 20({ \
26 __typeof(*(ptr)) __x = (x); \ 21 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \ 22 switch (size) { \
28 case 1: \ 23 case 1: \
29 asm volatile("xchgb %b0,%1" \ 24 { \
30 : "=q" (__x) \ 25 volatile u8 *__ptr = (volatile u8 *)(ptr); \
31 : "m" (*__xg(ptr)), "0" (__x) \ 26 asm volatile("xchgb %0,%1" \
27 : "=q" (__x), "+m" (*__ptr) \
28 : "0" (__x) \
32 : "memory"); \ 29 : "memory"); \
33 break; \ 30 break; \
31 } \
34 case 2: \ 32 case 2: \
35 asm volatile("xchgw %w0,%1" \ 33 { \
36 : "=r" (__x) \ 34 volatile u16 *__ptr = (volatile u16 *)(ptr); \
37 : "m" (*__xg(ptr)), "0" (__x) \ 35 asm volatile("xchgw %0,%1" \
36 : "=r" (__x), "+m" (*__ptr) \
37 : "0" (__x) \
38 : "memory"); \ 38 : "memory"); \
39 break; \ 39 break; \
40 } \
40 case 4: \ 41 case 4: \
42 { \
43 volatile u32 *__ptr = (volatile u32 *)(ptr); \
41 asm volatile("xchgl %0,%1" \ 44 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \ 45 : "=r" (__x), "+m" (*__ptr) \
43 : "m" (*__xg(ptr)), "0" (__x) \ 46 : "0" (__x) \
44 : "memory"); \ 47 : "memory"); \
45 break; \ 48 break; \
49 } \
46 default: \ 50 default: \
47 __xchg_wrong_size(); \ 51 __xchg_wrong_size(); \
48 } \ 52 } \
@@ -53,60 +57,33 @@ struct __xchg_dummy {
53 __xchg((v), (ptr), sizeof(*ptr)) 57 __xchg((v), (ptr), sizeof(*ptr))
54 58
55/* 59/*
56 * The semantics of XCHGCMP8B are a bit strange, this is why 60 * CMPXCHG8B only writes to the target if we had the previous
57 * there is a loop and the loading of %%eax and %%edx has to 61 * value in registers, otherwise it acts as a read and gives us the
58 * be inside. This inlines well in most cases, the cached 62 * "new previous" value. That is why there is a loop. Preloading
59 * cost is around ~38 cycles. (in the future we might want 63 * EDX:EAX is a performance optimization: in the common case it means
60 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that 64 * we need only one locked operation.
61 * might have an implicit FPU-save as a cost, so it's not
62 * clear which path to go.)
63 * 65 *
64 * cmpxchg8b must be used with the lock prefix here to allow 66 * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
65 * the instruction to be executed atomically, see page 3-102 67 * least an FPU save and/or %cr0.ts manipulation.
66 * of the instruction set reference 24319102.pdf. We need 68 *
67 * the reader side to see the coherent 64bit value. 69 * cmpxchg8b must be used with the lock prefix here to allow the
70 * instruction to be executed atomically. We need to have the reader
71 * side to see the coherent 64bit value.
68 */ 72 */
69static inline void __set_64bit(unsigned long long *ptr, 73static inline void set_64bit(volatile u64 *ptr, u64 value)
70 unsigned int low, unsigned int high)
71{ 74{
75 u32 low = value;
76 u32 high = value >> 32;
77 u64 prev = *ptr;
78
72 asm volatile("\n1:\t" 79 asm volatile("\n1:\t"
73 "movl (%0), %%eax\n\t" 80 LOCK_PREFIX "cmpxchg8b %0\n\t"
74 "movl 4(%0), %%edx\n\t"
75 LOCK_PREFIX "cmpxchg8b (%0)\n\t"
76 "jnz 1b" 81 "jnz 1b"
77 : /* no outputs */ 82 : "=m" (*ptr), "+A" (prev)
78 : "D"(ptr), 83 : "b" (low), "c" (high)
79 "b"(low), 84 : "memory");
80 "c"(high)
81 : "ax", "dx", "memory");
82}
83
84static inline void __set_64bit_constant(unsigned long long *ptr,
85 unsigned long long value)
86{
87 __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
88}
89
90#define ll_low(x) *(((unsigned int *)&(x)) + 0)
91#define ll_high(x) *(((unsigned int *)&(x)) + 1)
92
93static inline void __set_64bit_var(unsigned long long *ptr,
94 unsigned long long value)
95{
96 __set_64bit(ptr, ll_low(value), ll_high(value));
97} 85}
98 86
99#define set_64bit(ptr, value) \
100 (__builtin_constant_p((value)) \
101 ? __set_64bit_constant((ptr), (value)) \
102 : __set_64bit_var((ptr), (value)))
103
104#define _set_64bit(ptr, value) \
105 (__builtin_constant_p(value) \
106 ? __set_64bit(ptr, (unsigned int)(value), \
107 (unsigned int)((value) >> 32)) \
108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
109
110extern void __cmpxchg_wrong_size(void); 87extern void __cmpxchg_wrong_size(void);
111 88
112/* 89/*
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void);
121 __typeof__(*(ptr)) __new = (new); \ 98 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \ 99 switch (size) { \
123 case 1: \ 100 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \ 101 { \
125 : "=a"(__ret) \ 102 volatile u8 *__ptr = (volatile u8 *)(ptr); \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgb %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "q" (__new), "0" (__old) \
127 : "memory"); \ 106 : "memory"); \
128 break; \ 107 break; \
108 } \
129 case 2: \ 109 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \ 110 { \
131 : "=a"(__ret) \ 111 volatile u16 *__ptr = (volatile u16 *)(ptr); \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgw %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
133 : "memory"); \ 115 : "memory"); \
134 break; \ 116 break; \
117 } \
135 case 4: \ 118 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \ 119 { \
137 : "=a"(__ret) \ 120 volatile u32 *__ptr = (volatile u32 *)(ptr); \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 121 asm volatile(lock "cmpxchgl %2,%1" \
122 : "=a" (__ret), "+m" (*__ptr) \
123 : "r" (__new), "0" (__old) \
139 : "memory"); \ 124 : "memory"); \
140 break; \ 125 break; \
126 } \
141 default: \ 127 default: \
142 __cmpxchg_wrong_size(); \ 128 __cmpxchg_wrong_size(); \
143 } \ 129 } \
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void);
175 (unsigned long long)(n))) 161 (unsigned long long)(n)))
176#endif 162#endif
177 163
178static inline unsigned long long __cmpxchg64(volatile void *ptr, 164static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
179 unsigned long long old,
180 unsigned long long new)
181{ 165{
182 unsigned long long prev; 166 u64 prev;
183 asm volatile(LOCK_PREFIX "cmpxchg8b %3" 167 asm volatile(LOCK_PREFIX "cmpxchg8b %1"
184 : "=A"(prev) 168 : "=A" (prev),
185 : "b"((unsigned long)new), 169 "+m" (*ptr)
186 "c"((unsigned long)(new >> 32)), 170 : "b" ((u32)new),
187 "m"(*__xg(ptr)), 171 "c" ((u32)(new >> 32)),
188 "0"(old) 172 "0" (old)
189 : "memory"); 173 : "memory");
190 return prev; 174 return prev;
191} 175}
192 176
193static inline unsigned long long __cmpxchg64_local(volatile void *ptr, 177static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
194 unsigned long long old,
195 unsigned long long new)
196{ 178{
197 unsigned long long prev; 179 u64 prev;
198 asm volatile("cmpxchg8b %3" 180 asm volatile("cmpxchg8b %1"
199 : "=A"(prev) 181 : "=A" (prev),
200 : "b"((unsigned long)new), 182 "+m" (*ptr)
201 "c"((unsigned long)(new >> 32)), 183 : "b" ((u32)new),
202 "m"(*__xg(ptr)), 184 "c" ((u32)(new >> 32)),
203 "0"(old) 185 "0" (old)
204 : "memory"); 186 : "memory");
205 return prev; 187 return prev;
206} 188}
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
264 * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 246 * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
265 */ 247 */
266 248
267extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
268
269#define cmpxchg64(ptr, o, n) \ 249#define cmpxchg64(ptr, o, n) \
270({ \ 250({ \
271 __typeof__(*(ptr)) __ret; \ 251 __typeof__(*(ptr)) __ret; \
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
283 __ret; }) 263 __ret; })
284 264
285 265
286 266#define cmpxchg64_local(ptr, o, n) \
287#define cmpxchg64_local(ptr, o, n) \ 267({ \
288({ \ 268 __typeof__(*(ptr)) __ret; \
289 __typeof__(*(ptr)) __ret; \ 269 __typeof__(*(ptr)) __old = (o); \
290 if (likely(boot_cpu_data.x86 > 4)) \ 270 __typeof__(*(ptr)) __new = (n); \
291 __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ 271 alternative_io("call cmpxchg8b_emu", \
292 (unsigned long long)(o), \ 272 "cmpxchg8b (%%esi)" , \
293 (unsigned long long)(n)); \ 273 X86_FEATURE_CX8, \
294 else \ 274 "=A" (__ret), \
295 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ 275 "S" ((ptr)), "0" (__old), \
296 (unsigned long long)(o), \ 276 "b" ((unsigned int)__new), \
297 (unsigned long long)(n)); \ 277 "c" ((unsigned int)(__new>>32)) \
298 __ret; \ 278 : "memory"); \
299}) 279 __ret; })
300 280
301#endif 281#endif
302 282
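
As a reminder of how these primitives are typically consumed (a generic sketch, not code from this patch): cmpxchg() returns the value that was actually in memory, so callers compare it against the value they expected and retry on mismatch. The hypothetical helper below illustrates the pattern:

    /* Lock-free add built on cmpxchg(); retry until no other CPU raced us. */
    static inline void counter_add(unsigned long *p, unsigned long delta)
    {
            unsigned long old;

            do {
                    old = *p;               /* snapshot the current value */
            } while (cmpxchg(p, old, old + delta) != old);
    }

The same retry pattern applies to cmpxchg64()/cmpxchg64_local() for the 64-bit quantities handled via cmpxchg8b above.
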
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 485ae415faec..423ae58aa020 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,51 +3,60 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define __xg(x) ((volatile long *)(x)) 6static inline void set_64bit(volatile u64 *ptr, u64 val)
7
8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
9{ 7{
10 *ptr = val; 8 *ptr = val;
11} 9}
12 10
13#define _set_64bit set_64bit
14
15extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void); 12extern void __cmpxchg_wrong_size(void);
17 13
18/* 14/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 15 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 16 * Since this is generally used to protect other memory information, we
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 17 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
18 * information around.
22 */ 19 */
23#define __xchg(x, ptr, size) \ 20#define __xchg(x, ptr, size) \
24({ \ 21({ \
25 __typeof(*(ptr)) __x = (x); \ 22 __typeof(*(ptr)) __x = (x); \
26 switch (size) { \ 23 switch (size) { \
27 case 1: \ 24 case 1: \
28 asm volatile("xchgb %b0,%1" \ 25 { \
29 : "=q" (__x) \ 26 volatile u8 *__ptr = (volatile u8 *)(ptr); \
30 : "m" (*__xg(ptr)), "0" (__x) \ 27 asm volatile("xchgb %0,%1" \
28 : "=q" (__x), "+m" (*__ptr) \
29 : "0" (__x) \
31 : "memory"); \ 30 : "memory"); \
32 break; \ 31 break; \
32 } \
33 case 2: \ 33 case 2: \
34 asm volatile("xchgw %w0,%1" \ 34 { \
35 : "=r" (__x) \ 35 volatile u16 *__ptr = (volatile u16 *)(ptr); \
36 : "m" (*__xg(ptr)), "0" (__x) \ 36 asm volatile("xchgw %0,%1" \
37 : "=r" (__x), "+m" (*__ptr) \
38 : "0" (__x) \
37 : "memory"); \ 39 : "memory"); \
38 break; \ 40 break; \
41 } \
39 case 4: \ 42 case 4: \
40 asm volatile("xchgl %k0,%1" \ 43 { \
41 : "=r" (__x) \ 44 volatile u32 *__ptr = (volatile u32 *)(ptr); \
42 : "m" (*__xg(ptr)), "0" (__x) \ 45 asm volatile("xchgl %0,%1" \
46 : "=r" (__x), "+m" (*__ptr) \
47 : "0" (__x) \
43 : "memory"); \ 48 : "memory"); \
44 break; \ 49 break; \
50 } \
45 case 8: \ 51 case 8: \
52 { \
53 volatile u64 *__ptr = (volatile u64 *)(ptr); \
46 asm volatile("xchgq %0,%1" \ 54 asm volatile("xchgq %0,%1" \
47 : "=r" (__x) \ 55 : "=r" (__x), "+m" (*__ptr) \
48 : "m" (*__xg(ptr)), "0" (__x) \ 56 : "0" (__x) \
49 : "memory"); \ 57 : "memory"); \
50 break; \ 58 break; \
59 } \
51 default: \ 60 default: \
52 __xchg_wrong_size(); \ 61 __xchg_wrong_size(); \
53 } \ 62 } \
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void);
71 __typeof__(*(ptr)) __new = (new); \ 80 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \ 81 switch (size) { \
73 case 1: \ 82 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \ 83 { \
75 : "=a"(__ret) \ 84 volatile u8 *__ptr = (volatile u8 *)(ptr); \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 85 asm volatile(lock "cmpxchgb %2,%1" \
86 : "=a" (__ret), "+m" (*__ptr) \
87 : "q" (__new), "0" (__old) \
77 : "memory"); \ 88 : "memory"); \
78 break; \ 89 break; \
90 } \
79 case 2: \ 91 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \ 92 { \
81 : "=a"(__ret) \ 93 volatile u16 *__ptr = (volatile u16 *)(ptr); \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 94 asm volatile(lock "cmpxchgw %2,%1" \
95 : "=a" (__ret), "+m" (*__ptr) \
96 : "r" (__new), "0" (__old) \
83 : "memory"); \ 97 : "memory"); \
84 break; \ 98 break; \
99 } \
85 case 4: \ 100 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \ 101 { \
87 : "=a"(__ret) \ 102 volatile u32 *__ptr = (volatile u32 *)(ptr); \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgl %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "r" (__new), "0" (__old) \
89 : "memory"); \ 106 : "memory"); \
90 break; \ 107 break; \
108 } \
91 case 8: \ 109 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \ 110 { \
93 : "=a"(__ret) \ 111 volatile u64 *__ptr = (volatile u64 *)(ptr); \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgq %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
95 : "memory"); \ 115 : "memory"); \
96 break; \ 116 break; \
117 } \
97 default: \ 118 default: \
98 __cmpxchg_wrong_size(); \ 119 __cmpxchg_wrong_size(); \
99 } \ 120 } \
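The __xchg()/__cmpxchg() rewrite above keeps the size dispatch but ties the memory operand to the asm with a "+m" constraint on a correctly typed pointer instead of the old __xg() cast. A sketch of what a caller sees (names invented for illustration):

#include <linux/types.h>
#include <asm/cmpxchg.h>

static unsigned long owner;	/* 0 means free */

/* Claim the slot if nobody holds it; sizeof(owner) == 8 selects cmpxchgq. */
static int try_claim(unsigned long token)
{
	return cmpxchg(&owner, 0UL, token) == 0UL;
}

/* Unconditionally hand the slot over and learn who held it before. */
static unsigned long hand_over(unsigned long token)
{
	return xchg(&owner, token);	/* expands to the xchgq case */
}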
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 468145914389..781a50b29a49 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
6 6
7#include <asm/required-features.h> 7#include <asm/required-features.h>
8 8
9#define NCAPINTS 9 /* N 32-bit words worth of info */ 9#define NCAPINTS 10 /* N 32-bit words worth of info */
10 10
11/* 11/*
12 * Note: If the comment begins with a quoted string, that string is used 12 * Note: If the comment begins with a quoted string, that string is used
@@ -89,7 +89,7 @@
89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ 89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ 90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ 91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
92#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ 92 /* 21 available, was AMD_C1E */
93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ 93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -124,6 +124,8 @@
124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
128#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
127#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ 129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
128 130
129/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -157,22 +159,29 @@
157 159
158/* 160/*
159 * Auxiliary flags: Linux defined - For features scattered in various 161 * Auxiliary flags: Linux defined - For features scattered in various
160 * CPUID levels like 0x6, 0xA etc 162 * CPUID levels like 0x6, 0xA etc, word 7
161 */ 163 */
162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ 164#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ 165#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
164#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ 166#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
167#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
168#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
169#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
170#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
165 171
166/* Virtualization flags: Linux defined */ 172/* Virtualization flags: Linux defined, word 8 */
167#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ 173#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
168#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ 174#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
169#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 175#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
170#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 176#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
171#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 177#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
172#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ 178#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
173#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ 179#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
174#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ 180#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
175#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ 181#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
182
183/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
184#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
176 185
177#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 186#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
178 187
@@ -194,7 +203,9 @@ extern const char * const x86_power_flags[32];
194 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ 203 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
195 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ 204 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
196 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 205 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
197 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ 206 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
207 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
208 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
198 ? 1 : \ 209 ? 1 : \
199 test_cpu_cap(c, bit)) 210 test_cpu_cap(c, bit))
200 211
@@ -291,7 +302,7 @@ extern const char * const x86_power_flags[32];
291 * patch the target code for additional performance. 302 * patch the target code for additional performance.
292 * 303 *
293 */ 304 */
294static __always_inline __pure bool __static_cpu_has(u8 bit) 305static __always_inline __pure bool __static_cpu_has(u16 bit)
295{ 306{
296#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) 307#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
297 asm goto("1: jmp %l[t_no]\n" 308 asm goto("1: jmp %l[t_no]\n"
@@ -300,11 +311,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
300 _ASM_ALIGN "\n" 311 _ASM_ALIGN "\n"
301 _ASM_PTR "1b\n" 312 _ASM_PTR "1b\n"
302 _ASM_PTR "0\n" /* no replacement */ 313 _ASM_PTR "0\n" /* no replacement */
303 " .byte %P0\n" /* feature bit */ 314 " .word %P0\n" /* feature bit */
304 " .byte 2b - 1b\n" /* source len */ 315 " .byte 2b - 1b\n" /* source len */
305 " .byte 0\n" /* replacement len */ 316 " .byte 0\n" /* replacement len */
306 " .byte 0xff + 0 - (2b-1b)\n" /* padding */
307 ".previous\n" 317 ".previous\n"
318 /* skipping size check since replacement size = 0 */
308 : : "i" (bit) : : t_no); 319 : : "i" (bit) : : t_no);
309 return true; 320 return true;
310 t_no: 321 t_no:
@@ -318,10 +329,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
318 _ASM_ALIGN "\n" 329 _ASM_ALIGN "\n"
319 _ASM_PTR "1b\n" 330 _ASM_PTR "1b\n"
320 _ASM_PTR "3f\n" 331 _ASM_PTR "3f\n"
321 " .byte %P1\n" /* feature bit */ 332 " .word %P1\n" /* feature bit */
322 " .byte 2b - 1b\n" /* source len */ 333 " .byte 2b - 1b\n" /* source len */
323 " .byte 4f - 3f\n" /* replacement len */ 334 " .byte 4f - 3f\n" /* replacement len */
324 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ 335 ".previous\n"
336 ".section .discard,\"aw\",@progbits\n"
337 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
325 ".previous\n" 338 ".previous\n"
326 ".section .altinstr_replacement,\"ax\"\n" 339 ".section .altinstr_replacement,\"ax\"\n"
327 "3: movb $1,%0\n" 340 "3: movb $1,%0\n"
@@ -337,7 +350,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
337( \ 350( \
338 __builtin_constant_p(boot_cpu_has(bit)) ? \ 351 __builtin_constant_p(boot_cpu_has(bit)) ? \
339 boot_cpu_has(bit) : \ 352 boot_cpu_has(bit) : \
340 (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ 353 __builtin_constant_p(bit) ? \
341 __static_cpu_has(bit) : \ 354 __static_cpu_has(bit) : \
342 boot_cpu_has(bit) \ 355 boot_cpu_has(bit) \
343) 356)
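With the feature bit widened to u16 and emitted as .word, static_cpu_has() can now name bits beyond word 7, including the new word 9. A usage sketch (illustrative only; the message strings are made up):

#include <linux/kernel.h>
#include <asm/cpufeature.h>

static void report_fpu_save_path(void)
{
	/* The branch is patched by the alternatives machinery at boot,
	 * so the check itself costs a jmp/nop at runtime. */
	if (static_cpu_has(X86_FEATURE_XSAVE))
		pr_info("fpu: using xsave\n");
	else
		pr_info("fpu: using fxsave\n");
}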
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ac91eed21061..d4c419f883a0 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
54 54
55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
56#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 56#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
57#define dma_is_consistent(d, h) (1)
58 57
59extern int dma_supported(struct device *hwdev, u64 mask); 58extern int dma_supported(struct device *hwdev, u64 mask);
60extern int dma_set_mask(struct device *dev, u64 mask); 59extern int dma_set_mask(struct device *dev, u64 mask);
@@ -87,13 +86,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size,
87 flush_write_buffers(); 86 flush_write_buffers();
88} 87}
89 88
90static inline int dma_get_cache_alignment(void)
91{
92 /* no easy way to get cache size on all x86, so return the
93 * maximum possible, to be safe */
94 return boot_cpu_data.x86_clflush_size;
95}
96
97static inline unsigned long dma_alloc_coherent_mask(struct device *dev, 89static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
98 gfp_t gfp) 90 gfp_t gfp)
99{ 91{
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a726650fc80f..8caac76ac324 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -61,7 +61,7 @@ void *kmap(struct page *page);
61void kunmap(struct page *page); 61void kunmap(struct page *page);
62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); 62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 942255310e6a..528a11e8d3e3 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint {
20#include <linux/list.h> 20#include <linux/list.h>
21 21
22/* Available HW breakpoint length encodings */ 22/* Available HW breakpoint length encodings */
23#define X86_BREAKPOINT_LEN_X 0x00
23#define X86_BREAKPOINT_LEN_1 0x40 24#define X86_BREAKPOINT_LEN_1 0x40
24#define X86_BREAKPOINT_LEN_2 0x44 25#define X86_BREAKPOINT_LEN_2 0x44
25#define X86_BREAKPOINT_LEN_4 0x4c 26#define X86_BREAKPOINT_LEN_4 0x4c
26#define X86_BREAKPOINT_LEN_EXECUTE 0x40
27 27
28#ifdef CONFIG_X86_64 28#ifdef CONFIG_X86_64
29#define X86_BREAKPOINT_LEN_8 0x48 29#define X86_BREAKPOINT_LEN_8 0x48
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c8..ff2546ce7178 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
45/* Recognized hypervisors */ 45/* Recognized hypervisors */
46extern const struct hypervisor_x86 x86_hyper_vmware; 46extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 47extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm;
48 49
49#endif 50#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..a73a8d5a5e69 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void);
31extern int init_fpu(struct task_struct *child); 31extern int init_fpu(struct task_struct *child);
32extern asmlinkage void math_state_restore(void); 32extern asmlinkage void math_state_restore(void);
33extern void __math_state_restore(void); 33extern void __math_state_restore(void);
34extern void init_thread_xstate(void);
35extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 34extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
36 35
37extern user_regset_active_fn fpregs_active, xfpregs_active; 36extern user_regset_active_fn fpregs_active, xfpregs_active;
@@ -58,11 +57,25 @@ extern int restore_i387_xstate_ia32(void __user *buf);
58 57
59#define X87_FSW_ES (1 << 7) /* Exception Summary */ 58#define X87_FSW_ES (1 << 7) /* Exception Summary */
60 59
60static __always_inline __pure bool use_xsaveopt(void)
61{
62 return static_cpu_has(X86_FEATURE_XSAVEOPT);
63}
64
61static __always_inline __pure bool use_xsave(void) 65static __always_inline __pure bool use_xsave(void)
62{ 66{
63 return static_cpu_has(X86_FEATURE_XSAVE); 67 return static_cpu_has(X86_FEATURE_XSAVE);
64} 68}
65 69
70extern void __sanitize_i387_state(struct task_struct *);
71
72static inline void sanitize_i387_state(struct task_struct *tsk)
73{
74 if (!use_xsaveopt())
75 return;
76 __sanitize_i387_state(tsk);
77}
78
66#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
67 80
68/* Ignore delayed exceptions from user space */ 81/* Ignore delayed exceptions from user space */
@@ -127,6 +140,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
127{ 140{
128 int err; 141 int err;
129 142
143 /*
144 * Clear the bytes not touched by the fxsave and reserved
145 * for the SW usage.
146 */
147 err = __clear_user(&fx->sw_reserved,
148 sizeof(struct _fpx_sw_bytes));
149 if (unlikely(err))
150 return -EFAULT;
151
130 asm volatile("1: rex64/fxsave (%[fx])\n\t" 152 asm volatile("1: rex64/fxsave (%[fx])\n\t"
131 "2:\n" 153 "2:\n"
132 ".section .fixup,\"ax\"\n" 154 ".section .fixup,\"ax\"\n"
@@ -482,6 +504,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
482 memcpy(dst->state, src->state, xstate_size); 504 memcpy(dst->state, src->state, xstate_size);
483} 505}
484 506
507extern void fpu_finit(struct fpu *fpu);
508
485#endif /* __ASSEMBLY__ */ 509#endif /* __ASSEMBLY__ */
486 510
487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 511#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
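sanitize_i387_state() above is written so callers can invoke it unconditionally: the static_cpu_has(X86_FEATURE_XSAVEOPT) gate folds it to a no-op on CPUs without XSAVEOPT. A sketch of the intended call pattern (illustrative, not from the patch):

#include <linux/sched.h>
#include <asm/i387.h>

/* Before exposing the saved FPU image (e.g. on a ptrace/coredump path),
 * make sure states that XSAVEOPT skipped are filled with init values. */
static void example_prepare_fpu_copyout(struct task_struct *tsk)
{
	sanitize_i387_state(tsk);
	/* ... the task's saved FPU state can now be copied out ... */
}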
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4470c9ad4a3e..29f66793cc55 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -1,6 +1,12 @@
1#ifndef _ASM_X86_INTEL_SCU_IPC_H_ 1#ifndef _ASM_X86_INTEL_SCU_IPC_H_
2#define _ASM_X86_INTEL_SCU_IPC_H_ 2#define _ASM_X86_INTEL_SCU_IPC_H_
3 3
4#define IPCMSG_VRTC 0xFA /* Set vRTC device */
5
6/* Command id associated with message IPCMSG_VRTC */
7#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
8#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
9
4/* Read single register */ 10/* Read single register */
5int intel_scu_ipc_ioread8(u16 addr, u8 *data); 11int intel_scu_ipc_ioread8(u16 addr, u8 *data);
6 12
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
28/* Update single register based on the mask */ 34/* Update single register based on the mask */
29int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); 35int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
30 36
31/*
32 * Indirect register read
33 * Can be used when SCCB(System Controller Configuration Block) register
34 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
35 */
36int intel_scu_ipc_register_read(u32 addr, u32 *data);
37
38/*
39 * Indirect register write
40 * Can be used when SCCB(System Controller Configuration Block) register
41 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
42 */
43int intel_scu_ipc_register_write(u32 addr, u32 data);
44
45/* Issue commands to the SCU with or without data */ 37/* Issue commands to the SCU with or without data */
46int intel_scu_ipc_simple_command(int cmd, int sub); 38int intel_scu_ipc_simple_command(int cmd, int sub);
47int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, 39int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f64..e2ca30092557 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,9 @@
125 */ 125 */
126#define MCE_SELF_VECTOR 0xeb 126#define MCE_SELF_VECTOR 0xeb
127 127
128/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9
130
128#define NR_VECTORS 256 131#define NR_VECTORS 256
129 132
130#define FPU_IRQ 13 133#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b974761..5bdfca86581b 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,5 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all);
33extern void show_regs(struct pt_regs *regs); 33extern void show_regs(struct pt_regs *regs);
34extern unsigned long oops_begin(void); 34extern unsigned long oops_begin(void);
35extern void oops_end(unsigned long, struct pt_regs *, int signr); 35extern void oops_end(unsigned long, struct pt_regs *, int signr);
36#ifdef CONFIG_KEXEC
37extern int in_crash_kexec;
38#else
39/* no crash dump is ever in progress if no crash kernel can be kexec'd */
40#define in_crash_kexec 0
41#endif
36 42
37#endif /* _ASM_X86_KDEBUG_H */ 43#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cdc..396f5b5fc4d7 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
39 GDB_FS, /* 14 */ 39 GDB_FS, /* 14 */
40 GDB_GS, /* 15 */ 40 GDB_GS, /* 15 */
41}; 41};
42#define GDB_ORIG_AX 41
43#define DBG_MAX_REG_NUM 16
42#define NUMREGBYTES ((GDB_GS+1)*4) 44#define NUMREGBYTES ((GDB_GS+1)*4)
43#else /* ! CONFIG_X86_32 */ 45#else /* ! CONFIG_X86_32 */
44enum regnames64 { 46enum regnames {
45 GDB_AX, /* 0 */ 47 GDB_AX, /* 0 */
46 GDB_BX, /* 1 */ 48 GDB_BX, /* 1 */
47 GDB_CX, /* 2 */ 49 GDB_CX, /* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
59 GDB_R14, /* 14 */ 61 GDB_R14, /* 14 */
60 GDB_R15, /* 15 */ 62 GDB_R15, /* 15 */
61 GDB_PC, /* 16 */ 63 GDB_PC, /* 16 */
64 GDB_PS, /* 17 */
65 GDB_CS, /* 18 */
66 GDB_SS, /* 19 */
62}; 67};
63 68#define GDB_ORIG_AX 57
64enum regnames32 { 69#define DBG_MAX_REG_NUM 20
65 GDB_PS = 34, 70/* 17 64 bit regs and 3 32 bit regs */
66 GDB_CS, 71#define NUMREGBYTES ((17 * 8) + (3 * 4))
67 GDB_SS, 72#endif /* ! CONFIG_X86_32 */
68};
69#define NUMREGBYTES ((GDB_SS+1)*4)
70#endif /* CONFIG_X86_32 */
71 73
72static inline void arch_kgdb_breakpoint(void) 74static inline void arch_kgdb_breakpoint(void)
73{ 75{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS 24#define __KVM_HAVE_DEBUGREGS
25#define __KVM_HAVE_XSAVE
26#define __KVM_HAVE_XCRS
25 27
26/* Architectural interrupt line count. */ 28/* Architectural interrupt line count. */
27#define KVM_NR_INTERRUPTS 256 29#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
299 __u64 reserved[9]; 301 __u64 reserved[9];
300}; 302};
301 303
304/* for KVM_CAP_XSAVE */
305struct kvm_xsave {
306 __u32 region[1024];
307};
308
309#define KVM_MAX_XCRS 16
310
311struct kvm_xcr {
312 __u32 xcr;
313 __u32 reserved;
314 __u64 value;
315};
316
317struct kvm_xcrs {
318 __u32 nr_xcrs;
319 __u32 flags;
320 struct kvm_xcr xcrs[KVM_MAX_XCRS];
321 __u64 padding[16];
322};
323
302#endif /* _ASM_X86_KVM_H */ 324#endif /* _ASM_X86_KVM_H */
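The kvm_xsave/kvm_xcrs structures are the userspace ABI for the new extended-state capability. A userspace sketch of how the 4 KiB XSAVE image would be fetched (assumes the matching KVM_GET_XSAVE ioctl that accompanies KVM_CAP_XSAVE; error handling elided):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Read the vcpu's XSAVE area into xs->region[] (4096 bytes). */
static int vcpu_get_xsave(int vcpu_fd, struct kvm_xsave *xs)
{
	return ioctl(vcpu_fd, KVM_GET_XSAVE, xs);
}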
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
51#define X86EMUL_UNHANDLEABLE 1 51#define X86EMUL_UNHANDLEABLE 1
52/* Terminate emulation but return success to the caller. */ 52/* Terminate emulation but return success to the caller. */
53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
54#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
57
56struct x86_emulate_ops { 58struct x86_emulate_ops {
57 /* 59 /*
58 * read_std: Read bytes of standard (non-emulated/special) memory. 60 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
92 int (*read_emulated)(unsigned long addr, 94 int (*read_emulated)(unsigned long addr,
93 void *val, 95 void *val,
94 unsigned int bytes, 96 unsigned int bytes,
97 unsigned int *error,
95 struct kvm_vcpu *vcpu); 98 struct kvm_vcpu *vcpu);
96 99
97 /* 100 /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
104 int (*write_emulated)(unsigned long addr, 107 int (*write_emulated)(unsigned long addr,
105 const void *val, 108 const void *val,
106 unsigned int bytes, 109 unsigned int bytes,
110 unsigned int *error,
107 struct kvm_vcpu *vcpu); 111 struct kvm_vcpu *vcpu);
108 112
109 /* 113 /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
118 const void *old, 122 const void *old,
119 const void *new, 123 const void *new,
120 unsigned int bytes, 124 unsigned int bytes,
125 unsigned int *error,
121 struct kvm_vcpu *vcpu); 126 struct kvm_vcpu *vcpu);
122 127
123 int (*pio_in_emulated)(int size, unsigned short port, void *val, 128 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
132 int seg, struct kvm_vcpu *vcpu); 137 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu); 144 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
140}; 149};
141 150
142/* Type, address-of, and value of an instruction's operand. */ 151/* Type, address-of, and value of an instruction's operand. */
143struct operand { 152struct operand {
144 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
145 unsigned int bytes; 154 unsigned int bytes;
146 unsigned long val, orig_val, *ptr; 155 unsigned long orig_val, *ptr;
156 union {
157 unsigned long val;
158 char valptr[sizeof(unsigned long) + 2];
159 };
147}; 160};
148 161
149struct fetch_cache { 162struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
186 unsigned long modrm_val; 199 unsigned long modrm_val;
187 struct fetch_cache fetch; 200 struct fetch_cache fetch;
188 struct read_cache io_read; 201 struct read_cache io_read;
202 struct read_cache mem_read;
189}; 203};
190 204
191struct x86_emulate_ctxt { 205struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
202 int interruptibility; 216 int interruptibility;
203 217
204 bool restart; /* restart string instruction after writeback */ 218 bool restart; /* restart string instruction after writeback */
219
220 int exception; /* exception that happens during emulation or -1 */
221 u32 error_code; /* error code for exception */
222 bool error_code_valid;
223 unsigned long cr2; /* faulted address in case of #PF */
224
205 /* decode cache */ 225 /* decode cache */
206 struct decode_cache decode; 226 struct decode_cache decode;
207}; 227};
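The emulator callbacks now report failures through both the X86EMUL_* return value and the new *error out-parameter. A sketch of how a result is typically consumed (function and variable names are invented):

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>

static int example_mmio_read(struct x86_emulate_ops *ops, unsigned long addr,
			     void *val, unsigned int bytes,
			     struct kvm_vcpu *vcpu)
{
	unsigned int err = 0;
	int rc = ops->read_emulated(addr, val, bytes, &err, vcpu);

	if (rc == X86EMUL_PROPAGATE_FAULT)
		pr_debug("emulated read faulted, error code %u\n", err);
	else if (rc == X86EMUL_IO_NEEDED)
		pr_debug("emulated read needs an exit to userspace\n");

	return rc;	/* X86EMUL_CONTINUE on success */
}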
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h>
18 19
19#include <linux/kvm.h> 20#include <linux/kvm.h>
20#include <linux/kvm_para.h> 21#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
39 0xFFFFFF0000000000ULL) 40 0xFFFFFF0000000000ULL)
40 41
41#define INVALID_PAGE (~(hpa_t)0) 42#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE)
44
42#define UNMAPPED_GVA (~(gpa_t)0) 45#define UNMAPPED_GVA (~(gpa_t)0)
43 46
44/* KVM Hugepage definitions for x86 */ 47/* KVM Hugepage definitions for x86 */
45#define KVM_NR_PAGE_SIZES 3 48#define KVM_NR_PAGE_SIZES 3
46#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 49#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
50#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
47#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 51#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
48#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 52#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
49#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 53#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
69 73
70#define IOPL_SHIFT 12 74#define IOPL_SHIFT 12
71 75
72#define KVM_ALIAS_SLOTS 4
73
74#define KVM_PERMILLE_MMU_PAGES 20 76#define KVM_PERMILLE_MMU_PAGES 20
75#define KVM_MIN_ALLOC_MMU_PAGES 64 77#define KVM_MIN_ALLOC_MMU_PAGES 64
76#define KVM_MMU_HASH_SHIFT 10 78#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 243 void (*prefetch_page)(struct kvm_vcpu *vcpu,
242 struct kvm_mmu_page *page); 244 struct kvm_mmu_page *page);
243 int (*sync_page)(struct kvm_vcpu *vcpu, 245 int (*sync_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *sp); 246 struct kvm_mmu_page *sp, bool clear_unsync);
245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
246 hpa_t root_hpa; 248 hpa_t root_hpa;
247 int root_level; 249 int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
301 unsigned long mmu_seq; 303 unsigned long mmu_seq;
302 } update_pte; 304 } update_pte;
303 305
304 struct i387_fxsave_struct host_fx_image; 306 struct fpu guest_fpu;
305 struct i387_fxsave_struct guest_fx_image; 307 u64 xcr0;
306 308
307 gva_t mmio_fault_cr2; 309 gva_t mmio_fault_cr2;
308 struct kvm_pio_request pio; 310 struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
360 362
361 /* fields used by HYPER-V emulation */ 363 /* fields used by HYPER-V emulation */
362 u64 hv_vapic; 364 u64 hv_vapic;
363};
364
365struct kvm_mem_alias {
366 gfn_t base_gfn;
367 unsigned long npages;
368 gfn_t target_gfn;
369#define KVM_ALIAS_INVALID 1UL
370 unsigned long flags;
371};
372 365
373#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 366 cpumask_var_t wbinvd_dirty_mask;
374
375struct kvm_mem_aliases {
376 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
377 int naliases;
378}; 367};
379 368
380struct kvm_arch { 369struct kvm_arch {
381 struct kvm_mem_aliases *aliases;
382
383 unsigned int n_free_mmu_pages; 370 unsigned int n_free_mmu_pages;
384 unsigned int n_requested_mmu_pages; 371 unsigned int n_requested_mmu_pages;
385 unsigned int n_alloc_mmu_pages; 372 unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
533 520
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535 522
523 bool (*has_wbinvd_exit)(void);
524
536 const struct trace_print_flags *exit_reasons_str; 525 const struct trace_print_flags *exit_reasons_str;
537}; 526};
538 527
@@ -576,7 +565,6 @@ enum emulation_result {
576#define EMULTYPE_SKIP (1 << 2) 565#define EMULTYPE_SKIP (1 << 2)
577int emulate_instruction(struct kvm_vcpu *vcpu, 566int emulate_instruction(struct kvm_vcpu *vcpu,
578 unsigned long cr2, u16 error_code, int emulation_type); 567 unsigned long cr2, u16 error_code, int emulation_type);
579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
582 570
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
591int kvm_emulate_halt(struct kvm_vcpu *vcpu); 579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
593int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
594int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
595 unsigned long *dest);
596int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
597 unsigned long value);
598 583
599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 585int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 587int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code); 588 bool has_error_code, u32 error_code);
604 589
605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 597void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 598void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
599int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
614 600
615int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
616int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 602int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
630 616
631void kvm_inject_nmi(struct kvm_vcpu *vcpu); 617void kvm_inject_nmi(struct kvm_vcpu *vcpu);
632 618
633void fx_init(struct kvm_vcpu *vcpu); 619int fx_init(struct kvm_vcpu *vcpu);
634
635int emulator_write_emulated(unsigned long addr,
636 const void *val,
637 unsigned int bytes,
638 struct kvm_vcpu *vcpu);
639 620
640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 621void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 622void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
664int complete_pio(struct kvm_vcpu *vcpu); 645int complete_pio(struct kvm_vcpu *vcpu);
665bool kvm_check_iopl(struct kvm_vcpu *vcpu); 646bool kvm_check_iopl(struct kvm_vcpu *vcpu);
666 647
667struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
668
669static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 648static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
670{ 649{
671 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 650 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
719} 698}
720#endif 699#endif
721 700
722static inline void kvm_fx_save(struct i387_fxsave_struct *image)
723{
724 asm("fxsave (%0)":: "r" (image));
725}
726
727static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
728{
729 asm("fxrstor (%0)":: "r" (image));
730}
731
732static inline void kvm_fx_finit(void)
733{
734 asm("finit");
735}
736
737static inline u32 get_rdx_init_val(void) 701static inline u32 get_rdx_init_val(void)
738{ 702{
739 return 0x600; /* P6 family */ 703 return 0x600; /* P6 family */
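The split into KVM_HPAGE_GFN_SHIFT()/KVM_HPAGE_SHIFT() keeps the same arithmetic; spelled out for x86 (PAGE_SHIFT == 12) it gives:

/*
 *   level x  KVM_HPAGE_GFN_SHIFT(x)  KVM_HPAGE_SHIFT(x)  size    pages/hpage
 *   1        0                       12                  4 KiB   1
 *   2        9                       21                  2 MiB   512
 *   3        18                      30                  1 GiB   262144
 */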
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
new file mode 100644
index 000000000000..36c93b5cc239
--- /dev/null
+++ b/arch/x86/include/asm/local64.h
@@ -0,0 +1 @@
#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f32a4301c4d4..c62c13cb9788 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -38,6 +38,10 @@
38#define MCM_ADDR_MEM 3 /* memory address */ 38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */ 39#define MCM_ADDR_GENERIC 7 /* generic */
40 40
41/* CTL2 register defines */
42#define MCI_CTL2_CMCI_EN (1ULL << 30)
43#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
44
41#define MCJ_CTX_MASK 3 45#define MCJ_CTX_MASK 3
42#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) 46#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
43#define MCJ_CTX_RANDOM 0 /* inject context: random */ 47#define MCJ_CTX_RANDOM 0 /* inject context: random */
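The CMCI control bits move here under MCI_CTL2_* names. A sketch of how a bank's CMCI would be armed with them (illustrative; the real logic lives in the MCE code, and rdmsrl/wrmsrl are the usual MSR accessors):

#include <linux/types.h>
#include <asm/mce.h>
#include <asm/msr.h>

static void example_arm_cmci(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;	/* clear the old threshold */
	val |= MCI_CTL2_CMCI_EN | 1;		/* enable CMCI, threshold = 1 */
	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}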
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62d..16350740edf6 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -13,6 +13,32 @@
13extern int pci_mrst_init(void); 13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15 15
16/*
17 * Medfield is the follow-up to Moorestown; it combines a two-chip solution into
18 * one. Beyond that it also adds an always-on, constant-rate TSC and LAPIC
19 * timers. Medfield is the platform name and the chip is called Penwell;
20 * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
21 * identified via MSRs.
22 */
23enum mrst_cpu_type {
24 MRST_CPU_CHIP_LINCROFT = 1,
25 MRST_CPU_CHIP_PENWELL,
26};
27
28extern enum mrst_cpu_type __mrst_cpu_chip;
29static enum mrst_cpu_type mrst_identify_cpu(void)
30{
31 return __mrst_cpu_chip;
32}
33
34enum mrst_timer_options {
35 MRST_TIMER_DEFAULT,
36 MRST_TIMER_APBT_ONLY,
37 MRST_TIMER_LAPIC_APBT,
38};
39
40extern enum mrst_timer_options mrst_timer_options;
41
16#define SFI_MTMR_MAX_NUM 8 42#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8 43#define SFI_MRTC_MAX 8
18 44
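A sketch of how platform code is expected to consume the new chip and timer enums (illustrative only; the real policy is decided by the Moorestown platform setup code):

#include <asm/mrst.h>

static void example_pick_timer_mode(void)
{
	if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
		/* Penwell has an always-on LAPIC timer, so pair it with APBT */
		mrst_timer_options = MRST_TIMER_LAPIC_APBT;
	else
		mrst_timer_options = MRST_TIMER_APBT_ONLY;
}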
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..986f7790fdb2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
20#define _EFER_LMA 10 /* Long mode active (read-only) */ 20#define _EFER_LMA 10 /* Long mode active (read-only) */
21#define _EFER_NX 11 /* No execute enable */ 21#define _EFER_NX 11 /* No execute enable */
22#define _EFER_SVME 12 /* Enable virtualization */ 22#define _EFER_SVME 12 /* Enable virtualization */
23#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
23#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
24 25
25#define EFER_SCE (1<<_EFER_SCE) 26#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
27#define EFER_LMA (1<<_EFER_LMA) 28#define EFER_LMA (1<<_EFER_LMA)
28#define EFER_NX (1<<_EFER_NX) 29#define EFER_NX (1<<_EFER_NX)
29#define EFER_SVME (1<<_EFER_SVME) 30#define EFER_SVME (1<<_EFER_SVME)
31#define EFER_LMSLE (1<<_EFER_LMSLE)
30#define EFER_FFXSR (1<<_EFER_FFXSR) 32#define EFER_FFXSR (1<<_EFER_FFXSR)
31 33
32/* Intel MSRs. Some also available on other CPUs */ 34/* Intel MSRs. Some also available on other CPUs */
@@ -94,9 +96,6 @@
94#define MSR_IA32_MC0_CTL2 0x00000280 96#define MSR_IA32_MC0_CTL2 0x00000280
95#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 97#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
96 98
97#define CMCI_EN (1ULL << 30)
98#define CMCI_THRESHOLD_MASK 0xffffULL
99
100#define MSR_P6_PERFCTR0 0x000000c1 99#define MSR_P6_PERFCTR0 0x000000c1
101#define MSR_P6_PERFCTR1 0x000000c2 100#define MSR_P6_PERFCTR1 0x000000c2
102#define MSR_P6_EVNTSEL0 0x00000186 101#define MSR_P6_EVNTSEL0 0x00000186
@@ -159,8 +158,6 @@
159#define MSR_K7_FID_VID_STATUS 0xc0010042 158#define MSR_K7_FID_VID_STATUS 0xc0010042
160 159
161/* K6 MSRs */ 160/* K6 MSRs */
162#define MSR_K6_EFER 0xc0000080
163#define MSR_K6_STAR 0xc0000081
164#define MSR_K6_WHCR 0xc0000082 161#define MSR_K6_WHCR 0xc0000082
165#define MSR_K6_UWCCR 0xc0000085 162#define MSR_K6_UWCCR 0xc0000085
166#define MSR_K6_EPMR 0xc0000086 163#define MSR_K6_EPMR 0xc0000086
@@ -224,12 +221,14 @@
224#define MSR_IA32_THERM_CONTROL 0x0000019a 221#define MSR_IA32_THERM_CONTROL 0x0000019a
225#define MSR_IA32_THERM_INTERRUPT 0x0000019b 222#define MSR_IA32_THERM_INTERRUPT 0x0000019b
226 223
227#define THERM_INT_LOW_ENABLE (1 << 0) 224#define THERM_INT_HIGH_ENABLE (1 << 0)
228#define THERM_INT_HIGH_ENABLE (1 << 1) 225#define THERM_INT_LOW_ENABLE (1 << 1)
226#define THERM_INT_PLN_ENABLE (1 << 24)
229 227
230#define MSR_IA32_THERM_STATUS 0x0000019c 228#define MSR_IA32_THERM_STATUS 0x0000019c
231 229
232#define THERM_STATUS_PROCHOT (1 << 0) 230#define THERM_STATUS_PROCHOT (1 << 0)
231#define THERM_STATUS_POWER_LIMIT (1 << 10)
233 232
234#define MSR_THERM2_CTL 0x0000019d 233#define MSR_THERM2_CTL 0x0000019d
235 234
@@ -239,6 +238,19 @@
239 238
240#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 239#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
241 240
241#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
242
243#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
244
245#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
246#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
247
248#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
249
250#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
251#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
252#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
253
242/* MISC_ENABLE bits: architectural */ 254/* MISC_ENABLE bits: architectural */
243#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 255#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
244#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 256#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
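Besides moving the THERM_INT high/low enable bits to their architectural positions, the hunk adds the package-level thermal MSRs. A decoding sketch (illustrative; the actual consumer is the thermal interrupt handler):

#include <linux/kernel.h>
#include <asm/msr.h>

static void example_report_package_thermal(void)
{
	u64 status;

	rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, status);
	if (status & PACKAGE_THERM_STATUS_PROCHOT)
		pr_warning("package PROCHOT asserted\n");
	if (status & PACKAGE_THERM_STATUS_POWER_LIMIT)
		pr_warning("package power limit reached\n");
}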
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c5bc4c2d33f5..084ef95274cd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter)
148#define rdmsr(msr, val1, val2) \ 148#define rdmsr(msr, val1, val2) \
149do { \ 149do { \
150 u64 __val = native_read_msr((msr)); \ 150 u64 __val = native_read_msr((msr)); \
151 (val1) = (u32)__val; \ 151 (void)((val1) = (u32)__val); \
152 (val2) = (u32)(__val >> 32); \ 152 (void)((val2) = (u32)(__val >> 32)); \
153} while (0) 153} while (0)
154 154
155static inline void wrmsr(unsigned msr, unsigned low, unsigned high) 155static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f3341..932f0f86b4b7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
17 17
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); 18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20#if !defined(CONFIG_LOCKUP_DETECTOR)
20extern int nmi_watchdog_enabled; 21extern int nmi_watchdog_enabled;
22#endif
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 23extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int reserve_perfctr_nmi(unsigned int); 24extern int reserve_perfctr_nmi(unsigned int);
23extern void release_perfctr_nmi(unsigned int); 25extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 000000000000..08fde475cb3b
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_OLPC_OFW_H
2#define _ASM_X86_OLPC_OFW_H
3
4/* index into the page table containing the entry OFW occupies */
5#define OLPC_OFW_PDE_NR 1022
6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8
9#ifdef CONFIG_OLPC_OPENFIRMWARE
10
11/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \
13 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
14
15extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
16 void **res);
17
18/* determine whether OFW is available and lives in the proper memory */
19extern void olpc_ofw_detect(void);
20
21/* install OFW's pde permanently into the kernel's pgtable */
22extern void setup_olpc_ofw_pgd(void);
23
24#else /* !CONFIG_OLPC_OPENFIRMWARE */
25
26static inline void olpc_ofw_detect(void) { }
27static inline void setup_olpc_ofw_pgd(void) { }
28
29#endif /* !CONFIG_OLPC_OPENFIRMWARE */
30
31#endif /* _ASM_X86_OLPC_OFW_H */
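The olpc_ofw() wrapper derives the argument and result counts with ARRAY_SIZE(), so both must be real arrays at the call site. A hypothetical call shape (the command name and argument are placeholders, not a documented OFW interface):

#include <linux/kernel.h>
#include <asm/olpc_ofw.h>

static int example_ofw_call(void)
{
	const void *args[] = { "example-argument" };	/* hypothetical */
	void *res[1];

	/* expands to __olpc_ofw("example-command", 1, args, 1, res) */
	return olpc_ofw("example-command", args, res);
}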
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 625c3f0e741a..8ca82839288a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
37#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) 37#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
38/* __pa_symbol should be used for C visible symbols. 38/* __pa_symbol should be used for C visible symbols.
39 This seems to be the official gcc blessed way to do such arithmetic. */ 39 This seems to be the official gcc blessed way to do such arithmetic. */
40/*
41 * We need __phys_reloc_hide() here because gcc may assume that there is no
42 * overflow during __pa() calculation and can optimize it unexpectedly.
43 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
44 * case properly. Once all supported versions of gcc understand it, we can
45 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
46 */
40#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) 47#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
41 48
42#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) 49#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index cd2a31dc5fb8..49c7219826f9 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -30,6 +30,7 @@
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000 32#define PCI_ROOT_NO_CRS 0x100000
33#define PCI_NOASSIGN_BARS 0x200000
33 34
34extern unsigned int pci_probe; 35extern unsigned int pci_probe;
35extern unsigned long pirq_table_addr; 36extern unsigned long pirq_table_addr;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e0..6e742cc4251b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -68,8 +68,9 @@ union cpuid10_eax {
68 68
69union cpuid10_edx { 69union cpuid10_edx {
70 struct { 70 struct {
71 unsigned int num_counters_fixed:4; 71 unsigned int num_counters_fixed:5;
72 unsigned int reserved:28; 72 unsigned int bit_width_fixed:8;
73 unsigned int reserved:19;
73 } split; 74 } split;
74 unsigned int full; 75 unsigned int full;
75}; 76};
@@ -140,6 +141,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
140extern unsigned long perf_misc_flags(struct pt_regs *regs); 141extern unsigned long perf_misc_flags(struct pt_regs *regs);
141#define perf_misc_flags(regs) perf_misc_flags(regs) 142#define perf_misc_flags(regs) perf_misc_flags(regs)
142 143
144#include <asm/stacktrace.h>
145
146/*
147 * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
148 * and the comment with PERF_EFLAGS_EXACT.
149 */
150#define perf_arch_fetch_caller_regs(regs, __ip) { \
151 (regs)->ip = (__ip); \
152 (regs)->bp = caller_frame_pointer(); \
153 (regs)->cs = __KERNEL_CS; \
154 regs->flags = 0; \
155}
156
143#else 157#else
144static inline void init_hw_perf_events(void) { } 158static inline void init_hw_perf_events(void) { }
145static inline void perf_events_lapic_init(void) { } 159static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 64a8ebff06fc..def500776b16 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -19,7 +19,6 @@
19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ 19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
23 22
24#define P4_ESCR_EVENT_MASK 0x7e000000U 23#define P4_ESCR_EVENT_MASK 0x7e000000U
25#define P4_ESCR_EVENT_SHIFT 25 24#define P4_ESCR_EVENT_SHIFT 25
@@ -71,10 +70,6 @@
71#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) 70#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
72#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) 71#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
73 72
74/* Custom bits in reerved CCCR area */
75#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
76
77
78/* Non HT mask */ 73/* Non HT mask */
79#define P4_CCCR_MASK \ 74#define P4_CCCR_MASK \
80 (P4_CCCR_OVF | \ 75 (P4_CCCR_OVF | \
@@ -106,8 +101,7 @@
106 * ESCR and CCCR but rather an only packed value should 101 * ESCR and CCCR but rather an only packed value should
107 * be unpacked and written to a proper addresses 102 * be unpacked and written to a proper addresses
108 * 103 *
109 * the base idea is to pack as much info as 104 * the base idea is to pack as much info as possible
110 * possible
111 */ 105 */
112#define p4_config_pack_escr(v) (((u64)(v)) << 32) 106#define p4_config_pack_escr(v) (((u64)(v)) << 32)
113#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) 107#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
@@ -130,8 +124,6 @@
130 t; \ 124 t; \
131 }) 125 })
132 126
133#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
134
135#define P4_CONFIG_HT_SHIFT 63 127#define P4_CONFIG_HT_SHIFT 63
136#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) 128#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
137 129
@@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
214 return escr; 206 return escr;
215} 207}
216 208
209/*
210 * These are the events which should be used in the "Event Select"
211 * field of the ESCR register; they act as unique keys which allow
212 * the kernel to determine which CCCR and COUNTER should be
213 * used to track an event
214 */
217enum P4_EVENTS { 215enum P4_EVENTS {
218 P4_EVENT_TC_DELIVER_MODE, 216 P4_EVENT_TC_DELIVER_MODE,
219 P4_EVENT_BPU_FETCH_REQUEST, 217 P4_EVENT_BPU_FETCH_REQUEST,
@@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES {
561 * a caller should use P4_ESCR_EMASK_NAME helper to 559 * a caller should use P4_ESCR_EMASK_NAME helper to
562 * pick the EventMask needed, for example 560 * pick the EventMask needed, for example
563 * 561 *
564 * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) 562 * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
565 */ 563 */
566enum P4_ESCR_EMASKS { 564enum P4_ESCR_EMASKS {
567 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), 565 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
@@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS {
753 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), 751 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
754}; 752};
755 753
756/* P4 PEBS: stale for a while */ 754/*
757#define P4_PEBS_METRIC_MASK 0x00001fffU 755 * P4 PEBS specifics (Replay Event only)
758#define P4_PEBS_UOB_TAG 0x01000000U 756 *
759#define P4_PEBS_ENABLE 0x02000000U 757 * Format (bits):
760 758 * 0-6: metric from P4_PEBS_METRIC enum
761/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ 759 * 7 : reserved
762#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 760 * 8 : reserved
763#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 761 * 9-11 : reserved
764#define P4_PEBS__dtlb_load_miss_retired 0x3000004 762 *
765#define P4_PEBS__dtlb_store_miss_retired 0x3000004 763 * Note we have UOP and PEBS bits reserved for now
766#define P4_PEBS__dtlb_all_miss_retired 0x3000004 764 * just in case if we will need them once
767#define P4_PEBS__tagged_mispred_branch 0x3018000 765 */
768#define P4_PEBS__mob_load_replay_retired 0x3000200 766#define P4_PEBS_CONFIG_ENABLE (1 << 7)
769#define P4_PEBS__split_load_retired 0x3000400 767#define P4_PEBS_CONFIG_UOP_TAG (1 << 8)
770#define P4_PEBS__split_store_retired 0x3000400 768#define P4_PEBS_CONFIG_METRIC_MASK 0x3f
771 769#define P4_PEBS_CONFIG_MASK 0xff
772#define P4_VERT__1stl_cache_load_miss_retired 0x0000001 770
773#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 771/*
774#define P4_VERT__dtlb_load_miss_retired 0x0000001 772 * mem: Only counters MSR_IQ_COUNTER4 (16) and
775#define P4_VERT__dtlb_store_miss_retired 0x0000002 773 * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
776#define P4_VERT__dtlb_all_miss_retired 0x0000003 774 */
777#define P4_VERT__tagged_mispred_branch 0x0000010 775#define P4_PEBS_ENABLE 0x02000000U
778#define P4_VERT__mob_load_replay_retired 0x0000001 776#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U
779#define P4_VERT__split_load_retired 0x0000001 777
780#define P4_VERT__split_store_retired 0x0000002 778#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
781 779#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
782enum P4_CACHE_EVENTS { 780
783 P4_CACHE__NONE, 781#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
784 782
785 P4_CACHE__1stl_cache_load_miss_retired, 783enum P4_PEBS_METRIC {
786 P4_CACHE__2ndl_cache_load_miss_retired, 784 P4_PEBS_METRIC__none,
787 P4_CACHE__dtlb_load_miss_retired, 785
788 P4_CACHE__dtlb_store_miss_retired, 786 P4_PEBS_METRIC__1stl_cache_load_miss_retired,
789 P4_CACHE__itlb_reference_hit, 787 P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
790 P4_CACHE__itlb_reference_miss, 788 P4_PEBS_METRIC__dtlb_load_miss_retired,
791 789 P4_PEBS_METRIC__dtlb_store_miss_retired,
792 P4_CACHE__MAX 790 P4_PEBS_METRIC__dtlb_all_miss_retired,
791 P4_PEBS_METRIC__tagged_mispred_branch,
792 P4_PEBS_METRIC__mob_load_replay_retired,
793 P4_PEBS_METRIC__split_load_retired,
794 P4_PEBS_METRIC__split_store_retired,
795
796 P4_PEBS_METRIC__max
793}; 797};
794 798
795#endif /* PERF_EVENT_P4_H */ 799#endif /* PERF_EVENT_P4_H */
800
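A minimal sketch of how the new PEBS packing helpers fit together; this is an illustration only (the perf p4 event code that consumes them is not part of this hunk, and both helper names below are made up):

        /* assumes <linux/types.h> and the perf_event_p4.h definitions above */
        static u64 make_replay_pebs_config_sketch(enum P4_PEBS_METRIC metric)
        {
                /* the metric fits in bits 5:0, the enable flag is bit 7 */
                return (metric & P4_PEBS_CONFIG_METRIC_MASK) | P4_PEBS_CONFIG_ENABLE;
        }

        static bool is_pebs_config_sketch(u64 config)
        {
                /* unpack again with the helpers introduced above */
                return p4_config_pebs_has(config, P4_PEBS_CONFIG_ENABLE) &&
                       p4_config_unpack_metric(config) != P4_PEBS_METRIC__none;
        }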
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 181be528c612..076052cd62be 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
126/* x86-64 always has all page tables mapped. */ 126/* x86-64 always has all page tables mapped. */
127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) 128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) ((void)(pte))/* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
131 131
132#define update_mmu_cache(vma, address, ptep) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
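The point of making the NOP evaluate its argument is warning hygiene; a hedged illustration with assumed context (the function name, pmd and addr are placeholders, not taken from the patch):

        static int pte_is_present_sketch(pmd_t *pmd, unsigned long addr)
        {
                pte_t *pte = pte_offset_map(pmd, addr);
                int present = pte_present(*pte);

                pte_unmap(pte);         /* expands to ((void)(pte)); pte now counts as
                                         * used, so -Wunused-but-set-variable stays
                                         * quiet on x86-64 */
                return present;
        }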
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8ee..325b7bdbebaa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void);
762extern unsigned long boot_option_idle_override; 762extern unsigned long boot_option_idle_override;
763extern unsigned long idle_halt; 763extern unsigned long idle_halt;
764extern unsigned long idle_nomwait; 764extern unsigned long idle_nomwait;
765extern bool c1e_detected;
765 766
766/* 767/*
767 * on systems with caches, caches must be flushed as the absolute 768 * on systems with caches, caches must be flushed as the absolute
@@ -1025,4 +1026,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1025 return ratio; 1026 return ratio;
1026} 1027}
1027 1028
1029/*
1030 * AMD errata checking
1031 */
1032#ifdef CONFIG_CPU_SUP_AMD
1033extern const int amd_erratum_383[];
1034extern const int amd_erratum_400[];
1035extern bool cpu_has_amd_erratum(const int *);
1036
1037#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
1038#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
1039#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
1040 ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
1041#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
1042#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
1043#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
1044
1045#else
1046#define cpu_has_amd_erratum(x) (false)
1047#endif /* CONFIG_CPU_SUP_AMD */
1048
1028#endif /* _ASM_X86_PROCESSOR_H */ 1049#endif /* _ASM_X86_PROCESSOR_H */
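A worked decode of the range packing, using one of the ranges from amd_erratum_400 as defined later in this diff (arch/x86/kernel/cpu/amd.c); derived purely from the macros above:

        /* sketch: family in bits 31:24, (model, stepping) start in 23:12, end in 11:0 */
        const int range = AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf);
        /* packs to 0x10021fff:
         *   AMD_MODEL_RANGE_FAMILY(range) == 0x10
         *   AMD_MODEL_RANGE_START(range)  == 0x021   ((model 0x2  << 4) | stepping 0x1)
         *   AMD_MODEL_RANGE_END(range)    == 0xfff   ((model 0xff << 4) | stepping 0xf)
         */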
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad1..6c7fc25f2c34 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
84#define REQUIRED_MASK5 0 84#define REQUIRED_MASK5 0
85#define REQUIRED_MASK6 0 85#define REQUIRED_MASK6 0
86#define REQUIRED_MASK7 0 86#define REQUIRED_MASK7 0
87#define REQUIRED_MASK8 0
88#define REQUIRED_MASK9 0
87 89
88#endif /* _ASM_X86_REQUIRED_FEATURES_H */ 90#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede126972..d1e41b0f9b60 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
118{ 118{
119 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
120 LOCK_PREFIX _ASM_INC "(%1)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
121 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001 */
122 " jns 1f\n" 122 " jns 1f\n"
123 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
124 "1:\n\t" 124 "1:\n\t"
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 157{
158 rwsem_count_t tmp; 158 rwsem_count_t tmp;
159
160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
161 asm volatile("# beginning down_write\n\t" 159 asm volatile("# beginning down_write\n\t"
162 LOCK_PREFIX " xadd %1,(%2)\n\t" 160 LOCK_PREFIX " xadd %1,(%2)\n\t"
163 /* subtract 0x0000ffff, returns the old value */ 161 /* adds 0xffff0001, returns the old value */
164 " test %1,%1\n\t" 162 " test %1,%1\n\t"
165 /* was the count 0 before? */ 163 /* was the count 0 before? */
166 " jz 1f\n" 164 " jz 1f\n"
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
168 "1:\n" 166 "1:\n"
169 "# ending down_write" 167 "# ending down_write"
170 : "+m" (sem->count), "=d" (tmp) 168 : "+m" (sem->count), "=d" (tmp)
171 : "a" (sem), "1" (tmp) 169 : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
172 : "memory", "cc"); 170 : "memory", "cc");
173} 171}
174 172
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
195 */ 193 */
196static inline void __up_read(struct rw_semaphore *sem) 194static inline void __up_read(struct rw_semaphore *sem)
197{ 195{
198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; 196 rwsem_count_t tmp;
199 asm volatile("# beginning __up_read\n\t" 197 asm volatile("# beginning __up_read\n\t"
200 LOCK_PREFIX " xadd %1,(%2)\n\t" 198 LOCK_PREFIX " xadd %1,(%2)\n\t"
201 /* subtracts 1, returns the old value */ 199 /* subtracts 1, returns the old value */
202 " jns 1f\n\t" 200 " jns 1f\n\t"
203 " call call_rwsem_wake\n" 201 " call call_rwsem_wake\n" /* expects old value in %edx */
204 "1:\n" 202 "1:\n"
205 "# ending __up_read\n" 203 "# ending __up_read\n"
206 : "+m" (sem->count), "=d" (tmp) 204 : "+m" (sem->count), "=d" (tmp)
207 : "a" (sem), "1" (tmp) 205 : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS)
208 : "memory", "cc"); 206 : "memory", "cc");
209} 207}
210 208
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem)
216 rwsem_count_t tmp; 214 rwsem_count_t tmp;
217 asm volatile("# beginning __up_write\n\t" 215 asm volatile("# beginning __up_write\n\t"
218 LOCK_PREFIX " xadd %1,(%2)\n\t" 216 LOCK_PREFIX " xadd %1,(%2)\n\t"
219 /* tries to transition 217 /* subtracts 0xffff0001, returns the old value */
220 0xffff0001 -> 0x00000000 */ 218 " jns 1f\n\t"
221 " jz 1f\n" 219 " call call_rwsem_wake\n" /* expects old value in %edx */
222 " call call_rwsem_wake\n"
223 "1:\n\t" 220 "1:\n\t"
224 "# ending __up_write\n" 221 "# ending __up_write\n"
225 : "+m" (sem->count), "=d" (tmp) 222 : "+m" (sem->count), "=d" (tmp)
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index fb0b1874396f..4240878b9d76 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -3,7 +3,6 @@
3 3
4#include <asm-generic/scatterlist.h> 4#include <asm-generic/scatterlist.h>
5 5
6#define ISA_DMA_THRESHOLD (0x00ffffff)
7#define ARCH_HAS_SG_CHAIN 6#define ARCH_HAS_SG_CHAIN
8 7
9#endif /* _ASM_X86_SCATTERLIST_H */ 8#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 86b1506f4179..ef292c792d74 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align);
82 * executable.) 82 * executable.)
83 */ 83 */
84#define RESERVE_BRK(name,sz) \ 84#define RESERVE_BRK(name,sz) \
85 static void __section(.discard) __used \ 85 static void __section(.discard.text) __used \
86 __brk_reservation_fn_##name##__(void) { \ 86 __brk_reservation_fn_##name##__(void) { \
87 asm volatile ( \ 87 asm volatile ( \
88 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 88 ".pushsection .brk_reservation,\"aw\",@nobits;" \
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 4dab78edbad9..2b16a2ad23dc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,6 +1,13 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
1#ifndef _ASM_X86_STACKTRACE_H 6#ifndef _ASM_X86_STACKTRACE_H
2#define _ASM_X86_STACKTRACE_H 7#define _ASM_X86_STACKTRACE_H
3 8
9#include <linux/uaccess.h>
10
4extern int kstack_depth_to_print; 11extern int kstack_depth_to_print;
5 12
6struct thread_info; 13struct thread_info;
@@ -42,4 +49,46 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
42 unsigned long *stack, unsigned long bp, 49 unsigned long *stack, unsigned long bp,
43 const struct stacktrace_ops *ops, void *data); 50 const struct stacktrace_ops *ops, void *data);
44 51
52#ifdef CONFIG_X86_32
53#define STACKSLOTS_PER_LINE 8
54#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
55#else
56#define STACKSLOTS_PER_LINE 4
57#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
58#endif
59
60extern void
61show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
62 unsigned long *stack, unsigned long bp, char *log_lvl);
63
64extern void
65show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
66 unsigned long *sp, unsigned long bp, char *log_lvl);
67
68extern unsigned int code_bytes;
69
70/* The form of the top of the frame on the stack */
71struct stack_frame {
72 struct stack_frame *next_frame;
73 unsigned long return_address;
74};
75
76struct stack_frame_ia32 {
77 u32 next_frame;
78 u32 return_address;
79};
80
81static inline unsigned long caller_frame_pointer(void)
82{
83 struct stack_frame *frame;
84
85 get_bp(frame);
86
87#ifdef CONFIG_FRAME_POINTER
88 frame = frame->next_frame;
89#endif
90
91 return (unsigned long)frame;
92}
93
45#endif /* _ASM_X86_STACKTRACE_H */ 94#endif /* _ASM_X86_STACKTRACE_H */
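A hedged sketch of how struct stack_frame and caller_frame_pointer() combine to walk a frame-pointer chain; a real unwinder also validates each pointer (for example with probe_kernel_address()), which is omitted here:

        static void print_return_addresses_sketch(void)
        {
                struct stack_frame *frame =
                        (struct stack_frame *)caller_frame_pointer();
                int depth;

                /* follow next_frame links, bounded to keep the sketch finite */
                for (depth = 0; frame && depth < 16; depth++) {
                        printk(KERN_DEBUG "  [<%p>] %pS\n",
                               (void *)frame->return_address,
                               (void *)frame->return_address);
                        frame = frame->next_frame;
                }
        }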
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae7..cb238526a9f1 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -18,13 +18,13 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19 19
20/* ia32/sys_ia32.c */ 20/* ia32/sys_ia32.c */
21asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); 21asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
22asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); 22asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
23 23
24asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); 24asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
25asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); 25asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); 26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
27asmlinkage long sys32_fstatat(unsigned int, char __user *, 27asmlinkage long sys32_fstatat(unsigned int, const char __user *,
28 struct stat64 __user *, int); 28 struct stat64 __user *, int);
29struct mmap_arg_struct32; 29struct mmap_arg_struct32;
30asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); 30asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
49asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 49asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
50 50
51asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 51asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
52asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 52asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
53 53
54asmlinkage long sys32_personality(unsigned long); 54asmlinkage long sys32_personality(unsigned long);
55asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 55asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
56 56
57asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, 57asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
58 compat_uptr_t __user *, struct pt_regs *); 58 compat_uptr_t __user *, struct pt_regs *);
59asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); 59asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
60 60
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
80 80
81/* ia32/ipc32.c */ 81/* ia32/ipc32.c */
82asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); 82asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
83
84asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
85 const char __user *);
83#endif /* _ASM_X86_SYS_IA32_H */ 86#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b43e9a7..feb2ff9bfc2d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *);
23/* kernel/process.c */ 23/* kernel/process.c */
24int sys_fork(struct pt_regs *); 24int sys_fork(struct pt_regs *);
25int sys_vfork(struct pt_regs *); 25int sys_vfork(struct pt_regs *);
26long sys_execve(char __user *, char __user * __user *, 26long sys_execve(const char __user *, char __user * __user *,
27 char __user * __user *, struct pt_regs *); 27 char __user * __user *, struct pt_regs *);
28long sys_clone(unsigned long, unsigned long, void __user *, 28long sys_clone(unsigned long, unsigned long, void __user *,
29 void __user *, struct pt_regs *); 29 void __user *, struct pt_regs *);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e7f4d33c55ed..33ecc3ea8782 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void)
457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
458} 458}
459 459
460/*
461 * We handle most unaligned accesses in hardware. On the other hand
462 * unaligned DMA can be quite expensive on some Nehalem processors.
463 *
464 * Based on this we disable the IP header alignment in network drivers.
465 */
466#define NET_IP_ALIGN 0
460#endif /* _ASM_X86_SYSTEM_H */ 467#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a4..b766a5e8ba0e 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,13 @@
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337 345#define __NR_recvmmsg 337
346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340
346 349
347#ifdef __KERNEL__ 350#ifdef __KERNEL__
348 351
349#define NR_syscalls 338 352#define NR_syscalls 341
350 353
351#define __ARCH_WANT_IPC_PARSE_VERSION 354#define __ARCH_WANT_IPC_PARSE_VERSION
352#define __ARCH_WANT_OLD_READDIR 355#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81e..363e9b8a715b 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,12 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299 664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg) 665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
666#define __NR_fanotify_init 300
667__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
668#define __NR_fanotify_mark 301
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64)
666 672
667#ifndef __NO_STUBS 673#ifndef __NO_STUBS
668#define __ARCH_WANT_OLD_READDIR 674#define __ARCH_WANT_OLD_READDIR
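A hedged userspace sketch of exercising the newly wired-up prlimit64 by number before a libc wrapper exists; it assumes __NR_prlimit64 is visible from updated headers (302 on 64-bit, 340 on 32-bit, per the hunks above) and defines a local stand-in for the kernel's struct rlimit64:

        #include <stdio.h>
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <sys/resource.h>       /* RLIMIT_NOFILE */

        /* local mirror of the kernel's struct rlimit64, for the sketch only */
        struct rlimit64_sketch {
                unsigned long long rlim_cur;
                unsigned long long rlim_max;
        };

        int main(void)
        {
                struct rlimit64_sketch lim;

                /* pid 0 = calling process, NULL new limit = only read the current one */
                if (syscall(__NR_prlimit64, 0, RLIMIT_NOFILE, NULL, &lim) == 0)
                        printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
                               lim.rlim_cur, lim.rlim_max);
                return 0;
        }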
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index aa558ac0306e..42d412fd8b02 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -34,6 +34,7 @@
34 */ 34 */
35 35
36#define UV_ITEMS_PER_DESCRIPTOR 8 36#define UV_ITEMS_PER_DESCRIPTOR 8
37/* the 'throttle' to prevent the hardware stay-busy bug */
37#define MAX_BAU_CONCURRENT 3 38#define MAX_BAU_CONCURRENT 3
38#define UV_CPUS_PER_ACT_STATUS 32 39#define UV_CPUS_PER_ACT_STATUS 32
39#define UV_ACT_STATUS_MASK 0x3 40#define UV_ACT_STATUS_MASK 0x3
@@ -45,10 +46,26 @@
45#define UV_DESC_BASE_PNODE_SHIFT 49 46#define UV_DESC_BASE_PNODE_SHIFT 49
46#define UV_PAYLOADQ_PNODE_SHIFT 49 47#define UV_PAYLOADQ_PNODE_SHIFT 49
47#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" 48#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
49#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
50#define UV_BAU_TUNABLES_DIR "sgi_uv"
51#define UV_BAU_TUNABLES_FILE "bau_tunables"
52#define WHITESPACE " \t\n"
48#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 53#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
49#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 54#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
50#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 55#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
51#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL 56#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
57/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
58#define BAU_MISC_CONTROL_MULT_MASK 3
59
60#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
61/* [30:28] URGENCY_7 an index into a table of times */
62#define BAU_URGENCY_7_SHIFT 28
63#define BAU_URGENCY_7_MASK 7
64
65#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
66/* [45:40] BAU - BAU transaction timeout select - a multiplier */
67#define BAU_TRANS_SHIFT 40
68#define BAU_TRANS_MASK 0x3f
52 69
53/* 70/*
54 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 71 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -59,24 +76,21 @@
59#define DESC_STATUS_SOURCE_TIMEOUT 3 76#define DESC_STATUS_SOURCE_TIMEOUT 3
60 77
61/* 78/*
62 * source side threshholds at which message retries print a warning 79 * delay for 'plugged' timeout retries, in microseconds
63 */
64#define SOURCE_TIMEOUT_LIMIT 20
65#define DESTINATION_TIMEOUT_LIMIT 20
66
67/*
68 * misc. delays, in microseconds
69 */ 80 */
70#define THROTTLE_DELAY 10 81#define PLUGGED_DELAY 10
71#define TIMEOUT_DELAY 10
72#define BIOS_TO 1000
73/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
74 82
75/* 83/*
76 * thresholds at which to use IPI to free resources 84 * thresholds at which to use IPI to free resources
77 */ 85 */
86/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
78#define PLUGSB4RESET 100 87#define PLUGSB4RESET 100
79#define TIMEOUTSB4RESET 100 88/* after this many consecutive timeouts, use IPI to release resources */
89#define TIMEOUTSB4RESET 1
90/* after this many uses of IPI to release resources, give up on the request */
91#define IPI_RESET_LIMIT 1
92/* after this # consecutive successes, bump up the throttle if it was lowered */
93#define COMPLETE_THRESHOLD 5
80 94
81/* 95/*
82 * number of entries in the destination side payload queue 96 * number of entries in the destination side payload queue
@@ -96,6 +110,13 @@
96#define FLUSH_COMPLETE 4 110#define FLUSH_COMPLETE 4
97 111
98/* 112/*
113 * tuning the action when the numalink network is extremely delayed
114 */
115#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
116#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
117#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
118
119/*
99 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) 120 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
100 * If the 'multilevel' flag in the header portion of the descriptor 121 * If the 'multilevel' flag in the header portion of the descriptor
101 * has been set to 0, then endpoint multi-unicast mode is selected. 122 * has been set to 0, then endpoint multi-unicast mode is selected.
@@ -300,37 +321,16 @@ struct bau_payload_queue_entry {
300 /* bytes 24-31 */ 321 /* bytes 24-31 */
301}; 322};
302 323
303/* 324struct msg_desc {
304 * one per-cpu; to locate the software tables 325 struct bau_payload_queue_entry *msg;
305 */ 326 int msg_slot;
306struct bau_control { 327 int sw_ack_slot;
307 struct bau_desc *descriptor_base;
308 struct bau_payload_queue_entry *va_queue_first; 328 struct bau_payload_queue_entry *va_queue_first;
309 struct bau_payload_queue_entry *va_queue_last; 329 struct bau_payload_queue_entry *va_queue_last;
310 struct bau_payload_queue_entry *bau_msg_head; 330};
311 struct bau_control *uvhub_master; 331
312 struct bau_control *socket_master; 332struct reset_args {
313 unsigned long timeout_interval; 333 int sender;
314 atomic_t active_descriptor_count;
315 int max_concurrent;
316 int max_concurrent_constant;
317 int retry_message_scans;
318 int plugged_tries;
319 int timeout_tries;
320 int ipi_attempts;
321 int conseccompletes;
322 short cpu;
323 short uvhub_cpu;
324 short uvhub;
325 short cpus_in_socket;
326 short cpus_in_uvhub;
327 unsigned short message_number;
328 unsigned short uvhub_quiesce;
329 short socket_acknowledge_count[DEST_Q_SIZE];
330 cycles_t send_message;
331 spinlock_t masks_lock;
332 spinlock_t uvhub_lock;
333 spinlock_t queue_lock;
334}; 334};
335 335
336/* 336/*
@@ -344,18 +344,25 @@ struct ptc_stats {
344 unsigned long s_dtimeout; /* destination side timeouts */ 344 unsigned long s_dtimeout; /* destination side timeouts */
345 unsigned long s_time; /* time spent in sending side */ 345 unsigned long s_time; /* time spent in sending side */
346 unsigned long s_retriesok; /* successful retries */ 346 unsigned long s_retriesok; /* successful retries */
347 unsigned long s_ntargcpu; /* number of cpus targeted */ 347 unsigned long s_ntargcpu; /* total number of cpus targeted */
348 unsigned long s_ntarguvhub; /* number of uvhubs targeted */ 348 unsigned long s_ntargself; /* times the sending cpu was targeted */
349 unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */ 349 unsigned long s_ntarglocals; /* targets of cpus on the local blade */
350 unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */ 350 unsigned long s_ntargremotes; /* targets of cpus on remote blades */
351 unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */ 351 unsigned long s_ntarglocaluvhub; /* targets of the local hub */
352 unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */ 352 unsigned long s_ntargremoteuvhub; /* remote hubs targeted */
353 unsigned long s_ntarguvhub1; /* number of times == 1 target hub */ 353 unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
354 unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
355 unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
356 unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
357 unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
358 unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
354 unsigned long s_resets_plug; /* ipi-style resets from plug state */ 359 unsigned long s_resets_plug; /* ipi-style resets from plug state */
355 unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ 360 unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
356 unsigned long s_busy; /* status stayed busy past s/w timer */ 361 unsigned long s_busy; /* status stayed busy past s/w timer */
357 unsigned long s_throttles; /* waits in throttle */ 362 unsigned long s_throttles; /* waits in throttle */
358 unsigned long s_retry_messages; /* retry broadcasts */ 363 unsigned long s_retry_messages; /* retry broadcasts */
364 unsigned long s_bau_reenabled; /* for bau enable/disable */
365 unsigned long s_bau_disabled; /* for bau enable/disable */
359 /* destination statistics */ 366 /* destination statistics */
360 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 367 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
361 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ 368 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
@@ -370,6 +377,52 @@ struct ptc_stats {
370 unsigned long d_rcanceled; /* number of messages canceled by resets */ 377 unsigned long d_rcanceled; /* number of messages canceled by resets */
371}; 378};
372 379
380/*
381 * one per-cpu; to locate the software tables
382 */
383struct bau_control {
384 struct bau_desc *descriptor_base;
385 struct bau_payload_queue_entry *va_queue_first;
386 struct bau_payload_queue_entry *va_queue_last;
387 struct bau_payload_queue_entry *bau_msg_head;
388 struct bau_control *uvhub_master;
389 struct bau_control *socket_master;
390 struct ptc_stats *statp;
391 unsigned long timeout_interval;
392 unsigned long set_bau_on_time;
393 atomic_t active_descriptor_count;
394 int plugged_tries;
395 int timeout_tries;
396 int ipi_attempts;
397 int conseccompletes;
398 int baudisabled;
399 int set_bau_off;
400 short cpu;
401 short uvhub_cpu;
402 short uvhub;
403 short cpus_in_socket;
404 short cpus_in_uvhub;
405 unsigned short message_number;
406 unsigned short uvhub_quiesce;
407 short socket_acknowledge_count[DEST_Q_SIZE];
408 cycles_t send_message;
409 spinlock_t uvhub_lock;
410 spinlock_t queue_lock;
411 /* tunables */
412 int max_bau_concurrent;
413 int max_bau_concurrent_constant;
414 int plugged_delay;
415 int plugsb4reset;
416 int timeoutsb4reset;
417 int ipi_reset_limit;
418 int complete_threshold;
419 int congested_response_us;
420 int congested_reps;
421 int congested_period;
422 cycles_t period_time;
423 long period_requests;
424};
425
373static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) 426static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
374{ 427{
375 return constant_test_bit(uvhub, &dstp->bits[0]); 428 return constant_test_bit(uvhub, &dstp->bits[0]);
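The congestion tunables and the new period_time/period_requests fields suggest a moving-average check; the sketch below is an interpretation of those comments, not code from this patch, and cycles_to_us() is a hypothetical stand-in for the real cycles-to-microseconds conversion:

        static void note_broadcast_sketch(struct bau_control *bcp, cycles_t elapsed)
        {
                bcp->period_time += elapsed;
                bcp->period_requests++;

                if (bcp->period_requests < bcp->congested_reps)
                        return;

                /* average response over the window exceeded the threshold? */
                if (cycles_to_us(bcp->period_time / bcp->period_requests) >
                    bcp->congested_response_us) {
                        bcp->baudisabled = 1;   /* fall back to IPIs for a while... */
                        /* ...and arrange re-enable after congested_period seconds */
                }
        }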
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
257#define EXIT_REASON_IO_INSTRUCTION 30 257#define EXIT_REASON_IO_INSTRUCTION 30
258#define EXIT_REASON_MSR_READ 31 258#define EXIT_REASON_MSR_READ 31
259#define EXIT_REASON_MSR_WRITE 32 259#define EXIT_REASON_MSR_WRITE 32
260#define EXIT_REASON_INVALID_STATE 33
260#define EXIT_REASON_MWAIT_INSTRUCTION 36 261#define EXIT_REASON_MWAIT_INSTRUCTION 36
261#define EXIT_REASON_MONITOR_INSTRUCTION 39 262#define EXIT_REASON_MONITOR_INSTRUCTION 39
262#define EXIT_REASON_PAUSE_INSTRUCTION 40 263#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
266#define EXIT_REASON_EPT_VIOLATION 48 267#define EXIT_REASON_EPT_VIOLATION 48
267#define EXIT_REASON_EPT_MISCONFIG 49 268#define EXIT_REASON_EPT_MISCONFIG 49
268#define EXIT_REASON_WBINVD 54 269#define EXIT_REASON_WBINVD 54
270#define EXIT_REASON_XSETBV 55
269 271
270/* 272/*
271 * Interruption-information format 273 * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
375#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 377#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
376#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 378#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
377 379
380#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
381#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
382
378#define VMX_EPT_DEFAULT_GAW 3 383#define VMX_EPT_DEFAULT_GAW 3
379#define VMX_EPT_MAX_GAW 0x4 384#define VMX_EPT_MAX_GAW 0x4
380#define VMX_EPT_MT_EPTE_SHIFT 3 385#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa6..7fda040a76cd 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
417 return _hypercall2(int, nmi_op, op, arg); 417 return _hypercall2(int, nmi_op, op, arg);
418} 418}
419 419
420static inline unsigned long __must_check
421HYPERVISOR_hvm_op(int op, void *arg)
422{
423 return _hypercall2(unsigned long, hvm_op, op, arg);
424}
425
420static inline void 426static inline void
421MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 427MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
422{ 428{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a400799..bf5f7d32bd08 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
112 */ 112 */
113static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 113static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
114{ 114{
115 extern unsigned long max_mapnr;
116 unsigned long pfn = mfn_to_pfn(mfn); 115 unsigned long pfn = mfn_to_pfn(mfn);
117 if ((pfn < max_mapnr) 116 if (get_phys_to_machine(pfn) != mfn)
118 && !xen_feature(XENFEAT_auto_translated_physmap) 117 return -1; /* force !pfn_valid() */
119 && (get_phys_to_machine(pfn) != mfn))
120 return max_mapnr; /* force !pfn_valid() */
121 /* XXX fixme; not true with sparsemem */
122 return pfn; 118 return pfn;
123} 119}
124 120
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 000000000000..1be1ab7d6a41
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_SWIOTLB_XEN_H
2#define _ASM_X86_SWIOTLB_XEN_H
3
4#ifdef CONFIG_SWIOTLB_XEN
5extern int xen_swiotlb;
6extern int __init pci_xen_swiotlb_detect(void);
7extern void __init pci_xen_swiotlb_init(void);
8#else
9#define xen_swiotlb (0)
10static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
11static inline void __init pci_xen_swiotlb_init(void) { }
12#endif
13
14#endif /* _ASM_X86_SWIOTLB_XEN_H */
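A hedged sketch of the calling pattern the stubs imply: probe first, and only do the bounce-buffer setup when running as a Xen guest that actually needs it. The function name is hypothetical; the real call site lives elsewhere in the series:

        static int __init pci_swiotlb_setup_sketch(void)
        {
                if (pci_xen_swiotlb_detect())   /* cheap probe, 0 unless Xen needs it */
                        pci_xen_swiotlb_init(); /* only then allocate the swiotlb */
                return 0;
        }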
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..c6ce2452f10c 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -3,7 +3,8 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/i387.h> 6
7#define XSTATE_CPUID 0x0000000d
7 8
8#define XSTATE_FP 0x1 9#define XSTATE_FP 0x1
9#define XSTATE_SSE 0x2 10#define XSTATE_SSE 0x2
@@ -13,6 +14,12 @@
13 14
14#define FXSAVE_SIZE 512 15#define FXSAVE_SIZE 512
15 16
17#define XSAVE_HDR_SIZE 64
18#define XSAVE_HDR_OFFSET FXSAVE_SIZE
19
20#define XSAVE_YMM_SIZE 256
21#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
22
16/* 23/*
17 * These are the features that the OS can handle currently. 24 * These are the features that the OS can handle currently.
18 */ 25 */
@@ -26,10 +33,8 @@
26 33
27extern unsigned int xstate_size; 34extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 35extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; 36extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
31 37
32extern void xsave_cntxt_init(void);
33extern void xsave_init(void); 38extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); 39extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
35extern int init_fpu(struct task_struct *child); 40extern int init_fpu(struct task_struct *child);
@@ -59,6 +64,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu)
59static inline int xsave_user(struct xsave_struct __user *buf) 64static inline int xsave_user(struct xsave_struct __user *buf)
60{ 65{
61 int err; 66 int err;
67
68 /*
69 * Clear the xsave header first, so that reserved fields are
70 * initialized to zero.
71 */
72 err = __clear_user(&buf->xsave_hdr,
73 sizeof(struct xsave_hdr_struct));
74 if (unlikely(err))
75 return -EFAULT;
76
62 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" 77 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
63 "2:\n" 78 "2:\n"
64 ".section .fixup,\"ax\"\n" 79 ".section .fixup,\"ax\"\n"
@@ -111,12 +126,25 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
111 : "memory"); 126 : "memory");
112} 127}
113 128
129static inline void xsave_state(struct xsave_struct *fx, u64 mask)
130{
131 u32 lmask = mask;
132 u32 hmask = mask >> 32;
133
134 asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
135 : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
136 : "memory");
137}
138
114static inline void fpu_xsave(struct fpu *fpu) 139static inline void fpu_xsave(struct fpu *fpu)
115{ 140{
116 /* This, however, we can work around by forcing the compiler to select 141 /* This, however, we can work around by forcing the compiler to select
117 an addressing mode that doesn't require extended registers. */ 142 an addressing mode that doesn't require extended registers. */
118 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" 143 alternative_input(
119 : : "D" (&(fpu->state->xsave)), 144 ".byte " REX_PREFIX "0x0f,0xae,0x27",
120 "a" (-1), "d"(-1) : "memory"); 145 ".byte " REX_PREFIX "0x0f,0xae,0x37",
146 X86_FEATURE_XSAVEOPT,
147 [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
148 "memory");
121} 149}
122#endif 150#endif
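A small sketch of what the new layout constants say about the xsave area, derived only from the macros above: the 512-byte legacy FXSAVE image comes first, the 64-byte xsave header follows it, and the YMM state starts at byte 576:

        static void *ymm_area_sketch(struct xsave_struct *xsave)
        {
                /* XSAVE_YMM_OFFSET == XSAVE_HDR_OFFSET (512) + XSAVE_HDR_SIZE (64) == 576 */
                return (char *)xsave + XSAVE_YMM_OFFSET;
        }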
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..0925676266bd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o
104scx200-y += scx200_32.o 104scx200-y += scx200_32.o
105 105
106obj-$(CONFIG_OLPC) += olpc.o 106obj-$(CONFIG_OLPC) += olpc.o
107obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
107obj-$(CONFIG_X86_MRST) += mrst.o 108obj-$(CONFIG_X86_MRST) += mrst.o
108 109
109microcode-y := microcode_core.o 110microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
104 movl %eax, %ecx 104 movl %eax, %ecx
105 orl %edx, %ecx 105 orl %edx, %ecx
106 jz 1f 106 jz 1f
107 movl $0xc0000080, %ecx 107 movl $MSR_EFER, %ecx
108 wrmsr 108 wrmsr
1091: 1091:
110 110
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fcc3c61fdecc..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
2 * sleep.c - x86-specific ACPI sleep support. 2 * sleep.c - x86-specific ACPI sleep support.
3 * 3 *
4 * Copyright (C) 2001-2003 Patrick Mochel 4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
6 */ 6 */
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
214 u8 *instr = a->instr; 214 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 215 BUG_ON(a->replacementlen > a->instrlen);
216 BUG_ON(a->instrlen > sizeof(insnbuf)); 216 BUG_ON(a->instrlen > sizeof(insnbuf));
217 BUG_ON(a->cpuid >= NCAPINTS*32);
217 if (!boot_cpu_has(a->cpuid)) 218 if (!boot_cpu_has(a->cpuid))
218 continue; 219 continue;
219#ifdef CONFIG_X86_64 220#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain, 2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2573 unsigned long cap) 2573 unsigned long cap)
2574{ 2574{
2575 switch (cap) {
2576 case IOMMU_CAP_CACHE_COHERENCY:
2577 return 1;
2578 }
2579
2575 return 0; 2580 return 0;
2576} 2581}
2577 2582
@@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
2609 2614
2610 pt_domain->mode |= PAGE_MODE_NONE; 2615 pt_domain->mode |= PAGE_MODE_NONE;
2611 2616
2612 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2617 for_each_pci_dev(dev) {
2613
2614 if (!check_device(&dev->dev)) 2618 if (!check_device(&dev->dev))
2615 continue; 2619 continue;
2616 2620
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
43 43
44#include <asm/fixmap.h> 44#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 45#include <asm/apb_timer.h>
46#include <asm/mrst.h>
46 47
47#define APBT_MASK CLOCKSOURCE_MASK(32) 48#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22 49#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150 50#define APBT_CLOCKEVENT_RATING 110
50#define APBT_CLOCKSOURCE_RATING 250 51#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200 52#define APBT_MIN_DELTA_USEC 200
52 53
@@ -83,8 +84,6 @@ struct apbt_dev {
83 char name[10]; 84 char name[10];
84}; 85};
85 86
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89 88
90#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
195}; 194};
196 195
197/* 196/*
198 * if user does not want to use per CPU apb timer, just give it a lower rating
199 * than local apic timer and skip the late per cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit 197 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0. 198 * then load initial load count to ~0.
223 */ 199 */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
335 adev->num = smp_processor_id(); 311 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337 313
338 if (disable_apbt_percpu) { 314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt; 316 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n", 317 printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
429 405
430static __init int apbt_late_init(void) 406static __init int apbt_late_init(void)
431{ 407{
432 if (disable_apbt_percpu || !apb_timer_block_enabled) 408 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
409 !apb_timer_block_enabled)
433 return 0; 410 return 0;
434 /* This notifier should be called after workqueue is ready */ 411 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20); 412 hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
450 int timer_num; 427 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); 428 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452 429
430 BUG_ON(!apbt_virt_address);
431
453 timer_num = adev->num; 432 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n", 433 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode); 434 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
676 } 655 }
677#ifdef CONFIG_SMP 656#ifdef CONFIG_SMP
678 /* kernel cmdline disable apb timer, so we will use lapic timers */ 657 /* kernel cmdline disable apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) { 658 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n"); 659 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return; 660 return;
682 } 661 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
280 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
281 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
282 */ 282 */
283 u32 agp_aper_base = 0, agp_aper_order = 0; 283 u32 agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0; 284 int i, fix, slot, valid_agp = 0;
285 u32 ctl; 285 u32 ctl;
286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
291 return; 291 return;
292 292
293 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); 294 search_agp_bridge(&agp_aper_order, &valid_agp);
295 295
296 fix = 0; 296 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 11obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 12obj-$(CONFIG_SMP) += ipi.o
8 13
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6cab..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
460} 460}
461 461
462/* 462/*
463 * Setup the local APIC timer for this CPU. Copy the initilized values 463 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 464 * of the boot CPU and register the clock event in the framework.
465 */ 465 */
466static void __cpuinit setup_APIC_timer(void) 466static void __cpuinit setup_APIC_timer(void)
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
1606 * acpi lapic path already maps that address in 1606 * acpi lapic path already maps that address in
1607 * acpi_register_lapic_address() 1607 * acpi_register_lapic_address()
1608 */ 1608 */
1609 if (!acpi_lapic) 1609 if (!acpi_lapic && !smp_found_config)
1610 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1610 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1611 1611
1612 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", 1612 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
129 * GSI override for ES7000 platforms. 129 * GSI override for ES7000 platforms.
130 */ 130 */
131 131
132static unsigned int base;
133 132
134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 133static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
135{ 134{
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
1/*
2 * HW NMI watchdog support
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * Arch specific calls to support NMI watchdog
7 *
8 * Bits copied from original nmi.c file
9 *
10 */
11#include <asm/apic.h>
12
13#include <linux/cpumask.h>
14#include <linux/kdebug.h>
15#include <linux/notifier.h>
16#include <linux/kprobes.h>
17#include <linux/nmi.h>
18#include <linux/module.h>
19
20/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22
23u64 hw_nmi_get_sample_period(void)
24{
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void)
30{
31 int i;
32
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34
35 printk(KERN_INFO "sending NMI to all CPUs:\n");
36 apic->send_IPI_all(NMI_VECTOR);
37
38 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
39 for (i = 0; i < 10 * 1000; i++) {
40 if (cpumask_empty(to_cpumask(backtrace_mask)))
41 break;
42 mdelay(1);
43 }
44}
45
46static int __kprobes
47arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
48 unsigned long cmd, void *__args)
49{
50 struct die_args *args = __args;
51 struct pt_regs *regs;
52 int cpu = smp_processor_id();
53
54 switch (cmd) {
55 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break;
58
59 default:
60 return NOTIFY_DONE;
61 }
62
63 regs = args->regs;
64
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
67
68 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP;
75 }
76
77 return NOTIFY_DONE;
78}
79
80static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL,
83 .priority = 1
84};
85
86static int __init register_trigger_all_cpu_backtrace(void)
87{
88 register_die_notifier(&backtrace_notifier);
89 return 0;
90}
91early_initcall(register_trigger_all_cpu_backtrace);
92#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3397 3397
3398 cfg = desc->chip_data; 3398 cfg = desc->chip_data;
3399 3399
3400 read_msi_msg_desc(desc, &msg); 3400 get_cached_msi_msg_desc(desc, &msg);
3401 3401
3402 msg.data &= ~MSI_DATA_VECTOR_MASK; 3402 msg.data &= ~MSI_DATA_VECTOR_MASK;
3403 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3403 msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
401 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
402 int rc = 0; 402 int rc = 0;
403 403
404 /* check for other users first */
405 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
406 == NOTIFY_STOP) {
407 rc = 1;
408 touched = 1;
409 }
410
411 sum = get_timer_irqs(cpu); 404 sum = get_timer_irqs(cpu);
412 405
413 if (__get_cpu_var(nmi_touch)) { 406 if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..7b598b84c902 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{ 604{
605 if (reason != DIE_NMI_IPI) 605 if (reason != DIE_NMI_IPI)
606 return NOTIFY_OK; 606 return NOTIFY_OK;
607
608 if (in_crash_kexec)
609 /* do nothing if entering the crash kernel */
610 return NOTIFY_OK;
607 /* 611 /*
608 * Use a lock so only one cpu prints at a time 612 * Use a lock so only one cpu prints at a time
609 * to prevent intermixed output. 613 * to prevent intermixed output.
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3ac..4c9c67bf09b7 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
140 * is now the way life works). 140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return). 141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend. 142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> 143 * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
144 * modified by sfr). 144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid 145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr). 146 * <andy_henroid@yahoo.com> fixed by sfr).
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
12nostackp := $(call cc-option, -fno-stack-protector) 12nostackp := $(call cc-option, -fno-stack-protector)
13CFLAGS_common.o := $(nostackp) 13CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..60a57b13082d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
466 } 466 }
467 467
468 } 468 }
469 if (c->x86 == 0x10 || c->x86 == 0x11) 469 if (c->x86 >= 0x10)
470 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 470 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
471 471
472 /* get apicid instead of initial apic id from cpuid */ 472 /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
529 num_cache_leaves = 3; 529 num_cache_leaves = 3;
530 } 530 }
531 531
532 if (c->x86 >= 0xf && c->x86 <= 0x11) 532 if (c->x86 >= 0xf)
533 set_cpu_cap(c, X86_FEATURE_K8); 533 set_cpu_cap(c, X86_FEATURE_K8);
534 534
535 if (cpu_has_xmm2) { 535 if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
546 fam10h_check_enable_mmcfg(); 546 fam10h_check_enable_mmcfg();
547 } 547 }
548 548
549 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { 549 if (c == &boot_cpu_data && c->x86 >= 0xf) {
550 unsigned long long tseg; 550 unsigned long long tseg;
551 551
552 /* 552 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
609}; 609};
610 610
611cpu_dev_register(amd_cpu_dev); 611cpu_dev_register(amd_cpu_dev);
612
613/*
614 * AMD errata checking
615 *
616 * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
617 * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
618 * have an OSVW id assigned, which it takes as first argument. Both take a
619 * variable number of family-specific model-stepping ranges created by
620 * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
621 * int[] in arch/x86/include/asm/processor.h.
622 *
623 * Example:
624 *
625 * const int amd_erratum_319[] =
626 * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
627 * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
628 * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
629 */
630
631const int amd_erratum_400[] =
632 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
633 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
634EXPORT_SYMBOL_GPL(amd_erratum_400);
635
636const int amd_erratum_383[] =
637 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
638EXPORT_SYMBOL_GPL(amd_erratum_383);
639
640bool cpu_has_amd_erratum(const int *erratum)
641{
642 struct cpuinfo_x86 *cpu = &current_cpu_data;
643 int osvw_id = *erratum++;
644 u32 range;
645 u32 ms;
646
647 /*
648 * If called early enough that current_cpu_data hasn't been initialized
649 * yet, fall back to boot_cpu_data.
650 */
651 if (cpu->x86 == 0)
652 cpu = &boot_cpu_data;
653
654 if (cpu->x86_vendor != X86_VENDOR_AMD)
655 return false;
656
657 if (osvw_id >= 0 && osvw_id < 65536 &&
658 cpu_has(cpu, X86_FEATURE_OSVW)) {
659 u64 osvw_len;
660
661 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
662 if (osvw_id < osvw_len) {
663 u64 osvw_bits;
664
665 rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
666 osvw_bits);
667 return osvw_bits & (1ULL << (osvw_id & 0x3f));
668 }
669 }
670
671 /* OSVW unavailable or ID unknown, match family-model-stepping range */
672 ms = (cpu->x86_model << 8) | cpu->x86_mask;
673 while ((range = *erratum++))
674 if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
675 (ms >= AMD_MODEL_RANGE_START(range)) &&
676 (ms <= AMD_MODEL_RANGE_END(range)))
677 return true;
678
679 return false;
680}
681
682EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
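A hedged usage sketch of the new interface; the real callers (for example the C1E detection and the erratum 383 handling) are outside this hunk, and the wrapper name below is made up:

        static bool needs_c1e_idle_workaround_sketch(void)
        {
                /* amd_erratum_400 covers the C1E-affected family ranges declared above */
                return cpu_has_amd_erratum(amd_erratum_400);
        }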
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..490dac63c2d2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
140static int __init x86_xsave_setup(char *s) 140static int __init x86_xsave_setup(char *s)
141{ 141{
142 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
143 return 1; 144 return 1;
144} 145}
145__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
146 147
148static int __init x86_xsaveopt_setup(char *s)
149{
150 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
151 return 1;
152}
153__setup("noxsaveopt", x86_xsaveopt_setup);
154
147#ifdef CONFIG_X86_32 155#ifdef CONFIG_X86_32
148static int cachesize_override __cpuinitdata = -1; 156static int cachesize_override __cpuinitdata = -1;
149static int disable_x86_serial_nr __cpuinitdata = 1; 157static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 559 c->x86_capability[4] = excap;
552 } 560 }
553 561
562 /* Additional Intel-defined flags: level 0x00000007 */
563 if (c->cpuid_level >= 0x00000007) {
564 u32 eax, ebx, ecx, edx;
565
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567
568 if (eax > 0)
569 c->x86_capability[9] = ebx;
570 }
571
554 /* AMD-defined flags: level 0x80000001 */ 572 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 573 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 574 c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 594 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 595 c->x86_power = cpuid_edx(0x80000007);
578 596
597 init_scattered_cpuid_features(c);
579} 598}
580 599
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 600static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 750
732 get_model_name(c); /* Default name */ 751 get_model_name(c); /* Default name */
733 752
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 753 detect_nopl(c);
736} 754}
737 755
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
1192 dbg_restore_debug_regs(); 1210 dbg_restore_debug_regs();
1193 1211
1194 fpu_init(); 1212 fpu_init();
1213 xsave_init();
1195 1214
1196 raw_local_save_flags(kernel_eflags); 1215 raw_local_save_flags(kernel_eflags);
1197 1216
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
1252 clear_used_math(); 1271 clear_used_math();
1253 mxcsr_feature_mask_init(); 1272 mxcsr_feature_mask_init();
1254 1273
1255 /* 1274 fpu_init();
1256 * Boot processor to setup the FP and extended state context info.
1257 */
1258 if (smp_processor_id() == boot_cpu_id)
1259 init_thread_xstate();
1260
1261 xsave_init(); 1275 xsave_init();
1262} 1276}
1263#endif 1277#endif
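For illustration: a minimal sketch of the leaf-0x7 probe added to get_cpu_cap() above. Word 9 of x86_capability receives EBX of CPUID(EAX=7, ECX=0), and the hunk only stores it when EAX reports a non-zero sub-leaf count. The helper name is hypothetical.

#include <linux/init.h>
#include <linux/types.h>
#include <asm/processor.h>

static void __cpuinit read_leaf7_flags(struct cpuinfo_x86 *c)
{
	u32 eax, ebx, ecx, edx;

	if (c->cpuid_level < 0x00000007)
		return;

	/* sub-leaf 0 of the structured extended feature flags leaf */
	cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);

	if (eax > 0)
		c->x86_capability[9] = ebx;
}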
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40ee..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <trace/events/power.h>
38 37
39#include <linux/acpi.h> 38#include <linux/acpi.h>
40#include <linux/io.h> 39#include <linux/io.h>
@@ -73,7 +72,7 @@ struct acpi_cpufreq_data {
73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
74 73
75/* acpi_perf_data is a pointer to percpu data. */ 74/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 75static struct acpi_processor_performance __percpu *acpi_perf_data;
77 76
78static struct cpufreq_driver acpi_cpufreq_driver; 77static struct cpufreq_driver acpi_cpufreq_driver;
79 78
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
324 } 323 }
325 } 324 }
326 325
327 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
328
329 switch (data->cpu_feature) { 326 switch (data->cpu_feature) {
330 case SYSTEM_INTEL_MSR_CAPABLE: 327 case SYSTEM_INTEL_MSR_CAPABLE:
331 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -351,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
351 348
352 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
353 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
354 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
355 freqs.cpu = i; 352 freqs.cpu = i;
356 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
357 } 354 }
@@ -367,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
367 } 364 }
368 } 365 }
369 366
370 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
371 freqs.cpu = i; 368 freqs.cpu = i;
372 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
373 } 370 }
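For illustration: a minimal sketch of the notifier loop after the change above; transition notifications now walk policy->cpus instead of the per-command mask. Function and parameter names are hypothetical.

#include <linux/cpufreq.h>
#include <linux/cpumask.h>

static void example_notify_transition(struct cpufreq_policy *policy,
				      unsigned int old_khz,
				      unsigned int new_khz)
{
	struct cpufreq_freqs freqs;
	unsigned int i;

	freqs.old = old_khz;
	freqs.new = new_khz;

	for_each_cpu(i, policy->cpus) {
		freqs.cpu = i;
		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
	}

	/* ... program the hardware here ... */

	for_each_cpu(i, policy->cpus) {
		freqs.cpu = i;
		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
	}
}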
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
169 * Low Level chipset interface * 169 * Low Level chipset interface *
170 ****************************************************************/ 170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 PCI_ANY_ID, PCI_ANY_ID },
176 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
177 PCI_ANY_ID, PCI_ANY_ID },
178 { 0, }, 175 { 0, },
179}; 176};
180 177
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
199 } 196 }
200 197
201 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
202 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
203 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
204 return gx_pci; 201 return gx_pci;
205 } 202 }
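For illustration: a minimal sketch of the detection pattern the hunk switches to; PCI_VDEVICE() fills in vendor/device and wildcards the subsystem IDs, and for_each_pci_dev() replaces the open-coded pci_get_device() walk. Table and helper names are hypothetical.

#include <linux/pci.h>

static const struct pci_device_id example_tbl[] = {
	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
	{ 0, },
};

static struct pci_dev *example_detect(void)
{
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev)
		if (pci_match_id(example_tbl, pdev))
			return pdev;	/* reference stays held for the caller */

	return NULL;
}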
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
258} 258}
259 259
260 260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{ 262{
263 int result = 0; 263 int result = 0;
264 264
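For illustration: the __init to __cpuinit moves in longhaul/longrun (and powernow-k7 below) exist because cpufreq ->init callbacks can run again after boot when a CPU is (re)onlined, so the code and its tables must not be discarded with the init sections. A minimal sketch under that assumption; names and values are hypothetical.

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/cpufreq.h>

static const int __cpuinitdata example_mults[4] = { 30, 40, 50, 60 };

static int __cpuinit example_cpu_init(struct cpufreq_policy *policy)
{
	/* may run long after boot, e.g. when a CPU is brought back online */
	policy->cpuinfo.transition_latency = 200 * 1000;	/* ns, made up */

	return example_mults[0] > 0 ? 0 : -ENODEV;
}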
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
178 } 178 }
179 } 179 }
180 180
181 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF)
182 if (!cpu_has(c, X86_FEATURE_EST))
183 printk(KERN_WARNING PFX "Unknown CPU. "
184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
186 return 0; 182 return 0;
187 }
188 183
189 /* on P-4s, the TSC runs with constant frequency independent whether 184 /* on P-4s, the TSC runs with constant frequency independent whether
190 * throttling is active or not. */ 185 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..994230d4dc4e 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
110 u32 output_offset; 110 u32 output_offset;
111}; 111};
112 112
113static struct pcc_cpu *pcc_cpu_info; 113static struct pcc_cpu __percpu *pcc_cpu_info;
114 114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy) 115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{ 116{
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3e90cce3dc8b..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 * 15 *
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
806 * www.amd.com 806 * www.amd.com
807 */ 807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
809 return -ENODEV; 811 return -ENODEV;
810} 812}
811 813
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
910{ 912{
911 int i; 913 int i;
912 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
913 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
914 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
915 917
916 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
917 u32 index; 919 u32 index;
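For illustration: a minimal sketch of the rdmsr() argument-order fix above; rdmsr(msr, lo, hi) returns EAX in lo and EDX in hi, and the HW_PSTATE_MAX field lives in the low word. HW_PSTATE_MAX_* are assumed to come from powernow-k8.h; the helper name is hypothetical.

#include <linux/types.h>
#include <asm/msr.h>

static unsigned int example_read_max_hw_pstate(void)
{
	u32 lo, hi;

	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);	/* lo = EAX, hi = EDX */

	return (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
}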
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware, 35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv, 36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm,
39#endif
37}; 40};
38 41
39const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
347 return l3; 347 return l3;
348} 348}
349 349
350static void __cpuinit 350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351 int index)
352{ 352{
353 int node; 353 int node;
354 354
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
396 this_leaf->l3 = l3_caches[node]; 396 this_leaf->l3 = l3_caches[node];
397} 397}
398 398
399/*
400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
 404 * @returns: the disabled index if used, or a negative value if the slot is free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{
408 unsigned int reg = 0;
409
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
415
416 return -1;
417}
418
399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
400 unsigned int slot) 420 unsigned int slot)
401{ 421{
402 struct pci_dev *dev = this_leaf->l3->dev; 422 int index;
403 unsigned int reg = 0;
404 423
405 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
406 return -EINVAL; 425 return -EINVAL;
407 426
408 if (!dev) 427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
409 return -EINVAL; 428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
410 430
411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg); 431 return sprintf(buf, "FREE\n");
412 return sprintf(buf, "0x%08x\n", reg);
413} 432}
414 433
415#define SHOW_CACHE_DISABLE(slot) \ 434#define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
451 } 470 }
452} 471}
453 472
454 473/*
 455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 474 * disable an L3 cache index by using a disable-slot
456 const char *buf, size_t count, 475 *
457 unsigned int slot) 476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
458{ 485{
459 struct pci_dev *dev = this_leaf->l3->dev; 486 int ret = 0;
460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
461 unsigned long val = 0;
462 487
463#define SUBCACHE_MASK (3UL << 20) 488#define SUBCACHE_MASK (3UL << 20)
464#define SUBCACHE_INDEX 0xfff 489#define SUBCACHE_INDEX 0xfff
465 490
466 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
467 return -EINVAL; 497 return -EINVAL;
468 498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
469 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
470 return -EPERM; 524 return -EPERM;
471 525
472 if (!dev) 526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
473 return -EINVAL; 527 return -EINVAL;
474 528
475 if (strict_strtoul(buf, 10, &val) < 0) 529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
476 return -EINVAL;
477 530
478 /* do not allow writes outside of allowed bits */ 531 if (strict_strtoul(buf, 10, &val) < 0)
479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
481 return -EINVAL; 532 return -EINVAL;
482 533
483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val); 534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
484 535 if (err) {
536 if (err == -EEXIST)
537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
538 slot);
539 return err;
540 }
485 return count; 541 return count;
486} 542}
487 543
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
502 558
503#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_CPU_SUP_AMD */
504static void __cpuinit 560static void __cpuinit
505amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
506{ 562{
507}; 563};
508#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
518 574
519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
520 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
521 amd_check_l3_disable(index, this_leaf); 577 amd_check_l3_disable(this_leaf, index);
522 } else { 578 } else {
523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
524 } 580 }
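For illustration: a minimal sketch of how the new slot helpers compose, assuming struct amd_l3_cache and both helpers are visible to the caller (in this file they are). The wrapper and the notion of a known-bad index are hypothetical.

#include <linux/errno.h>

static int example_disable_l3_index(struct amd_l3_cache *l3, int cpu,
				    unsigned long bad_index)
{
	unsigned int slot;

	for (slot = 0; slot < 2; slot++) {
		/* < 0 means the slot is free, otherwise it holds an index */
		if (amd_get_l3_disable_slot(l3, slot) < 0)
			return amd_set_l3_disable_slot(l3, cpu, slot,
						       bad_index);
	}

	return -EBUSY;
}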
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc42562250..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
51static DEFINE_MUTEX(mce_read_mutex); 51static DEFINE_MUTEX(mce_read_mutex);
52 52
53#define rcu_dereference_check_mce(p) \ 53#define rcu_dereference_check_mce(p) \
54 rcu_dereference_check((p), \ 54 rcu_dereference_index_check((p), \
55 rcu_read_lock_sched_held() || \ 55 rcu_read_lock_sched_held() || \
56 lockdep_is_held(&mce_read_mutex)) 56 lockdep_is_held(&mce_read_mutex))
57 57
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
107static int default_decode_mce(struct notifier_block *nb, unsigned long val, 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108 void *data) 108 void *data)
109{ 109{
110 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112 112
113 return NOTIFY_STOP; 113 return NOTIFY_STOP;
114} 114}
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
211 211
212static void print_mce(struct mce *m) 212static void print_mce(struct mce *m)
213{ 213{
214 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215 m->extcpu, m->mcgstatus, m->bank, m->status); 215 m->extcpu, m->mcgstatus, m->bank, m->status);
216 216
217 if (m->ip) { 217 if (m->ip) {
218 pr_emerg("RIP%s %02x:<%016Lx> ", 218 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
220 m->cs, m->ip); 220 m->cs, m->ip);
221 221
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
224 pr_cont("\n"); 224 pr_cont("\n");
225 } 225 }
226 226
227 pr_emerg("TSC %llx ", m->tsc); 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
228 if (m->addr) 228 if (m->addr)
229 pr_cont("ADDR %llx ", m->addr); 229 pr_cont("ADDR %llx ", m->addr);
230 if (m->misc) 230 if (m->misc)
231 pr_cont("MISC %llx ", m->misc); 231 pr_cont("MISC %llx ", m->misc);
232 232
233 pr_cont("\n"); 233 pr_cont("\n");
234 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
236 236
237 /* 237 /*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
242} 242}
243 243
244static void print_mce_head(void)
245{
246 pr_emerg("\nHARDWARE ERROR\n");
247}
248
249static void print_mce_tail(void)
250{
251 pr_emerg("This is not a software problem!\n");
252}
253
254#define PANIC_TIMEOUT 5 /* 5 seconds */ 244#define PANIC_TIMEOUT 5 /* 5 seconds */
255 245
256static atomic_t mce_paniced; 246static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
291 if (atomic_inc_return(&mce_fake_paniced) > 1) 281 if (atomic_inc_return(&mce_fake_paniced) > 1)
292 return; 282 return;
293 } 283 }
294 print_mce_head();
295 /* First print corrected ones that are still unlogged */ 284 /* First print corrected ones that are still unlogged */
296 for (i = 0; i < MCE_LOG_LEN; i++) { 285 for (i = 0; i < MCE_LOG_LEN; i++) {
297 struct mce *m = &mcelog.entry[i]; 286 struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
322 apei_err = apei_write_mce(final); 311 apei_err = apei_write_mce(final);
323 } 312 }
324 if (cpu_missing) 313 if (cpu_missing)
325 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
326 print_mce_tail();
327 if (exp) 315 if (exp)
328 printk(KERN_EMERG "Machine check: %s\n", exp); 316 pr_emerg(HW_ERR "Machine check: %s\n", exp);
329 if (!fake_panic) { 317 if (!fake_panic) {
330 if (panic_timeout == 0) 318 if (panic_timeout == 0)
331 panic_timeout = mce_panic_timeout; 319 panic_timeout = mce_panic_timeout;
332 panic(msg); 320 panic(msg);
333 } else 321 } else
334 printk(KERN_EMERG "Fake kernel panic: %s\n", msg); 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
335} 323}
336 324
337/* Support code for software error injection */ 325/* Support code for software error injection */
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
600 */ 588 */
601 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
602 mce_log(&m); 590 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
603 add_taint(TAINT_MACHINE_CHECK); 592 add_taint(TAINT_MACHINE_CHECK);
604 } 593 }
605 594
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void)
1220 schedule_work(&mce_trigger_work); 1209 schedule_work(&mce_trigger_work);
1221 1210
1222 if (__ratelimit(&ratelimit)) 1211 if (__ratelimit(&ratelimit))
1223 printk(KERN_INFO "Machine check events logged\n"); 1212 pr_info(HW_ERR "Machine check events logged\n");
1224 1213
1225 return 1; 1214 return 1;
1226 } 1215 }
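For illustration: a minimal sketch of the logging convention adopted above; MCE lines now carry the HW_ERR prefix ("[Hardware Error]: " from <linux/kernel.h>) so hardware errors can be filtered out of the log. The helper is hypothetical.

#include <linux/kernel.h>
#include <linux/types.h>

static void example_report(u64 status)
{
	pr_emerg(HW_ERR "Machine check status: %016llx\n", status);
}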
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
95 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 95 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
96 96
97 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
98 if (val & CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
99 if (test_and_clear_bit(i, owned) && !boot) 99 if (test_and_clear_bit(i, owned) && !boot)
100 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
101 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
102 continue; 102 continue;
103 } 103 }
104 104
105 val |= CMCI_EN | CMCI_THRESHOLD; 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
106 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
107 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
108 109
109 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
110 if (val & CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
111 if (!test_and_set_bit(i, owned) && !boot) 112 if (!test_and_set_bit(i, owned) && !boot)
112 print_update("CMCI", &hdr, i); 113 print_update("CMCI", &hdr, i);
113 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
155 continue; 156 continue;
156 /* Disable CMCI */ 157 /* Disable CMCI */
157 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
158 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
159 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
160 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
161 } 162 }
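For illustration: a minimal sketch of the read-modify-write the hunk introduces; the stale threshold field is cleared before MCI_CTL2_CMCI_EN and the new threshold are set, and the read-back tells whether the bank actually supports CMCI. MCI_CTL2_* are assumed to come from <asm/mce.h>; the helper name is hypothetical.

#include <linux/types.h>
#include <asm/mce.h>
#include <asm/msr.h>

static bool example_try_enable_cmci(int bank, u64 threshold)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
	val |= MCI_CTL2_CMCI_EN | threshold;
	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);

	/* did the enable bit stick? */
	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	return val & MCI_CTL2_CMCI_EN;
}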
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..c2a8b26d4fea 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
151{ 206{
152 return sysfs_create_group(&sys_dev->kobj, 207 int err;
153 &thermal_throttle_attr_group); 208 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
209
210 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
211 if (err)
212 return err;
213
214 if (cpu_has(c, X86_FEATURE_PLN))
215 err = sysfs_add_file_to_group(&sys_dev->kobj,
216 &attr_core_power_limit_count.attr,
217 thermal_attr_group.name);
218 if (cpu_has(c, X86_FEATURE_PTS))
219 err = sysfs_add_file_to_group(&sys_dev->kobj,
220 &attr_package_throttle_count.attr,
221 thermal_attr_group.name);
222 if (cpu_has(c, X86_FEATURE_PLN))
223 err = sysfs_add_file_to_group(&sys_dev->kobj,
224 &attr_package_power_limit_count.attr,
225 thermal_attr_group.name);
226
227 return err;
154} 228}
155 229
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 230static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 231{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 232 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 233}
160 234
161/* Mutex protecting device creation against CPU hotplug: */ 235/* Mutex protecting device creation against CPU hotplug: */
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
226 300
227#endif /* CONFIG_SYSFS */ 301#endif /* CONFIG_SYSFS */
228 302
303/*
 304 * Use the two most significant bits to tell the MCE log which thermal
 305 * event type this is.
 306 * This is a temporary solution; it may change once the MCE log
 307 * infrastructure is extended.
308 */
309#define CORE_THROTTLED (0)
310#define CORE_POWER_LIMIT ((__u64)1 << 62)
311#define PACKAGE_THROTTLED ((__u64)2 << 62)
312#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
313
229/* Thermal transition interrupt handler */ 314/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 315static void intel_thermal_interrupt(void)
231{ 316{
232 __u64 msr_val; 317 __u64 msr_val;
318 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 319
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 320 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 321
236 mce_log_therm_throt_event(msr_val); 322 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
323 THERMAL_THROTTLING_EVENT,
324 CORE_LEVEL) != 0)
325 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
326
327 if (cpu_has(c, X86_FEATURE_PLN))
328 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
329 POWER_LIMIT_EVENT,
330 CORE_LEVEL) != 0)
331 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
332
333 if (cpu_has(c, X86_FEATURE_PTS)) {
334 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
335 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
336 THERMAL_THROTTLING_EVENT,
337 PACKAGE_LEVEL) != 0)
338 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
339 if (cpu_has(c, X86_FEATURE_PLN))
340 if (therm_throt_process(msr_val &
341 PACKAGE_THERM_STATUS_POWER_LIMIT,
342 POWER_LIMIT_EVENT,
343 PACKAGE_LEVEL) != 0)
344 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
345 | msr_val);
346 }
237} 347}
238 348
239static void unexpected_thermal_interrupt(void) 349static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 445 apic_write(APIC_LVTTHMR, h);
336 446
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 447 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 448 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 449 wrmsr(MSR_IA32_THERM_INTERRUPT,
450 l | (THERM_INT_LOW_ENABLE
451 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
452 else
453 wrmsr(MSR_IA32_THERM_INTERRUPT,
454 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
455
456 if (cpu_has(c, X86_FEATURE_PTS)) {
457 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
458 if (cpu_has(c, X86_FEATURE_PLN))
459 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
460 l | (PACKAGE_THERM_INT_LOW_ENABLE
461 | PACKAGE_THERM_INT_HIGH_ENABLE
462 | PACKAGE_THERM_INT_PLN_ENABLE), h);
463 else
464 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
465 l | (PACKAGE_THERM_INT_LOW_ENABLE
466 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
467 }
340 468
341 smp_thermal_vector = intel_thermal_interrupt; 469 smp_thermal_vector = intel_thermal_interrupt;
342 470
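For illustration: a minimal sketch of the state selection therm_throt_process() now performs, one _thermal_state per (event, level) pair, assuming the definitions added above are in scope. The helper name is hypothetical.

static struct _thermal_state *example_select_state(struct thermal_state *p,
						   int event, int level)
{
	if (event != THERMAL_THROTTLING_EVENT && event != POWER_LIMIT_EVENT)
		return NULL;

	if (level == CORE_LEVEL)
		return event == THERMAL_THROTTLING_EVENT ?
			&p->core_throttle : &p->core_power_limit;
	if (level == PACKAGE_LEVEL)
		return event == THERMAL_THROTTLING_EVENT ?
			&p->package_throttle : &p->package_power_limit;

	return NULL;
}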
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
18#include <asm/mshyperv.h> 18#include <asm/mshyperv.h>
19 19
20struct ms_hyperv_info ms_hyperv; 20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
21 22
22static bool __init ms_hyperv_platform(void) 23static bool __init ms_hyperv_platform(void)
23{ 24{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
 219 until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 decrementing the count, we reset data.count and set the data.gate flag
 220 Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
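For illustration: a minimal sketch of the rendezvous kick-off the hunk switches to; instead of an IPI broadcast, the initiating CPU (with preemption already disabled, as set_mtrr() now does) queues the handler on every other online CPU through the stop_machine infrastructure. Names are hypothetical.

#include <linux/stop_machine.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(struct cpu_stop_work, example_work);

/* caller must have preemption disabled so smp_processor_id() is stable */
static void example_kick_other_cpus(cpu_stop_fn_t fn, void *data)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu == smp_processor_id())
			continue;

		stop_one_cpu_nowait(cpu, fn, data,
				    &per_cpu(example_work, cpu));
	}
}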
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..f2da20fda02d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -220,6 +220,7 @@ struct x86_pmu {
220 struct perf_event *event); 220 struct perf_event *event);
221 struct event_constraint *event_constraints; 221 struct event_constraint *event_constraints;
222 void (*quirks)(void); 222 void (*quirks)(void);
223 int perfctr_second_write;
223 224
224 int (*cpu_prepare)(int cpu); 225 int (*cpu_prepare)(int cpu);
225 void (*cpu_starting)(int cpu); 226 void (*cpu_starting)(int cpu);
@@ -295,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
295 * count to the generic event atomically: 296 * count to the generic event atomically:
296 */ 297 */
297again: 298again:
298 prev_raw_count = atomic64_read(&hwc->prev_count); 299 prev_raw_count = local64_read(&hwc->prev_count);
299 rdmsrl(hwc->event_base + idx, new_raw_count); 300 rdmsrl(hwc->event_base + idx, new_raw_count);
300 301
301 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 302 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
302 new_raw_count) != prev_raw_count) 303 new_raw_count) != prev_raw_count)
303 goto again; 304 goto again;
304 305
@@ -313,8 +314,8 @@ again:
313 delta = (new_raw_count << shift) - (prev_raw_count << shift); 314 delta = (new_raw_count << shift) - (prev_raw_count << shift);
314 delta >>= shift; 315 delta >>= shift;
315 316
316 atomic64_add(delta, &event->count); 317 local64_add(delta, &event->count);
317 atomic64_sub(delta, &hwc->period_left); 318 local64_sub(delta, &hwc->period_left);
318 319
319 return new_raw_count; 320 return new_raw_count;
320} 321}
@@ -438,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
438 if (!hwc->sample_period) { 439 if (!hwc->sample_period) {
439 hwc->sample_period = x86_pmu.max_period; 440 hwc->sample_period = x86_pmu.max_period;
440 hwc->last_period = hwc->sample_period; 441 hwc->last_period = hwc->sample_period;
441 atomic64_set(&hwc->period_left, hwc->sample_period); 442 local64_set(&hwc->period_left, hwc->sample_period);
442 } else { 443 } else {
443 /* 444 /*
444 * If we have a PMU initialized but no APIC 445 * If we have a PMU initialized but no APIC
@@ -885,7 +886,7 @@ static int
885x86_perf_event_set_period(struct perf_event *event) 886x86_perf_event_set_period(struct perf_event *event)
886{ 887{
887 struct hw_perf_event *hwc = &event->hw; 888 struct hw_perf_event *hwc = &event->hw;
888 s64 left = atomic64_read(&hwc->period_left); 889 s64 left = local64_read(&hwc->period_left);
889 s64 period = hwc->sample_period; 890 s64 period = hwc->sample_period;
890 int ret = 0, idx = hwc->idx; 891 int ret = 0, idx = hwc->idx;
891 892
@@ -897,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
897 */ 898 */
898 if (unlikely(left <= -period)) { 899 if (unlikely(left <= -period)) {
899 left = period; 900 left = period;
900 atomic64_set(&hwc->period_left, left); 901 local64_set(&hwc->period_left, left);
901 hwc->last_period = period; 902 hwc->last_period = period;
902 ret = 1; 903 ret = 1;
903 } 904 }
904 905
905 if (unlikely(left <= 0)) { 906 if (unlikely(left <= 0)) {
906 left += period; 907 left += period;
907 atomic64_set(&hwc->period_left, left); 908 local64_set(&hwc->period_left, left);
908 hwc->last_period = period; 909 hwc->last_period = period;
909 ret = 1; 910 ret = 1;
910 } 911 }
@@ -923,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
923 * The hw event starts counting from this event offset, 924 * The hw event starts counting from this event offset,
924 * mark it to be able to extra future deltas: 925 * mark it to be able to extra future deltas:
925 */ 926 */
926 atomic64_set(&hwc->prev_count, (u64)-left); 927 local64_set(&hwc->prev_count, (u64)-left);
927 928
928 wrmsrl(hwc->event_base + idx, 929 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
930
931 /*
932 * Due to an erratum on certain CPUs we need

933 * a second write to be sure the register
934 * is updated properly
935 */
936 if (x86_pmu.perfctr_second_write) {
937 wrmsrl(hwc->event_base + idx,
929 (u64)(-left) & x86_pmu.cntval_mask); 938 (u64)(-left) & x86_pmu.cntval_mask);
939 }
930 940
931 perf_event_update_userpage(event); 941 perf_event_update_userpage(event);
932 942
@@ -969,7 +979,7 @@ static int x86_pmu_enable(struct perf_event *event)
969 * skip the schedulability test here, it will be performed 979 * skip the schedulability test here, it will be performed
970 * at commit time(->commit_txn) as a whole 980 * at commit time(->commit_txn) as a whole
971 */ 981 */
972 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 982 if (cpuc->group_flag & PERF_EVENT_TXN)
973 goto out; 983 goto out;
974 984
975 ret = x86_pmu.schedule_events(cpuc, n, assign); 985 ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1106,7 @@ static void x86_pmu_disable(struct perf_event *event)
1096 * The events never got scheduled and ->cancel_txn will truncate 1106 * The events never got scheduled and ->cancel_txn will truncate
1097 * the event_list. 1107 * the event_list.
1098 */ 1108 */
1099 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 1109 if (cpuc->group_flag & PERF_EVENT_TXN)
1100 return; 1110 return;
1101 1111
1102 x86_pmu_stop(event); 1112 x86_pmu_stop(event);
@@ -1388,7 +1398,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1388{ 1398{
1389 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1399 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1390 1400
1391 cpuc->group_flag |= PERF_EVENT_TXN_STARTED; 1401 cpuc->group_flag |= PERF_EVENT_TXN;
1392 cpuc->n_txn = 0; 1402 cpuc->n_txn = 0;
1393} 1403}
1394 1404
@@ -1401,7 +1411,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1401{ 1411{
1402 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1412 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1403 1413
1404 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; 1414 cpuc->group_flag &= ~PERF_EVENT_TXN;
1405 /* 1415 /*
1406 * Truncate the collected events. 1416 * Truncate the collected events.
1407 */ 1417 */
@@ -1435,11 +1445,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1435 */ 1445 */
1436 memcpy(cpuc->assign, assign, n*sizeof(int)); 1446 memcpy(cpuc->assign, assign, n*sizeof(int));
1437 1447
1438 /* 1448 cpuc->group_flag &= ~PERF_EVENT_TXN;
1439 * Clear out the txn count so that ->cancel_txn() which gets
1440 * run after ->commit_txn() doesn't undo things.
1441 */
1442 cpuc->n_txn = 0;
1443 1449
1444 return 0; 1450 return 0;
1445} 1451}
@@ -1607,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = {
1607 .walk_stack = print_context_stack_bp, 1613 .walk_stack = print_context_stack_bp,
1608}; 1614};
1609 1615
1610#include "../dumpstack.h"
1611
1612static void 1616static void
1613perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1617perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1614{ 1618{
@@ -1730,22 +1734,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1730 return entry; 1734 return entry;
1731} 1735}
1732 1736
1733void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
1734{
1735 regs->ip = ip;
1736 /*
1737 * perf_arch_fetch_caller_regs adds another call, we need to increment
1738 * the skip level
1739 */
1740 regs->bp = rewind_frame_pointer(skip + 1);
1741 regs->cs = __KERNEL_CS;
1742 /*
1743 * We abuse bit 3 to pass exact information, see perf_misc_flags
1744 * and the comment with PERF_EFLAGS_EXACT.
1745 */
1746 regs->flags = 0;
1747}
1748
1749unsigned long perf_instruction_pointer(struct pt_regs *regs) 1737unsigned long perf_instruction_pointer(struct pt_regs *regs)
1750{ 1738{
1751 unsigned long ip; 1739 unsigned long ip;
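The atomic64_t to local64_t conversion above keeps the same algorithm in x86_perf_event_update(): re-read prev_count until a cmpxchg confirms nobody (for example an NMI handler) moved it, then sign-extend the wrapped hardware delta. A standalone sketch of that algorithm with C11 atomics; read_hw_counter() and the 48-bit counter width are assumptions for the example.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct counter {
        _Atomic uint64_t prev_count;
        _Atomic int64_t  count;
};

/* stand-in for rdpmc/rdmsr: a free-running 48-bit counter */
static uint64_t fake_hw;
static uint64_t read_hw_counter(void)
{
        fake_hw = (fake_hw + 12345) & ((1ULL << 48) - 1);
        return fake_hw;
}

static uint64_t counter_update(struct counter *c, int cntval_bits)
{
        int shift = 64 - cntval_bits;
        uint64_t prev, new;
        int64_t delta;

again:
        prev = atomic_load(&c->prev_count);
        new  = read_hw_counter();

        /* retry if prev_count changed underneath us */
        if (!atomic_compare_exchange_strong(&c->prev_count, &prev, new))
                goto again;

        /* the counter is narrower than 64 bits: shift up and back down to
         * sign-extend the wrapped delta (arithmetic right shift assumed) */
        delta = (int64_t)((new - prev) << shift) >> shift;
        atomic_fetch_add(&c->count, delta);
        return new;
}

int main(void)
{
        struct counter c = { 0, 0 };

        counter_update(&c, 48);
        counter_update(&c, 48);
        printf("accumulated: %lld\n", (long long)atomic_load(&c.count));
        return 0;
}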
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..febb12cea795 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ 21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22}; 22};
23 23
24struct p4_cache_event_bind { 24struct p4_pebs_bind {
25 unsigned int metric_pebs; 25 unsigned int metric_pebs;
26 unsigned int metric_vert; 26 unsigned int metric_vert;
27}; 27};
28 28
29#define P4_GEN_CACHE_EVENT_BIND(name) \ 29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30 [P4_CACHE__##name] = { \ 30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 .metric_pebs = P4_PEBS__##name, \ 31 [P4_PEBS_METRIC__##name] = { \
32 .metric_vert = P4_VERT__##name, \ 32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
33 } 34 }
34 35
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = { 36/*
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), 37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), 38 *
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), 39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), 40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
40}; 54};
41 55
42/* 56/*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
281 }, 295 },
282}; 296};
283 297
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ 298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \ 299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \ 300 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \ 301 p4_config_pack_cccr(metric | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) 302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289 303
290static __initconst const u64 p4_hw_cache_event_ids 304static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
296 [ C(OP_READ) ] = { 310 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0, 311 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired), 313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
300 }, 314 },
301 }, 315 },
302 [ C(LL ) ] = { 316 [ C(LL ) ] = {
303 [ C(OP_READ) ] = { 317 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0, 318 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired), 320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
307 }, 321 },
308}, 322},
309 [ C(DTLB) ] = { 323 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = { 324 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0, 325 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired), 327 P4_PEBS_METRIC__dtlb_load_miss_retired),
314 }, 328 },
315 [ C(OP_WRITE) ] = { 329 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0, 330 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired), 332 P4_PEBS_METRIC__dtlb_store_miss_retired),
319 }, 333 },
320 }, 334 },
321 [ C(ITLB) ] = { 335 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = { 336 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, 337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit), 338 P4_PEBS_METRIC__none),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, 339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss), 340 P4_PEBS_METRIC__none),
327 }, 341 },
328 [ C(OP_WRITE) ] = { 342 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1, 343 [ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
414 return config; 428 return config;
415} 429}
416 430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435 /* user data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
417static int p4_hw_config(struct perf_event *event) 458static int p4_hw_config(struct perf_event *event)
418{ 459{
419 int cpu = get_cpu(); 460 int cpu = get_cpu();
420 int rc = 0; 461 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr; 462 u32 escr, cccr;
423 463
424 /* 464 /*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
438 478
439 if (event->attr.type == PERF_TYPE_RAW) { 479 if (event->attr.type == PERF_TYPE_RAW) {
440 480
441 /* user data may have out-of-bound event index */ 481 rc = p4_validate_raw_event(event);
442 evnt = p4_config_unpack_event(event->attr.config); 482 if (rc)
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out; 483 goto out;
446 }
447 484
448 /* 485 /*
449 * We don't control raw events so it's up to the caller 486 * We don't control raw events so it's up to the caller
@@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event)
451 * on HT machine but allow HT-compatible specifics to be 488 * on HT machine but allow HT-compatible specifics to be
452 * passed on) 489 * passed on)
453 * 490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc)
493 *
454 * XXX: HT wide things should check perf_paranoid_cpu() && 494 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN 495 * CAP_SYS_ADMIN
456 */ 496 */
457 event->hw.config |= event->attr.config & 497 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT)); 499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
460 } 500 }
461 501
462 rc = x86_setup_perfctr(event); 502 rc = x86_setup_perfctr(event);
@@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
482 return overflow; 522 return overflow;
483} 523}
484 524
525static void p4_pmu_disable_pebs(void)
526{
527 /*
528 * FIXME
529 *
530 * It's still allowed that two threads set up the same cache
531 * events, so we can't simply clear the metrics until we know
532 * no one is depending on us; we would need some kind of counter
533 * for "ReplayEvent" users.
534 *
535 * RAW events are more complex: if the user (for some
536 * reason) passes a cache event metric with an improper
537 * event opcode, it's fine from the hardware's point of view
538 * but makes no sense as far as the meaning of such an action goes.
539 *
540 * So for the moment leave the metrics turned on forever -- it's
541 * ok for now but needs to be revisited!
542 *
543 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
544 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
545 */
546}
547
485static inline void p4_pmu_disable_event(struct perf_event *event) 548static inline void p4_pmu_disable_event(struct perf_event *event)
486{ 549{
487 struct hw_perf_event *hwc = &event->hw; 550 struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void)
507 continue; 570 continue;
508 p4_pmu_disable_event(event); 571 p4_pmu_disable_event(event);
509 } 572 }
573
574 p4_pmu_disable_pebs();
575}
576
577/* configuration must be valid */
578static void p4_pmu_enable_pebs(u64 config)
579{
580 struct p4_pebs_bind *bind;
581 unsigned int idx;
582
583 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
584
585 idx = p4_config_unpack_metric(config);
586 if (idx == P4_PEBS_METRIC__none)
587 return;
588
589 bind = &p4_pebs_bind_map[idx];
590
591 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
592 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
510} 593}
511 594
512static void p4_pmu_enable_event(struct perf_event *event) 595static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
515 int thread = p4_ht_config_thread(hwc->config); 598 int thread = p4_ht_config_thread(hwc->config);
516 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); 599 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
517 unsigned int idx = p4_config_unpack_event(hwc->config); 600 unsigned int idx = p4_config_unpack_event(hwc->config);
518 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
519 struct p4_event_bind *bind; 601 struct p4_event_bind *bind;
520 struct p4_cache_event_bind *bind_cache;
521 u64 escr_addr, cccr; 602 u64 escr_addr, cccr;
522 603
523 bind = &p4_event_bind_map[idx]; 604 bind = &p4_event_bind_map[idx];
@@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
537 cccr = p4_config_unpack_cccr(hwc->config); 618 cccr = p4_config_unpack_cccr(hwc->config);
538 619
539 /* 620 /*
540 * it could be Cache event so that we need to 621 * it could be Cache event so we need to write metrics
541 * set metrics into additional MSRs 622 * into additional MSRs
542 */ 623 */
543 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); 624 p4_pmu_enable_pebs(hwc->config);
544 if (idx_cache > P4_CACHE__NONE &&
545 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
546 bind_cache = &p4_cache_event_bind_map[idx_cache];
547 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
548 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
549 }
550 625
551 (void)checking_wrmsrl(escr_addr, escr_conf); 626 (void)checking_wrmsrl(escr_addr, escr_conf);
552 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 627 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -581,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
581 cpuc = &__get_cpu_var(cpu_hw_events); 656 cpuc = &__get_cpu_var(cpu_hw_events);
582 657
583 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 658 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
659 int overflow;
584 660
585 if (!test_bit(idx, cpuc->active_mask)) 661 if (!test_bit(idx, cpuc->active_mask))
586 continue; 662 continue;
@@ -591,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
591 WARN_ON_ONCE(hwc->idx != idx); 667 WARN_ON_ONCE(hwc->idx != idx);
592 668
593 /* it might be unflagged overflow */ 669 /* it might be unflagged overflow */
594 handled = p4_pmu_clear_cccr_ovf(hwc); 670 overflow = p4_pmu_clear_cccr_ovf(hwc);
595 671
596 val = x86_perf_event_update(event); 672 val = x86_perf_event_update(event);
597 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) 673 if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
598 continue; 674 continue;
599 675
676 handled += overflow;
677
600 /* event overflow for sure */ 678 /* event overflow for sure */
601 data.period = event->hw.last_period; 679 data.period = event->hw.last_period;
602 680
@@ -612,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
612 inc_irq_stat(apic_perf_irqs); 690 inc_irq_stat(apic_perf_irqs);
613 } 691 }
614 692
615 return handled; 693 return handled > 0;
616} 694}
617 695
618/* 696/*
@@ -829,6 +907,15 @@ static __initconst const struct x86_pmu p4_pmu = {
829 .max_period = (1ULL << 39) - 1, 907 .max_period = (1ULL << 39) - 1,
830 .hw_config = p4_hw_config, 908 .hw_config = p4_hw_config,
831 .schedule_events = p4_pmu_schedule_events, 909 .schedule_events = p4_pmu_schedule_events,
910 /*
911 * This handles erratum N15 in Intel doc 249199-029:
912 * the counter may not be updated correctly on a write,
913 * so we need a second write operation to do the trick
914 * (the official workaround didn't work).
915 *
916 * The idea is taken from the OProfile code.
917 */
918 .perfctr_second_write = 1,
832}; 919};
833 920
834static __init int p4_pmu_init(void) 921static __init int p4_pmu_init(void)
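p4_validate_raw_event() above pulls the bounds checking of user-supplied raw configs out of p4_hw_config(): each packed field is unpacked and rejected if it indexes past the table it addresses. A small sketch of the same pattern with an invented field layout; the masks, shifts and table sizes below are assumptions, not the real P4 encoding.

#include <stdint.h>
#include <stdio.h>

#define EVENT_MASK   0x00ffULL          /* assumed layout: bits 0-7  */
#define METRIC_SHIFT 8
#define METRIC_MASK  0xff00ULL          /* assumed layout: bits 8-15 */

#define NUM_EVENTS   45
#define NUM_METRICS  10

static int validate_raw_config(uint64_t config)
{
        unsigned int v;

        v = config & EVENT_MASK;
        if (v >= NUM_EVENTS) {
                fprintf(stderr, "unknown event code: %u\n", v);
                return -1;
        }

        v = (config & METRIC_MASK) >> METRIC_SHIFT;
        if (v >= NUM_METRICS) {
                fprintf(stderr, "unknown metric code: %u\n", v);
                return -1;
        }
        return 0;
}

int main(void)
{
        printf("%d\n", validate_raw_config(0x0305));   /* valid config    */
        printf("%d\n", validate_raw_config(0xff05));   /* metric too big  */
        return 0;
}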
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..34b4dad6f0b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
1/*
2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
35 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
36 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
37 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
38 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
39 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
40 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
41 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
42 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
43 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
44 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
45 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
46 { 0, 0, 0, 0, 0 }
47 };
48
49 for (cb = cpuid_bits; cb->feature; cb++) {
50
51 /* Verify that the level is valid */
52 max_level = cpuid_eax(cb->level & 0xffff0000);
53 if (max_level < cb->level ||
54 max_level > (cb->level | 0xffff))
55 continue;
56
57 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
58 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
59
60 if (regs[cb->reg] & (1 << cb->bit))
61 set_cpu_cap(c, cb->feature);
62 }
63}
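The new scattered.c gains a sub_leaf field so features such as XSAVEOPT (leaf 0xd, sub-leaf 1, EAX bit 0) can be probed with cpuid_count(). The same probe can be done from user space; this sketch assumes a toolchain whose <cpuid.h> provides __get_cpuid_count (recent GCC or clang).

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int max_level = __get_cpuid_max(0, NULL);

        /* verify the leaf is valid before poking at a sub-leaf */
        if (max_level < 0xd)
                return 1;

        __get_cpuid_count(0xd, 1, &eax, &ebx, &ecx, &edx);
        printf("XSAVEOPT: %s\n", (eax & 1) ? "yes" : "no");
        return 0;
}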
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
1/* 1/*
2 * Routines to indentify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
41 { 0, 0, 0, 0 }
42 };
43
44 for (cb = cpuid_bits; cb->feature; cb++) {
45
46 /* Verify that the level is valid */
47 max_level = cpuid_eax(cb->level & 0xffff0000);
48 if (max_level < cb->level ||
49 max_level > (cb->level | 0xffff))
50 continue;
51
52 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
53 &regs[CR_ECX], &regs[CR_EDX]);
54
55 if (regs[cb->reg] & (1 << cb->bit))
56 set_cpu_cap(c, cb->feature);
57 }
58}
59
60/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
61#define SMT_LEVEL 0 13#define SMT_LEVEL 0
62 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
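The vmware_get_tsc_khz() hunk above now presets loops_per_jiffy from the hypervisor-reported frequency. Since the value is in kHz at that point (the printk above divides by 1000 to print MHz), lpj works out to khz * 1000 / HZ. A quick arithmetic check; HZ and the frequency below are example values.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t tsc_khz = 2800000;            /* 2.8 GHz reported in kHz */
        unsigned int hz = 250;                 /* assumed CONFIG_HZ */
        uint64_t lpj = tsc_khz * 1000 / hz;    /* cycles per jiffy */

        printf("preset_lpj = %llu\n", (unsigned long long)lpj);
        return 0;
}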
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
31int in_crash_kexec;
32
31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
32 34
33static void kdump_nmi_callback(int cpu, struct die_args *args) 35static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
61 63
62static void kdump_nmi_shootdown_cpus(void) 64static void kdump_nmi_shootdown_cpus(void)
63{ 65{
66 in_crash_kexec = 1;
64 nmi_shootdown_cpus(kdump_nmi_callback); 67 nmi_shootdown_cpus(kdump_nmi_callback);
65 68
66 disable_local_APIC(); 69 disable_local_APIC();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
20 20
21#include "dumpstack.h"
22 21
23int panic_on_unrecovered_nmi; 22int panic_on_unrecovered_nmi;
24int panic_on_io_nmi; 23int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17#include <linux/uaccess.h>
18
19extern void
20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp, char *log_lvl);
22
23extern void
24show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *sp, unsigned long bp, char *log_lvl);
26
27extern unsigned int code_bytes;
28
29/* The form of the top of the frame on the stack */
30struct stack_frame {
31 struct stack_frame *next_frame;
32 unsigned long return_address;
33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
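The deleted dumpstack.h above carried rewind_frame_pointer(), a frame-pointer walk whose declarations now live in asm/stacktrace.h. A hedged user-space sketch of the same walk using __builtin_frame_address(); it assumes the binary keeps frame pointers (-fno-omit-frame-pointer) and omits the probe_kernel_address() safety check the kernel version uses.

#include <stdio.h>

struct stack_frame {
        struct stack_frame *next_frame;      /* saved caller frame pointer */
        unsigned long return_address;
};

static unsigned long rewind_frame_pointer(int n)
{
        struct stack_frame *frame = __builtin_frame_address(0);

        while (n-- && frame)
                frame = frame->next_frame;   /* follow the saved-bp chain */

        return (unsigned long)frame;
}

int main(void)
{
        printf("bp after skipping one frame: %#lx\n", rewind_frame_pointer(1));
        return 0;
}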
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20
21 19
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 21 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20 19
21#define N_EXCEPTION_STACKS_END \ 20#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) 21 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
611 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
612 * a base address that matches for the difference. 612 * a base address that matches for the difference.
613 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
614 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
615 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
616 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
617 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
618 PER_CPU(gdt_page, %ebx)
619 shr $16, %edx 619 shr $16, %edx
620 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
624 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
791 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
792 */ 792 */
793 /* fixup the stack */ 793 /* fixup the stack */
794 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
797 shl $16, %eax 796 shl $16, %eax
798 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
799 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error)
914 .balign 4 913 .balign 4
915 .long 661b 914 .long 661b
916 .long 663f 915 .long 663f
917 .byte X86_FEATURE_XMM 916 .word X86_FEATURE_XMM
918 .byte 662b-661b 917 .byte 662b-661b
919 .byte 664f-663f 918 .byte 664f-663f
920.previous 919.previous
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1166.previous 1165.previous
1167ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1168 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1169#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1170 1172
1171#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ffa..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1186 * 1186 *
1187 * C extern interface: 1187 * C extern interface:
1188 * extern long execve(char *name, char **argv, char **envp) 1188 * extern long execve(const char *name, char **argv, char **envp)
1189 * 1189 *
1190 * asm input arguments: 1190 * asm input arguments:
1191 * rdi: name, rsi: argv, rdx: envp 1191 * rdi: name, rsi: argv, rdx: envp
1192 * 1192 *
1193 * We want to fallback into: 1193 * We want to fallback into:
1194 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) 1194 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1195 * 1195 *
1196 * do_sys_execve asm fallback arguments: 1196 * do_sys_execve asm fallback arguments:
1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack 1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..784360c0625c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
20 20
21static void __init i386_default_early_setup(void) 21static void __init i386_default_early_setup(void)
22{ 22{
23 /* Initilize 32bit specific setup functions */ 23 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms; 24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 25 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..ff4c453e13f3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
131 movsl 131 movsl
1321: 1321:
133 133
134#ifdef CONFIG_OLPC_OPENFIRMWARE
135 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd)
138#endif
139
134#ifdef CONFIG_PARAVIRT 140#ifdef CONFIG_PARAVIRT
135 /* This can only trip for a broken bootloader... */ 141 /* This can only trip for a broken bootloader... */
136 cmpw $0x207, pa(boot_params + BP_version) 142 cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
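The head_64.S change above replaces the movq/shrq sequence with two 32-bit loads because wrmsr takes its 64-bit payload as the EDX:EAX pair. The same split expressed in C; the sample value is made up.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t initial_gs = 0xffff88001de00000ULL;          /* example value */
        unsigned int eax = (unsigned int)initial_gs;          /* low 32 bits  */
        unsigned int edx = (unsigned int)(initial_gs >> 32);  /* high 32 bits */

        printf("eax=%#x edx=%#x\n", eax, edx);
        return 0;
}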
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..351f9c0fea1f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -583,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
583 * scaled math multiplication factor for nanosecond to hpet tick 582 * scaled math multiplication factor for nanosecond to hpet tick
584 * conversion. 583 * conversion.
585 */ 584 */
586 hpet_freq = 1000000000000000ULL; 585 hpet_freq = FSEC_PER_SEC;
587 do_div(hpet_freq, hpet_period); 586 do_div(hpet_freq, hpet_period);
588 evt->mult = div_sc((unsigned long) hpet_freq, 587 evt->mult = div_sc((unsigned long) hpet_freq,
589 NSEC_PER_SEC, evt->shift); 588 NSEC_PER_SEC, evt->shift);
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = {
787 .rating = 250, 786 .rating = 250,
788 .read = read_hpet, 787 .read = read_hpet,
789 .mask = HPET_MASK, 788 .mask = HPET_MASK,
790 .shift = HPET_SHIFT,
791 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 789 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
792 .resume = hpet_resume_counter, 790 .resume = hpet_resume_counter,
793#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = {
798static int hpet_clocksource_register(void) 796static int hpet_clocksource_register(void)
799{ 797{
800 u64 start, now; 798 u64 start, now;
799 u64 hpet_freq;
801 cycle_t t1; 800 cycle_t t1;
802 801
803 /* Start the counter */ 802 /* Start the counter */
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void)
832 * mult = (hpet_period * 2^shift)/10^6 831 * mult = (hpet_period * 2^shift)/10^6
833 * mult = (hpet_period << shift)/FSEC_PER_NSEC 832 * mult = (hpet_period << shift)/FSEC_PER_NSEC
834 */ 833 */
835 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
836 834
837 clocksource_register(&clocksource_hpet); 835 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
836 *
837 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
838 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
839 */
840 hpet_freq = FSEC_PER_SEC;
841 do_div(hpet_freq, hpet_period);
842 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
838 843
839 return 0; 844 return 0;
840} 845}
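hpet_clocksource_register() above switches from a hand-computed mult/shift to clocksource_register_hz(), deriving the frequency from the period exactly as the new comment says: cyc/sec = FSEC_PER_SEC / hpet_period. A quick check with the common 14.318 MHz HPET period; the period below is an example value.

#include <stdint.h>
#include <stdio.h>

#define FSEC_PER_SEC 1000000000000000ULL

int main(void)
{
        uint64_t hpet_period = 69841279;       /* fsec per cycle */
        uint64_t hpet_freq = FSEC_PER_SEC / hpet_period;

        printf("HPET frequency: %llu Hz\n", (unsigned long long)hpet_freq);
        return 0;
}

Letting the clocksource core pick mult/shift from the frequency is what allows HPET_SHIFT and the manual div_sc() call to go away.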
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..a474ec37c32f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
208{ 208{
209 /* Len */ 209 /* Len */
210 switch (x86_len) { 210 switch (x86_len) {
211 case X86_BREAKPOINT_LEN_X:
212 *gen_len = sizeof(long);
213 break;
211 case X86_BREAKPOINT_LEN_1: 214 case X86_BREAKPOINT_LEN_1:
212 *gen_len = HW_BREAKPOINT_LEN_1; 215 *gen_len = HW_BREAKPOINT_LEN_1;
213 break; 216 break;
@@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
251 254
252 info->address = bp->attr.bp_addr; 255 info->address = bp->attr.bp_addr;
253 256
257 /* Type */
258 switch (bp->attr.bp_type) {
259 case HW_BREAKPOINT_W:
260 info->type = X86_BREAKPOINT_WRITE;
261 break;
262 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
263 info->type = X86_BREAKPOINT_RW;
264 break;
265 case HW_BREAKPOINT_X:
266 info->type = X86_BREAKPOINT_EXECUTE;
267 /*
268 * x86 inst breakpoints need to have a specific undefined len.
269 * But we still need to check that userspace is not trying to set up
270 * an unsupported length, to get a range breakpoint for example.
271 */
272 if (bp->attr.bp_len == sizeof(long)) {
273 info->len = X86_BREAKPOINT_LEN_X;
274 return 0;
275 }
276 default:
277 return -EINVAL;
278 }
279
254 /* Len */ 280 /* Len */
255 switch (bp->attr.bp_len) { 281 switch (bp->attr.bp_len) {
256 case HW_BREAKPOINT_LEN_1: 282 case HW_BREAKPOINT_LEN_1:
@@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp)
271 return -EINVAL; 297 return -EINVAL;
272 } 298 }
273 299
274 /* Type */
275 switch (bp->attr.bp_type) {
276 case HW_BREAKPOINT_W:
277 info->type = X86_BREAKPOINT_WRITE;
278 break;
279 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
280 info->type = X86_BREAKPOINT_RW;
281 break;
282 case HW_BREAKPOINT_X:
283 info->type = X86_BREAKPOINT_EXECUTE;
284 break;
285 default:
286 return -EINVAL;
287 }
288
289 return 0; 300 return 0;
290} 301}
291/* 302/*
@@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
305 ret = -EINVAL; 316 ret = -EINVAL;
306 317
307 switch (info->len) { 318 switch (info->len) {
319 case X86_BREAKPOINT_LEN_X:
320 align = sizeof(long) -1;
321 break;
308 case X86_BREAKPOINT_LEN_1: 322 case X86_BREAKPOINT_LEN_1:
309 align = 0; 323 align = 0;
310 break; 324 break;
@@ -466,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
466 480
467 perf_bp_event(bp, args->regs); 481 perf_bp_event(bp, args->regs);
468 482
483 /*
484 * Set up the resume flag to avoid breakpoint recursion when
485 * returning to the origin.
486 */
487 if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
488 args->regs->flags |= X86_EFLAGS_RF;
489
469 rcu_read_unlock(); 490 rcu_read_unlock();
470 } 491 }
471 /* 492 /*
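arch_build_bp_info() above now accepts execute breakpoints only when bp_len == sizeof(long) (mapped to X86_BREAKPOINT_LEN_X). A user-space usage sketch with perf_event_open(); error handling is minimal, and it assumes a kernel with hardware breakpoint support in perf.

#define _GNU_SOURCE
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static void target(void) { }

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type           = PERF_TYPE_BREAKPOINT;
        attr.size           = sizeof(attr);
        attr.bp_type        = HW_BREAKPOINT_X;
        attr.bp_addr        = (unsigned long)target;
        attr.bp_len         = sizeof(long);    /* required for execute bps */
        attr.exclude_kernel = 1;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        target();                              /* would count one hit */
        close(fd);
        return 0;
}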
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..1f11f5ce668f 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
59 stts(); 59 stts();
60} 60}
61 61
62void __cpuinit init_thread_xstate(void) 62static void __cpuinit init_thread_xstate(void)
63{ 63{
64 /*
65 * Note that xstate_size might be overwriten later during
66 * xsave_init().
67 */
68
64 if (!HAVE_HWFP) { 69 if (!HAVE_HWFP) {
65 xstate_size = sizeof(struct i387_soft_struct); 70 xstate_size = sizeof(struct i387_soft_struct);
66 return; 71 return;
67 } 72 }
68 73
69 if (cpu_has_xsave) {
70 xsave_cntxt_init();
71 return;
72 }
73
74 if (cpu_has_fxsr) 74 if (cpu_has_fxsr)
75 xstate_size = sizeof(struct i387_fxsave_struct); 75 xstate_size = sizeof(struct i387_fxsave_struct);
76#ifdef CONFIG_X86_32 76#ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
84 * Called at bootup to set up the initial FPU state that is later cloned 84 * Called at bootup to set up the initial FPU state that is later cloned
85 * into all processes. 85 * into all processes.
86 */ 86 */
87
87void __cpuinit fpu_init(void) 88void __cpuinit fpu_init(void)
88{ 89{
89 unsigned long oldcr0 = read_cr0(); 90 unsigned long oldcr0 = read_cr0();
@@ -93,21 +94,26 @@ void __cpuinit fpu_init(void)
93 94
94 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 95 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
95 96
96 /*
97 * Boot processor to setup the FP and extended state context info.
98 */
99 if (!smp_processor_id()) 97 if (!smp_processor_id())
100 init_thread_xstate(); 98 init_thread_xstate();
101 xsave_init();
102 99
103 mxcsr_feature_mask_init(); 100 mxcsr_feature_mask_init();
104 /* clean state in init */ 101 /* clean state in init */
105 current_thread_info()->status = 0; 102 current_thread_info()->status = 0;
106 clear_used_math(); 103 clear_used_math();
107} 104}
108#endif /* CONFIG_X86_64 */
109 105
110static void fpu_finit(struct fpu *fpu) 106#else /* CONFIG_X86_64 */
107
108void __cpuinit fpu_init(void)
109{
110 if (!smp_processor_id())
111 init_thread_xstate();
112}
113
114#endif /* CONFIG_X86_32 */
115
116void fpu_finit(struct fpu *fpu)
111{ 117{
112#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
113 if (!HAVE_HWFP) { 119 if (!HAVE_HWFP) {
@@ -132,6 +138,7 @@ static void fpu_finit(struct fpu *fpu)
132 fp->fos = 0xffff0000u; 138 fp->fos = 0xffff0000u;
133 } 139 }
134} 140}
141EXPORT_SYMBOL_GPL(fpu_finit);
135 142
136/* 143/*
137 * The _current_ task is using the FPU for the first time 144 * The _current_ task is using the FPU for the first time
@@ -190,6 +197,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
190 if (ret) 197 if (ret)
191 return ret; 198 return ret;
192 199
200 sanitize_i387_state(target);
201
193 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 202 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
194 &target->thread.fpu.state->fxsave, 0, -1); 203 &target->thread.fpu.state->fxsave, 0, -1);
195} 204}
@@ -207,6 +216,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
207 if (ret) 216 if (ret)
208 return ret; 217 return ret;
209 218
219 sanitize_i387_state(target);
220
210 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 221 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
211 &target->thread.fpu.state->fxsave, 0, -1); 222 &target->thread.fpu.state->fxsave, 0, -1);
212 223
@@ -446,6 +457,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
446 -1); 457 -1);
447 } 458 }
448 459
460 sanitize_i387_state(target);
461
449 if (kbuf && pos == 0 && count == sizeof(env)) { 462 if (kbuf && pos == 0 && count == sizeof(env)) {
450 convert_from_fxsr(kbuf, target); 463 convert_from_fxsr(kbuf, target);
451 return 0; 464 return 0;
@@ -467,6 +480,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
467 if (ret) 480 if (ret)
468 return ret; 481 return ret;
469 482
483 sanitize_i387_state(target);
484
470 if (!HAVE_HWFP) 485 if (!HAVE_HWFP)
471 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 486 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
472 487
@@ -533,6 +548,9 @@ static int save_i387_xsave(void __user *buf)
533 struct _fpstate_ia32 __user *fx = buf; 548 struct _fpstate_ia32 __user *fx = buf;
534 int err = 0; 549 int err = 0;
535 550
551
552 sanitize_i387_state(tsk);
553
536 /* 554 /*
537 * For legacy compatible, we always set FP/SSE bits in the bit 555 * For legacy compatible, we always set FP/SSE bits in the bit
538 * vector while saving the state to the user context. 556 * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..ef10940e1af0 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51 51
52/** 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
55 * @regs: The &struct pt_regs of the current process.
56 *
57 * Convert the pt_regs in @regs into the format for registers that
58 * GDB expects, stored in @gdb_regs.
59 */
60void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
61{ 53{
62#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
63 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, si) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
64#endif 92#endif
65 gdb_regs[GDB_AX] = regs->ax; 93};
66 gdb_regs[GDB_BX] = regs->bx; 94
67 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
68 gdb_regs[GDB_DX] = regs->dx; 96{
69 gdb_regs[GDB_SI] = regs->si; 97 if (
70 gdb_regs[GDB_DI] = regs->di;
71 gdb_regs[GDB_BP] = regs->bp;
72 gdb_regs[GDB_PC] = regs->ip;
73#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
74 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
75 gdb_regs[GDB_DS] = regs->ds; 100#endif
76 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
77 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
78 gdb_regs[GDB_FS] = 0xFFFF; 103
79 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
80 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
81 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
82 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
83 } else { 108}
84 gdb_regs[GDB_SS] = __KERNEL_DS; 109
85 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
86 } 115 }
87#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
88 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
89 gdb_regs[GDB_R9] = regs->r9; 118
90 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
91 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
92 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
93 gdb_regs[GDB_R13] = regs->r13; 122
94 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
95 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
96 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
97 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
98 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
99 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
100#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
101} 140}
102 141
103/** 142/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
150 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
151} 190}
152 191
153/**
154 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
155 * @gdb_regs: A pointer to hold the registers we've received from GDB.
156 * @regs: A pointer to a &struct pt_regs to hold these values in.
157 *
158 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
159 * in @regs.
160 */
161void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
162{
163#ifndef CONFIG_X86_32
164 u32 *gdb_regs32 = (u32 *)gdb_regs;
165#endif
166 regs->ax = gdb_regs[GDB_AX];
167 regs->bx = gdb_regs[GDB_BX];
168 regs->cx = gdb_regs[GDB_CX];
169 regs->dx = gdb_regs[GDB_DX];
170 regs->si = gdb_regs[GDB_SI];
171 regs->di = gdb_regs[GDB_DI];
172 regs->bp = gdb_regs[GDB_BP];
173 regs->ip = gdb_regs[GDB_PC];
174#ifdef CONFIG_X86_32
175 regs->flags = gdb_regs[GDB_PS];
176 regs->ds = gdb_regs[GDB_DS];
177 regs->es = gdb_regs[GDB_ES];
178 regs->cs = gdb_regs[GDB_CS];
179#else
180 regs->r8 = gdb_regs[GDB_R8];
181 regs->r9 = gdb_regs[GDB_R9];
182 regs->r10 = gdb_regs[GDB_R10];
183 regs->r11 = gdb_regs[GDB_R11];
184 regs->r12 = gdb_regs[GDB_R12];
185 regs->r13 = gdb_regs[GDB_R13];
186 regs->r14 = gdb_regs[GDB_R14];
187 regs->r15 = gdb_regs[GDB_R15];
188 regs->flags = gdb_regs32[GDB_PS];
189 regs->cs = gdb_regs32[GDB_CS];
190 regs->ss = gdb_regs32[GDB_SS];
191#endif
192}
193
194static struct hw_breakpoint { 192static struct hw_breakpoint {
195 unsigned enabled; 193 unsigned enabled;
196 unsigned long addr; 194 unsigned long addr;
197 int len; 195 int len;
198 int type; 196 int type;
199 struct perf_event **pev; 197 struct perf_event **pev;
200} breakinfo[4]; 198} breakinfo[HBP_NUM];
201 199
202static unsigned long early_dr7; 200static unsigned long early_dr7;
203 201
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
205{ 203{
206 int breakno; 204 int breakno;
207 205
208 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
209 struct perf_event *bp; 207 struct perf_event *bp;
210 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
211 int val; 209 int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
292{ 290{
293 int i; 291 int i;
294 292
295 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
296 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
297 break; 295 break;
298 if (i == 4) 296 if (i == HBP_NUM)
299 return -1; 297 return -1;
300 298
301 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
313 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
314 struct perf_event *bp; 312 struct perf_event *bp;
315 313
316 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
317 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
318 continue; 316 continue;
319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
333{ 331{
334 int i; 332 int i;
335 333
336 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
337 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
338 break; 336 break;
339 if (i == 4) 337 if (i == HBP_NUM)
340 return -1; 338 return -1;
341 339
342 switch (bptype) { 340 switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
397 395
398 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
399 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
400 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
401 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
402 continue; 400 continue;
403 if (dbg_is_early) { 401 if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
458{ 456{
459 unsigned long addr; 457 unsigned long addr;
460 char *ptr; 458 char *ptr;
461 int newPC;
462 459
463 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
464 case 'c': 461 case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
469 linux_regs->ip = addr; 466 linux_regs->ip = addr;
470 case 'D': 467 case 'D':
471 case 'k': 468 case 'k':
472 newPC = linux_regs->ip;
473
474 /* clear the trace bit */ 469 /* clear the trace bit */
475 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
476 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -645,7 +640,7 @@ void kgdb_arch_late(void)
645 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
646 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
647 attr.disabled = 1; 642 attr.disabled = 1;
648 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
649 if (breakinfo[i].pev) 644 if (breakinfo[i].pev)
650 continue; 645 continue;
651 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
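The new dbg_reg_def[] table above lets kgdb copy registers generically by name, size and offset instead of hand-written per-register assignments. A reduced sketch of that table-driven access with an invented three-register frame; the struct layout and register set are assumptions for the example.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct regs { unsigned long ax, bx, ip; };

struct reg_def {
        const char *name;
        int size;
        int offset;                            /* -1 = not backed by memory */
};

static const struct reg_def reg_def[] = {
        { "ax", sizeof(unsigned long), offsetof(struct regs, ax) },
        { "bx", sizeof(unsigned long), offsetof(struct regs, bx) },
        { "ip", sizeof(unsigned long), offsetof(struct regs, ip) },
};

static const char *get_reg(int regno, void *mem, struct regs *regs)
{
        if (regno < 0 || regno >= (int)(sizeof(reg_def) / sizeof(reg_def[0])))
                return NULL;
        if (reg_def[regno].offset != -1)
                memcpy(mem, (char *)regs + reg_def[regno].offset,
                       reg_def[regno].size);
        return reg_def[regno].name;
}

int main(void)
{
        struct regs r = { .ax = 1, .bx = 2, .ip = 0xc0ffee };
        unsigned long val;
        const char *name = get_reg(2, &val, &r);

        printf("%s = %#lx\n", name, val);
        return 0;
}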
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 675879b65ce6..1bfb6cf4dd55 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
126} 126}
127 127
128/* 128/*
129 * Check for the REX prefix which can only exist on X86_64 129 * Skip the prefixes of the instruction.
130 * X86_32 always returns 0
131 */ 130 */
132static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) 131static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
133{ 132{
133 insn_attr_t attr;
134
135 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
136 while (inat_is_legacy_prefix(attr)) {
137 insn++;
138 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
139 }
134#ifdef CONFIG_X86_64 140#ifdef CONFIG_X86_64
135 if ((*insn & 0xf0) == 0x40) 141 if (inat_is_rex_prefix(attr))
136 return 1; 142 insn++;
137#endif 143#endif
138 return 0; 144 return insn;
139} 145}
140 146
141/* 147/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
272 */ 278 */
273static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) 279static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
274{ 280{
281 /* Skip prefixes */
282 insn = skip_prefixes(insn);
283
275 switch (*insn) { 284 switch (*insn) {
276 case 0xfa: /* cli */ 285 case 0xfa: /* cli */
277 case 0xfb: /* sti */ 286 case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
280 return 1; 289 return 1;
281 } 290 }
282 291
283 /*
284 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
285 * at the next byte instead.. but of course not recurse infinitely
286 */
287 if (is_REX_prefix(insn))
288 return is_IF_modifier(++insn);
289
290 return 0; 292 return 0;
291} 293}
292 294
@@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
803 unsigned long orig_ip = (unsigned long)p->addr; 805 unsigned long orig_ip = (unsigned long)p->addr;
804 kprobe_opcode_t *insn = p->ainsn.insn; 806 kprobe_opcode_t *insn = p->ainsn.insn;
805 807
806 /*skip the REX prefix*/ 808 /* Skip prefixes */
807 if (is_REX_prefix(insn)) 809 insn = skip_prefixes(insn);
808 insn++;
809 810
810 regs->flags &= ~X86_EFLAGS_TF; 811 regs->flags &= ~X86_EFLAGS_TF;
811 switch (*insn) { 812 switch (*insn) {
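Note: the kprobes change below/above folds the old REX-only check into skip_prefixes(), which walks past every legacy prefix (and, on x86-64, a REX prefix) before the opcode byte is examined; the kernel does this through its inat opcode-attribute tables. A self-contained sketch of the same idea with a hard-coded prefix list instead of the inat tables; the byte values are the architectural legacy prefixes, everything else is illustrative and the instruction bytes in main() are a constructed example.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* x86 legacy prefixes: lock, rep/repne, segment overrides, operand/address size */
static bool is_legacy_prefix(uint8_t b)
{
    switch (b) {
    case 0xf0: case 0xf2: case 0xf3:            /* lock, repne, rep */
    case 0x2e: case 0x36: case 0x3e: case 0x26: /* cs, ss, ds, es overrides */
    case 0x64: case 0x65:                       /* fs, gs overrides */
    case 0x66: case 0x67:                       /* operand/address size */
        return true;
    default:
        return false;
    }
}

/* Return a pointer to the first non-prefix byte of the instruction. */
static const uint8_t *skip_prefixes(const uint8_t *insn, bool x86_64)
{
    while (is_legacy_prefix(*insn))
        insn++;
    if (x86_64 && (*insn & 0xf0) == 0x40)       /* single REX prefix, 0x40-0x4f */
        insn++;
    return insn;
}

int main(void)
{
    const uint8_t insn[] = { 0xf0, 0x48, 0xfa };        /* prefix bytes, then cli */
    const uint8_t *op = skip_prefixes(insn, true);

    printf("opcode byte: 0x%02x (cli is 0xfa)\n", *op);
    return 0;
}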
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..d7b6f7fb4fec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 274
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 276
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
277static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
278{ 290{
279 char str[16]; 291 char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295 if (early) 307 if (early)
296 return 1; 308 return 1;
297 309
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
298 if (mpc->oemptr) 314 if (mpc->oemptr)
299 x86_init.mpparse.smp_read_mpc_oem(mpc); 315 x86_init.mpparse.smp_read_mpc_oem(mpc);
300 316
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 5915e0b33303..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
218 252
219/* MID systems don't have i8042 controller */ 253/* MID systems don't have i8042 controller */
220static int mrst_i8042_detect(void) 254static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
232 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
233 267
234 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
235 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
236 270
237 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
238 272
239 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
240 276
241 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
242 x86_platform.i8042_detect = mrst_i8042_detect; 278 x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
250 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
251 287
252} 288}
289
290/*
291 * if user does not want to use per CPU apb timer, just give it a lower rating
292 * than local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
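Note: setup_x86_mrst_timer() above is a standard early __setup() handler; it maps the text after "x86_mrst_timer=" onto an enum with plain strcmp() dispatch and rejects anything else, so booting with x86_mrst_timer=apbt_only or x86_mrst_timer=lapic_and_apbt selects one of the configurations described in the comment block earlier in this file. A user-space sketch of that mapping; the enum names mirror the patch, while the main() harness is only for illustration.

#include <stdio.h>
#include <string.h>
#include <errno.h>

enum mrst_timer_options {
    MRST_TIMER_DEFAULT,
    MRST_TIMER_APBT_ONLY,
    MRST_TIMER_LAPIC_APBT,
};

static enum mrst_timer_options mrst_timer_options;

static int setup_x86_mrst_timer(const char *arg)
{
    if (!arg)
        return -EINVAL;

    if (strcmp(arg, "apbt_only") == 0)
        mrst_timer_options = MRST_TIMER_APBT_ONLY;
    else if (strcmp(arg, "lapic_and_apbt") == 0)
        mrst_timer_options = MRST_TIMER_LAPIC_APBT;
    else {
        fprintf(stderr, "unknown timer option '%s'\n", arg);
        return -EINVAL;
    }
    return 0;
}

int main(void)
{
    setup_x86_mrst_timer("lapic_and_apbt");
    printf("selected option: %d\n", mrst_timer_options);
    return 0;
}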
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
21#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h> 22#include <asm/setup.h>
23#include <asm/olpc.h> 23#include <asm/olpc.h>
24 24#include <asm/olpc_ofw.h>
25#ifdef CONFIG_OPEN_FIRMWARE
26#include <asm/ofw.h>
27#endif
28 25
29struct olpc_platform_t olpc_platform_info; 26struct olpc_platform_t olpc_platform_info;
30EXPORT_SYMBOL_GPL(olpc_platform_info); 27EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
145 * The OBF flag will sometimes misbehave due to what we believe 142 * The OBF flag will sometimes misbehave due to what we believe
146 * is a hardware quirk.. 143 * is a hardware quirk..
147 */ 144 */
148 printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); 145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
149 outb(cmd, 0x6c); 146 outb(cmd, 0x6c);
150 147
151 if (wait_on_ibf(0x6c, 0)) { 148 if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
162 " EC accept data!\n"); 159 " EC accept data!\n");
163 goto err; 160 goto err;
164 } 161 }
165 printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", 162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
166 inbuf[i]);
167 outb(inbuf[i], 0x68); 163 outb(inbuf[i], 0x68);
168 } 164 }
169 } 165 }
@@ -176,8 +172,7 @@ restart:
176 goto restart; 172 goto restart;
177 } 173 }
178 outbuf[i] = inb(0x68); 174 outbuf[i] = inb(0x68);
179 printk(KERN_DEBUG "olpc-ec: received 0x%x\n", 175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
180 outbuf[i]);
181 } 176 }
182 } 177 }
183 178
@@ -188,14 +183,15 @@ err:
188} 183}
189EXPORT_SYMBOL_GPL(olpc_ec_cmd); 184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190 185
191#ifdef CONFIG_OPEN_FIRMWARE 186#ifdef CONFIG_OLPC_OPENFIRMWARE
192static void __init platform_detect(void) 187static void __init platform_detect(void)
193{ 188{
194 size_t propsize; 189 size_t propsize;
195 __be32 rev; 190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
196 193
197 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
198 &propsize) || propsize != 4) {
199 printk(KERN_ERR "ofw: getprop call failed!\n"); 195 printk(KERN_ERR "ofw: getprop call failed!\n");
200 rev = cpu_to_be32(0); 196 rev = cpu_to_be32(0);
201 } 197 }
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
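Note: __olpc_ofw() above marshals a call into Open Firmware's client interface: the service name, argument count, result count and the arguments themselves are packed into one flat cell array, the CIF is invoked (under a spinlock in the kernel), and results are copied back out of the tail of that array. A user-space sketch of the same marshaling against a mock CIF; the mock only answers a made-up "add" service, and while MAXARGS and the cell layout follow the patch, the real OLPC code uses 32-bit int cells, so intptr_t is used here only to keep the sketch correct on 64-bit hosts.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MAXARGS 10

/* Mock client interface. Cell layout follows the patch:
 * cell[0] = service name, cell[1] = nr_args, cell[2] = nr_res,
 * then nr_args argument cells followed by nr_res result cells. */
static int mock_cif(intptr_t *cell)
{
    const char *name = (const char *)cell[0];
    intptr_t nr_args = cell[1];

    if (strcmp(name, "add") == 0 && nr_args == 2) {
        cell[3 + nr_args] = cell[3] + cell[4];  /* first result cell */
        return 0;
    }
    return -1;                                  /* unknown service */
}

static int ofw_call(const char *name, int nr_args, const intptr_t *args,
                    int nr_res, intptr_t *res)
{
    intptr_t cell[MAXARGS + 3];
    int i, ret;

    if (nr_args + nr_res > MAXARGS)
        return -1;

    cell[0] = (intptr_t)name;
    cell[1] = nr_args;
    cell[2] = nr_res;
    for (i = 0; i < nr_args; i++)
        cell[3 + i] = args[i];

    ret = mock_cif(cell);                       /* the kernel takes ofw_lock here */
    if (!ret)
        for (i = 0; i < nr_res; i++)
            res[i] = cell[3 + nr_args + i];
    return ret;
}

int main(void)
{
    intptr_t args[] = { 2, 3 }, sum = 0;

    if (!ofw_call("add", 2, args, 1, &sum))
        printf("add(2, 3) via mock CIF = %ld\n", (long)sum);
    return 0;
}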
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..9f07cfcbd3a5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 15#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h>
16 17
17static int forbid_dac __read_mostly; 18static int forbid_dac __read_mostly;
18 19
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
132 /* free the range so iommu could get some range less than 4G */ 133 /* free the range so iommu could get some range less than 4G */
133 dma32_free_bootmem(); 134 dma32_free_bootmem();
134 135
135 if (pci_swiotlb_detect()) 136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
136 goto out; 137 goto out;
137 138
138 gart_iommu_hole_init(); 139 gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
144 /* needs to be called after gart_iommu_hole_init */ 145 /* needs to be called after gart_iommu_hole_init */
145 amd_iommu_detect(); 146 amd_iommu_detect();
146out: 147out:
148 pci_xen_swiotlb_init();
149
147 pci_swiotlb_init(); 150 pci_swiotlb_init();
148} 151}
149 152
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
296#endif 299#endif
297 x86_init.iommu.iommu_init(); 300 x86_init.iommu.iommu_init();
298 301
299 if (swiotlb) { 302 if (swiotlb || xen_swiotlb) {
300 printk(KERN_INFO "PCI-DMA: " 303 printk(KERN_INFO "PCI-DMA: "
301 "Using software bounce buffering for IO (SWIOTLB)\n"); 304 "Using software bounce buffering for IO (SWIOTLB)\n");
302 swiotlb_print_info(); 305 swiotlb_print_info();
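Note: pci_iommu_alloc() now short-circuits with pci_xen_swiotlb_detect() || pci_swiotlb_detect(): the first detector that claims the machine wins and the hardware IOMMU probes are skipped, after which both init functions run unconditionally and each is a no-op unless its own detect fired. A small sketch of that detect-then-init chain; the function names and "claimed" flags here are illustrative, not the kernel API.

#include <stdio.h>
#include <stdbool.h>

static bool use_xen_swiotlb;
static bool use_swiotlb;

/* Each detector decides whether its bounce-buffer backend is needed
 * and remembers the decision for its init routine. */
static bool xen_swiotlb_detect(void) { use_xen_swiotlb = false; return use_xen_swiotlb; }
static bool swiotlb_detect(void)     { use_swiotlb = true;      return use_swiotlb; }

static void xen_swiotlb_init(void) { if (use_xen_swiotlb) puts("xen-swiotlb ready"); }
static void swiotlb_init(void)     { if (use_swiotlb)     puts("swiotlb ready"); }

static void hardware_iommu_detect(void) { puts("probing hardware IOMMUs"); }

int main(void)
{
    /* The first detector that claims the platform short-circuits the rest. */
    if (xen_swiotlb_detect() || swiotlb_detect())
        goto out;

    hardware_iommu_detect();
out:
    /* Init routines are unconditional; each one checks its own flag. */
    xen_swiotlb_init();
    swiotlb_init();
    return 0;
}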
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..64ecaf0af9af 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
29 29
30struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
31 32
32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
33{ 34{
@@ -300,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread);
300/* 301/*
301 * sys_execve() executes a new program. 302 * sys_execve() executes a new program.
302 */ 303 */
303long sys_execve(char __user *name, char __user * __user *argv, 304long sys_execve(const char __user *name, char __user * __user *argv,
304 char __user * __user *envp, struct pt_regs *regs) 305 char __user * __user *envp, struct pt_regs *regs)
305{ 306{
306 long error; 307 long error;
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void)
371void default_idle(void) 372void default_idle(void)
372{ 373{
373 if (hlt_use_halt()) { 374 if (hlt_use_halt()) {
374 trace_power_start(POWER_CSTATE, 1); 375 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
375 current_thread_info()->status &= ~TS_POLLING; 376 current_thread_info()->status &= ~TS_POLLING;
376 /* 377 /*
377 * TS_POLLING-cleared state must be visible before we 378 * TS_POLLING-cleared state must be visible before we
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
441 */ 442 */
442void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 443void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
443{ 444{
444 trace_power_start(POWER_CSTATE, (ax>>4)+1); 445 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
445 if (!need_resched()) { 446 if (!need_resched()) {
446 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 447 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
447 clflush((void *)&current_thread_info()->flags); 448 clflush((void *)&current_thread_info()->flags);
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
457static void mwait_idle(void) 458static void mwait_idle(void)
458{ 459{
459 if (!need_resched()) { 460 if (!need_resched()) {
460 trace_power_start(POWER_CSTATE, 1); 461 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
461 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 462 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
462 clflush((void *)&current_thread_info()->flags); 463 clflush((void *)&current_thread_info()->flags);
463 464
@@ -478,7 +479,7 @@ static void mwait_idle(void)
478 */ 479 */
479static void poll_idle(void) 480static void poll_idle(void)
480{ 481{
481 trace_power_start(POWER_CSTATE, 0); 482 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
482 local_irq_enable(); 483 local_irq_enable();
483 while (!need_resched()) 484 while (!need_resched())
484 cpu_relax(); 485 cpu_relax();
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
525 return (edx & MWAIT_EDX_C1); 526 return (edx & MWAIT_EDX_C1);
526} 527}
527 528
528/* 529bool c1e_detected;
529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 530EXPORT_SYMBOL(c1e_detected);
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
533 */
534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
535{
536 u64 val;
537 if (c->x86_vendor != X86_VENDOR_AMD)
538 goto no_c1e_idle;
539
540 /* Family 0x0f models < rev F do not have C1E */
541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
542 return 1;
543
544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
562}
563 531
564static cpumask_var_t c1e_mask; 532static cpumask_var_t c1e_mask;
565static int c1e_detected;
566 533
567void c1e_remove_cpu(int cpu) 534void c1e_remove_cpu(int cpu)
568{ 535{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
584 u32 lo, hi; 551 u32 lo, hi;
585 552
586 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 553 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
554
587 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 555 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
588 c1e_detected = 1; 556 c1e_detected = true;
589 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 557 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
590 mark_tsc_unstable("TSC halt in AMD C1E"); 558 mark_tsc_unstable("TSC halt in AMD C1E");
591 printk(KERN_INFO "System has AMD C1E enabled\n"); 559 printk(KERN_INFO "System has AMD C1E enabled\n");
592 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
593 } 560 }
594 } 561 }
595 562
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
638 */ 605 */
639 printk(KERN_INFO "using mwait in idle threads.\n"); 606 printk(KERN_INFO "using mwait in idle threads.\n");
640 pm_idle = mwait_idle; 607 pm_idle = mwait_idle;
641 } else if (check_c1e_idle(c)) { 608 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
609 /* E400: APIC timer interrupt does not wake up CPU from C1e */
642 printk(KERN_INFO "using C1E aware idle routine\n"); 610 printk(KERN_INFO "using C1E aware idle routine\n");
643 pm_idle = c1e_idle; 611 pm_idle = c1e_idle;
644 } else 612 } else
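Note: the open-coded check_c1e_idle() test above is replaced by a table-driven lookup, cpu_has_amd_erratum(amd_erratum_400): the erratum is described once as a set of affected family/model ranges and the idle-selection code just asks whether the boot CPU falls inside one. A simplified standalone sketch of such a range table; the ranges below are illustrative placeholders, and the kernel additionally consults the OSVW MSRs before falling back to family/model matching.

#include <stdio.h>
#include <stdbool.h>

struct cpu_id { unsigned family, model; };

struct erratum_range {
    unsigned family;
    unsigned model_first, model_last;
};

/* Illustrative ranges only, not the authoritative erratum #400 data. */
static const struct erratum_range erratum_400[] = {
    { 0x0f, 0x40, 0xff },   /* family 0Fh, model >= 0x40 */
    { 0x10, 0x00, 0xff },   /* all of family 10h */
};

static bool cpu_has_erratum(struct cpu_id cpu,
                            const struct erratum_range *tbl, unsigned n)
{
    unsigned i;

    for (i = 0; i < n; i++)
        if (cpu.family == tbl[i].family &&
            cpu.model >= tbl[i].model_first &&
            cpu.model <= tbl[i].model_last)
            return true;
    return false;
}

int main(void)
{
    struct cpu_id cpu = { 0x10, 0x02 };

    if (cpu_has_erratum(cpu, erratum_400,
                        sizeof(erratum_400) / sizeof(erratum_400[0])))
        puts("using C1E-aware idle routine");
    else
        puts("default idle routine");
    return 0;
}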
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 63
62/* 64/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
111 stop_critical_timings(); 113 stop_critical_timings();
112 pm_idle(); 114 pm_idle();
113 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
114 } 118 }
115 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
116 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
54asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
55 57
56DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
138 stop_critical_timings(); 140 stop_critical_timings();
139 pm_idle(); 141 pm_idle();
140 start_critical_timings(); 142 start_critical_timings();
143
144 trace_power_end(smp_processor_id());
145
141 /* In many cases the interrupt that ended idle 146 /* In many cases the interrupt that ended idle
142 has already called exit_idle. But some idle 147 has already called exit_idle. But some idle
143 loops can be woken up without interrupt. */ 148 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..b008e7883207 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
102 102
103#include <asm/paravirt.h> 103#include <asm/paravirt.h>
104#include <asm/hypervisor.h> 104#include <asm/hypervisor.h>
105#include <asm/olpc_ofw.h>
105 106
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
736 /* VMI may relocate the fixmap; do this before touching ioremap area */ 737 /* VMI may relocate the fixmap; do this before touching ioremap area */
737 vmi_init(); 738 vmi_init();
738 739
740 /* OFW also may relocate the fixmap */
741 olpc_ofw_detect();
742
739 early_trap_init(); 743 early_trap_init();
740 early_cpu_init(); 744 early_cpu_init();
741 early_ioremap_init(); 745 early_ioremap_init();
742 746
747 setup_olpc_ofw_pgd();
748
743 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 749 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
744 screen_info = boot_params.screen_info; 750 screen_info = boot_params.screen_info;
745 edid_info = boot_params.edid_info; 751 edid_info = boot_params.edid_info;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..a5e928b0cb5f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
735 goto do_rest; 735 goto do_rest;
736 } 736 }
737 737
738 if (!keventd_up() || current_is_keventd()) 738 schedule_work(&c_idle.work);
739 c_idle.work.func(&c_idle.work); 739 wait_for_completion(&c_idle.done);
740 else {
741 schedule_work(&c_idle.work);
742 wait_for_completion(&c_idle.done);
743 }
744 740
745 if (IS_ERR(c_idle.idle)) { 741 if (IS_ERR(c_idle.idle)) {
746 printk("failed fork for CPU %d\n", cpu); 742 printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +812,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 812 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 813 break; /* It has booted */
818 udelay(100); 814 udelay(100);
815 /*
816 * Allow other tasks to run while we wait for the
817 * AP to come online. This also gives a chance
818 * for the MTRR work(triggered by the AP coming online)
819 * to be completed in the stop machine context.
820 */
821 schedule();
819 } 822 }
820 823
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 824 if (cpumask_test_cpu(cpu, cpu_callin_mask))
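Note: the do_boot_cpu() change does two things: idle-thread creation now always goes through the workqueue, and the loop polling cpu_callin_mask calls schedule() on every pass so other work (such as the MTRR synchronization triggered by the new CPU) can make progress while the boot CPU waits. A user-space analogue of "poll a flag, but yield while waiting"; the thread, the flag and the timings are all illustrative.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool ap_called_in;

/* Stand-in for the application processor announcing itself. */
static void *ap_thread(void *unused)
{
    (void)unused;
    usleep(2000);                       /* pretend start-up work */
    atomic_store(&ap_called_in, true);
    return NULL;
}

int main(void)
{
    pthread_t ap;
    int polls = 0;

    pthread_create(&ap, NULL, ap_thread, NULL);

    /* Busy-poll, but give other runnable tasks a chance each time round. */
    while (!atomic_load(&ap_called_in)) {
        usleep(100);                    /* udelay(100) in the patch */
        sched_yield();                  /* schedule() in the patch */
        polls++;
    }

    printf("AP came online after %d polls\n", polls);
    pthread_join(ap, NULL);
    return 0;
}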
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
23 return 0; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void
27__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
27{ 28{
28 struct stack_trace *trace = data; 29 struct stack_trace *trace = data;
30#ifdef CONFIG_FRAME_POINTER
29 if (!reliable) 31 if (!reliable)
30 return; 32 return;
33#endif
34 if (nosched && in_sched_functions(addr))
35 return;
31 if (trace->skip > 0) { 36 if (trace->skip > 0) {
32 trace->skip--; 37 trace->skip--;
33 return; 38 return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
36 trace->entries[trace->nr_entries++] = addr; 41 trace->entries[trace->nr_entries++] = addr;
37} 42}
38 43
44static void save_stack_address(void *data, unsigned long addr, int reliable)
45{
46 return __save_stack_address(data, addr, reliable, false);
47}
48
39static void 49static void
40save_stack_address_nosched(void *data, unsigned long addr, int reliable) 50save_stack_address_nosched(void *data, unsigned long addr, int reliable)
41{ 51{
42 struct stack_trace *trace = (struct stack_trace *)data; 52 return __save_stack_address(data, addr, reliable, true);
43 if (!reliable)
44 return;
45 if (in_sched_functions(addr))
46 return;
47 if (trace->skip > 0) {
48 trace->skip--;
49 return;
50 }
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = addr;
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
96 96
97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
98 98
99struct stack_frame { 99struct stack_frame_user {
100 const void __user *next_fp; 100 const void __user *next_fp;
101 unsigned long ret_addr; 101 unsigned long ret_addr;
102}; 102};
103 103
104static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 104static int
105copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
105{ 106{
106 int ret; 107 int ret;
107 108
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
126 trace->entries[trace->nr_entries++] = regs->ip; 127 trace->entries[trace->nr_entries++] = regs->ip;
127 128
128 while (trace->nr_entries < trace->max_entries) { 129 while (trace->nr_entries < trace->max_entries) {
129 struct stack_frame frame; 130 struct stack_frame_user frame;
130 131
131 frame.next_fp = NULL; 132 frame.next_fp = NULL;
132 frame.ret_addr = 0; 133 frame.ret_addr = 0;
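Note: the stacktrace rework merges two nearly identical callbacks into one __save_stack_address() helper parameterized by a nosched flag, keeping the old entry points as thin wrappers, and renames the user-space frame struct to stack_frame_user to avoid clashing with the kernel one. The same consolidation shape in a standalone sketch; the trace structure, the addresses and the in_sched_functions() stand-in are illustrative, and the CONFIG_FRAME_POINTER conditional is omitted.

#include <stdbool.h>
#include <stdio.h>

struct stack_trace {
    unsigned nr_entries, max_entries, skip;
    unsigned long *entries;
};

/* Stand-in for in_sched_functions(): pretend scheduler code lives here. */
static bool in_sched_functions(unsigned long addr)
{
    return addr >= 0x1000 && addr < 0x2000;
}

static void __save_stack_address(void *data, unsigned long addr,
                                 bool reliable, bool nosched)
{
    struct stack_trace *trace = data;

    if (!reliable)
        return;
    if (nosched && in_sched_functions(addr))
        return;
    if (trace->skip > 0) {
        trace->skip--;
        return;
    }
    if (trace->nr_entries < trace->max_entries)
        trace->entries[trace->nr_entries++] = addr;
}

/* The two public callbacks remain, but become one-liners. */
static void save_stack_address(void *data, unsigned long addr, int reliable)
{
    __save_stack_address(data, addr, reliable, false);
}

static void save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
    __save_stack_address(data, addr, reliable, true);
}

int main(void)
{
    unsigned long buf[8];
    struct stack_trace trace = { 0, 8, 0, buf };

    save_stack_address(&trace, 0x1234, 1);          /* kept */
    save_stack_address_nosched(&trace, 0x1234, 1);  /* dropped: sched range */
    save_stack_address_nosched(&trace, 0x4000, 1);  /* kept */

    printf("%u entries recorded\n", trace.nr_entries);
    return 0;
}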
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg 339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
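Note: the three new table entries wire fanotify_init, fanotify_mark and prlimit64 into the 32-bit syscall table as numbers 338-340. Until the C library grows wrappers they can be reached through syscall(2); a sketch using prlimit64 to read the calling process's file-descriptor limit. The local struct stands in for the kernel's struct rlimit64 because older headers may not export it, and the fallback number 340 is the i386 slot added above, so other architectures will differ.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/resource.h>       /* RLIMIT_NOFILE */

#ifndef __NR_prlimit64
#define __NR_prlimit64 340      /* i386 slot wired up in the hunk above */
#endif

struct my_rlimit64 {            /* matches the kernel's struct rlimit64 layout */
    uint64_t rlim_cur;
    uint64_t rlim_max;
};

int main(void)
{
    struct my_rlimit64 old;

    /* pid 0 = calling process; NULL new limit = just read the old one */
    if (syscall(__NR_prlimit64, 0, RLIMIT_NOFILE, NULL, &old) < 0) {
        perror("prlimit64");
        return 1;
    }
    printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
           (unsigned long long)old.rlim_cur,
           (unsigned long long)old.rlim_max);
    return 0;
}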
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e2..312ef0292815 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13 14
@@ -22,19 +23,37 @@
22#include <asm/irq_vectors.h> 23#include <asm/irq_vectors.h>
23#include <asm/timer.h> 24#include <asm/timer.h>
24 25
25struct msg_desc { 26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
26 struct bau_payload_queue_entry *msg; 27static int timeout_base_ns[] = {
27 int msg_slot; 28 20,
28 int sw_ack_slot; 29 160,
29 struct bau_payload_queue_entry *va_queue_first; 30 1280,
30 struct bau_payload_queue_entry *va_queue_last; 31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
31}; 36};
32 37static int timeout_us;
33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau; 38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
38static int __init setup_nobau(char *arg) 57static int __init setup_nobau(char *arg)
39{ 58{
40 nobau = 1; 59 nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
52static DEFINE_PER_CPU(struct bau_control, bau_control); 71static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54 73
55struct reset_args {
56 int sender;
57};
58
59/* 74/*
60 * Determine the first node on a uvhub. 'Nodes' are used for kernel 75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation. 76 * memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
126 struct ptc_stats *stat; 141 struct ptc_stats *stat;
127 142
128 msg = mdp->msg; 143 msg = mdp->msg;
129 stat = &per_cpu(ptcstats, bcp->cpu); 144 stat = bcp->statp;
130 stat->d_retries++; 145 stat->d_retries++;
131 /* 146 /*
132 * cancel any message from msg+1 to the retry itself 147 * cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
146 slot2 = msg2 - mdp->va_queue_first; 161 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr 162 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) | 164 msg_res = msg2->sw_ack_vector;
150 msg2->sw_ack_vector);
151 /* 165 /*
152 * This is a message retry; clear the resources held 166 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out. 167 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected 168 * If it has not timed out we have an unexpected
155 * situation to report. 169 * situation to report.
156 */ 170 */
157 if (mmr & (msg_res << 8)) { 171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
158 /* 172 /*
159 * is the resource timed out? 173 * is the resource timed out?
160 * make everyone ignore the cancelled message. 174 * make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
164 cancel_count++; 178 cancel_count++;
165 uv_write_local_mmr( 179 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res); 181 (msg_res << UV_SW_ACK_NPENDING) |
168 } else 182 msg_res);
169 printk(KERN_INFO "note bau retry: no effect\n"); 183 }
170 } 184 }
171 } 185 }
172 if (!cancel_count) 186 if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
190 * This must be a normal message, or retry of a normal message 204 * This must be a normal message, or retry of a normal message
191 */ 205 */
192 msg = mdp->msg; 206 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu); 207 stat = bcp->statp;
194 if (msg->address == TLB_FLUSH_ALL) { 208 if (msg->address == TLB_FLUSH_ALL) {
195 local_flush_tlb(); 209 local_flush_tlb();
196 stat->d_alltlb++; 210 stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
274 288
275 bcp = &per_cpu(bau_control, smp_processor_id()); 289 bcp = &per_cpu(bau_control, smp_processor_id());
276 rap = (struct reset_args *)ptr; 290 rap = (struct reset_args *)ptr;
277 stat = &per_cpu(ptcstats, bcp->cpu); 291 stat = bcp->statp;
278 stat->d_resets++; 292 stat->d_resets++;
279 293
280 /* 294 /*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
302 */ 316 */
303 mmr = uv_read_local_mmr 317 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) | 319 msg_res = msg->sw_ack_vector;
306 msg->sw_ack_vector);
307 if (mmr & msg_res) { 320 if (mmr & msg_res) {
308 stat->d_rcanceled++; 321 stat->d_rcanceled++;
309 uv_write_local_mmr( 322 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res); 324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
312 } 326 }
313 } 327 }
314 } 328 }
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
386 unsigned long mmr_offset, int right_shift, int this_cpu, 400 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try) 401 struct bau_control *bcp, struct bau_control *smaster, long try)
388{ 402{
389 int relaxes = 0;
390 unsigned long descriptor_status; 403 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime; 404 cycles_t ttime;
394 cycles_t timeout_time; 405 struct ptc_stats *stat = bcp->statp;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster; 406 struct bau_control *hmaster;
397 407
398 hmaster = bcp->uvhub_master; 408 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
400 409
401 /* spin on the status MMR, waiting for it to go idle */ 410 /* spin on the status MMR, waiting for it to go idle */
402 while ((descriptor_status = (((unsigned long) 411 while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
423 * pending. In that case hardware returns the 432 * pending. In that case hardware returns the
424 * ERROR that looks like a destination timeout. 433 * ERROR that looks like a destination timeout.
425 */ 434 */
426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { 435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
427 bcp->conseccompletes = 0; 437 bcp->conseccompletes = 0;
428 return FLUSH_RETRY_PLUGGED; 438 return FLUSH_RETRY_PLUGGED;
429 } 439 }
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
435 * descriptor_status is still BUSY 445 * descriptor_status is still BUSY
436 */ 446 */
437 cpu_relax(); 447 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
448 mask |= (3UL < right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
455 return FLUSH_GIVEUP;
456 }
457 }
458 } 448 }
459 } 449 }
460 bcp->conseccompletes++; 450 bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
494 return 1; 484 return 1;
495} 485}
496 486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
497/** 557/**
498 * uv_flush_send_and_wait 558 * uv_flush_send_and_wait
499 * 559 *
500 * Send a broadcast and wait for it to complete. 560 * Send a broadcast and wait for it to complete.
501 * 561 *
502 * The flush_mask contains the cpus the broadcast is to be sent to, plus 562 * The flush_mask contains the cpus the broadcast is to be sent to including
503 * cpus that are on the local uvhub. 563 * cpus that are on the local uvhub.
504 * 564 *
505 * Returns NULL if all flushing represented in the mask was done. The mask 565 * Returns 0 if all flushing represented in the mask was done.
506 * is zeroed. 566 * Returns 1 if it gives up entirely and the original cpu mask is to be
507 * Returns @flush_mask if some remote flushing remains to be done. The 567 * returned to the kernel.
508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
510 */ 568 */
511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, 569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
512 struct cpumask *flush_mask, 570 struct cpumask *flush_mask, struct bau_control *bcp)
513 struct bau_control *bcp)
514{ 571{
515 int right_shift; 572 int right_shift;
516 int uvhub;
517 int bit;
518 int completion_status = 0; 573 int completion_status = 0;
519 int seq_number = 0; 574 int seq_number = 0;
520 long try = 0; 575 long try = 0;
521 int cpu = bcp->uvhub_cpu; 576 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu; 577 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
524 unsigned long mmr_offset; 578 unsigned long mmr_offset;
525 unsigned long index; 579 unsigned long index;
526 cycles_t time1; 580 cycles_t time1;
527 cycles_t time2; 581 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); 582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
529 struct bau_control *smaster = bcp->socket_master; 584 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master; 585 struct bau_control *hmaster = bcp->uvhub_master;
531 586
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count, 588 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) { 589 hmaster->max_bau_concurrent)) {
539 stat->s_throttles++; 590 stat->s_throttles++;
540 do { 591 do {
541 cpu_relax(); 592 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count, 594 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent)); 595 hmaster->max_bau_concurrent));
545 } 596 }
546
547 while (hmaster->uvhub_quiesce) 597 while (hmaster->uvhub_quiesce)
548 cpu_relax(); 598 cpu_relax();
549 599
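Note: disable_for_congestion() in the hunk above implements a simple self-throttle: the sender accumulates how many requests it issued this period and how long they took, and once the average exceeds a congestion threshold it turns the BAU off for every CPU and arms a re-enable time. A user-space sketch of that average-latency circuit breaker; the thresholds, field names and the fake workload are illustrative, and the per-CPU fan-out and re-enable timer are left out.

#include <stdio.h>
#include <stdbool.h>

struct throttle {
    unsigned long period_requests;
    unsigned long period_time;       /* summed cost of those requests */
    unsigned long congested_limit;   /* average cost that trips the breaker */
    unsigned long congested_reps;    /* need this many samples before judging */
    bool disabled;
};

static void account(struct throttle *t, unsigned long elapsed)
{
    if (t->disabled)
        return;
    t->period_requests++;
    t->period_time += elapsed;
    if (t->period_requests > t->congested_reps &&
        t->period_time / t->period_requests > t->congested_limit) {
        t->disabled = true;          /* fall back to the slow/safe path */
        printf("congested after %lu requests (avg %lu) - disabling fast path\n",
               t->period_requests, t->period_time / t->period_requests);
    }
}

int main(void)
{
    struct throttle t = { 0, 0, 100, 10, false };
    unsigned long cost;

    for (cost = 50; cost < 400; cost += 25)      /* latencies creeping up */
        account(&t, cost);

    printf("fast path %s\n", t.disabled ? "disabled" : "still enabled");
    return 0;
}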
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
557 } 607 }
558 time1 = get_cycles(); 608 time1 = get_cycles();
559 do { 609 do {
560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) { 610 if (try == 0) {
573 /* use message type set by the caller the first time */ 611 bau_desc->header.msg_type = MSG_REGULAR;
574 seq_number = bcp->message_number++; 612 seq_number = bcp->message_number++;
575 } else { 613 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY; 614 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++; 615 stat->s_retry_messages++;
579 } 616 }
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
582 bcp->uvhub_cpu; 619 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles(); 620 bcp->send_message = get_cycles();
584
585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++; 622 try++;
588 completion_status = uv_wait_completion(bau_desc, mmr_offset, 623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
589 right_shift, this_cpu, bcp, smaster, try); 624 right_shift, this_cpu, bcp, smaster, try);
590 625
591 if (completion_status == FLUSH_RETRY_PLUGGED) { 626 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /* 627 destination_plugged(bau_desc, bcp, hmaster, stat);
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) { 628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1; 629 destination_timeout(bau_desc, bcp, hmaster, stat);
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 } 630 }
627 if (bcp->ipi_attempts >= 3) { 631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
628 bcp->ipi_attempts = 0; 632 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP; 633 completion_status = FLUSH_GIVEUP;
630 break; 634 break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) || 637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT)); 638 (completion_status == FLUSH_RETRY_TIMEOUT));
635 time2 = get_cycles(); 639 time2 = get_cycles();
636 640 bcp->plugged_tries = 0;
637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) 641 bcp->timeout_tries = 0;
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) 642 if ((completion_status == FLUSH_COMPLETE) &&
639 hmaster->max_concurrent++; 643 (bcp->conseccompletes > bcp->complete_threshold) &&
640 644 (hmaster->max_bau_concurrent <
641 /* 645 hmaster->max_bau_concurrent_constant))
642 * hold any cpu not timing out here; no other cpu currently held by 646 hmaster->max_bau_concurrent++;
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce) 647 while (hmaster->uvhub_quiesce)
646 cpu_relax(); 648 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count); 649 atomic_dec(&hmaster->active_descriptor_count);
648 650 if (time2 > time1) {
649 /* guard against cycles wrap */ 651 elapsed = time2 - time1;
650 if (time2 > time1) 652 stat->s_time += elapsed;
651 stat->s_time += (time2 - time1); 653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
652 else 654 bcp->period_requests++;
653 stat->s_requestor--; /* don't count this one */ 655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
654 if (completion_status == FLUSH_COMPLETE && try > 1) 663 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++; 664 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) { 665 else if (completion_status == FLUSH_GIVEUP) {
657 /*
658 * Cause the caller to do an IPI-style TLB shootdown on
659 * the target cpu's, all of which are still in the mask.
660 */
661 stat->s_giveup++; 666 stat->s_giveup++;
662 return flush_mask; 667 return 1;
663 }
664
665 /*
666 * Success, so clear the remote cpu's from the mask so we don't
667 * use the IPI method of shootdown on them.
668 */
669 for_each_cpu(bit, flush_mask) {
670 uvhub = uv_cpu_to_blade_id(bit);
671 if (uvhub == this_uvhub)
672 continue;
673 cpumask_clear_cpu(bit, flush_mask);
674 } 668 }
675 if (!cpumask_empty(flush_mask)) 669 return 0;
676 return flush_mask;
677
678 return NULL;
679} 670}
680 671
681/** 672/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
707 struct mm_struct *mm, 698 struct mm_struct *mm,
708 unsigned long va, unsigned int cpu) 699 unsigned long va, unsigned int cpu)
709{ 700{
710 int remotes;
711 int tcpu; 701 int tcpu;
712 int uvhub; 702 int uvhub;
713 int locals = 0; 703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
714 struct bau_desc *bau_desc; 706 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask; 707 struct cpumask *flush_mask;
716 struct ptc_stats *stat; 708 struct ptc_stats *stat;
717 struct bau_control *bcp; 709 struct bau_control *bcp;
710 struct bau_control *tbcp;
718 711
712 /* kernel was booted 'nobau' */
719 if (nobau) 713 if (nobau)
720 return cpumask; 714 return cpumask;
721 715
722 bcp = &per_cpu(bau_control, cpu); 716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
723 /* 737 /*
724 * Each sending cpu has a per-cpu mask which it fills from the caller's 738 * Each sending cpu has a per-cpu mask which it fills from the caller's
725 * cpu mask. Only remote cpus are converted to uvhubs and copied. 739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
726 */ 741 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /* 743 /* don't actually do a shootdown of the local cpu */
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask)) 745 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */ 746 stat->s_ntargself++;
736 747
737 bau_desc = bcp->descriptor_base; 748 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
739
740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
741 remotes = 0; 751
752 /* cpu statistics */
742 for_each_cpu(tcpu, flush_mask) { 753 for_each_cpu(tcpu, flush_mask) {
743 uvhub = uv_cpu_to_blade_id(tcpu); 754 uvhub = uv_cpu_to_blade_id(tcpu);
744 if (uvhub == bcp->uvhub) {
745 locals++;
746 continue;
747 }
748 bau_uvhub_set(uvhub, &bau_desc->distribution); 755 bau_uvhub_set(uvhub, &bau_desc->distribution);
749 remotes++; 756 if (uvhub == bcp->uvhub)
750 } 757 locals++;
751 if (remotes == 0) {
752 /*
753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
756 */
757 if (locals)
758 return cpumask;
759 else 758 else
760 return NULL; 759 remotes++;
761 } 760 }
762 stat = &per_cpu(ptcstats, cpu); 761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes; 764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
765 remotes = bau_uvhub_weight(&bau_desc->distribution); 767 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes; 768
767 if (remotes >= 16) 769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
768 stat->s_ntarguvhub16++; 778 stat->s_ntarguvhub16++;
769 else if (remotes >= 8) 779 else if (hubs >= 8)
770 stat->s_ntarguvhub8++; 780 stat->s_ntarguvhub8++;
771 else if (remotes >= 4) 781 else if (hubs >= 4)
772 stat->s_ntarguvhub4++; 782 stat->s_ntarguvhub4++;
773 else if (remotes >= 2) 783 else if (hubs >= 2)
774 stat->s_ntarguvhub2++; 784 stat->s_ntarguvhub2++;
775 else 785 else
776 stat->s_ntarguvhub1++; 786 stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
779 bau_desc->payload.sending_cpu = cpu; 789 bau_desc->payload.sending_cpu = cpu;
780 790
781 /* 791 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or 792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
783 * the adjusted flush_mask if any cpu's were not messaged. 793 * or 1 if it gave up and the original cpumask should be returned.
784 */ 794 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); 795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
786} 799}
787 800
788/* 801/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
810 823
811 time_start = get_cycles(); 824 time_start = get_cycles();
812 bcp = &per_cpu(bau_control, smp_processor_id()); 825 bcp = &per_cpu(bau_control, smp_processor_id());
813 stat = &per_cpu(ptcstats, smp_processor_id()); 826 stat = bcp->statp;
814 msgdesc.va_queue_first = bcp->va_queue_first; 827 msgdesc.va_queue_first = bcp->va_queue_first;
815 msgdesc.va_queue_last = bcp->va_queue_last; 828 msgdesc.va_queue_last = bcp->va_queue_last;
816 msg = bcp->bau_msg_head; 829 msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
908} 921}
909 922
910static inline unsigned long long 923static inline unsigned long long
911millisec_2_cycles(unsigned long millisec) 924microsec_2_cycles(unsigned long microsec)
912{ 925{
913 unsigned long ns; 926 unsigned long ns;
914 unsigned long long cyc; 927 unsigned long long cyc;
915 928
916 ns = millisec * 1000; 929 ns = microsec * 1000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc; 931 return cyc;
919} 932}
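microsec_2_cycles() inverts the kernel's cycles-to-nanoseconds scale (ns = cycles * cyc2ns >> CYC2NS_SCALE_FACTOR) to turn a microsecond tunable into TSC cycles. A userspace sketch of the same arithmetic; the scale factor of 10 and the 2 GHz clock are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10  /* fixed-point scale assumed for this sketch */

/* cyc2ns is scaled so that ns = (cycles * cyc2ns) >> CYC2NS_SCALE_FACTOR */
static uint64_t usec_to_cycles(uint64_t usec, uint64_t cyc2ns)
{
        uint64_t ns = usec * 1000;

        return (ns << CYC2NS_SCALE_FACTOR) / cyc2ns;
}

int main(void)
{
        uint64_t cyc2ns = 512;  /* hypothetical 2 GHz clock: 0.5 ns/cycle * 1024 */

        /* 20 us at 2 GHz should come out to ~40000 cycles */
        printf("%llu cycles\n", (unsigned long long)usec_to_cycles(20, cyc2ns));
        return 0;
}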
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
931 944
932 if (!cpu) { 945 if (!cpu) {
933 seq_printf(file, 946 seq_printf(file,
934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); 947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
935 seq_printf(file, 950 seq_printf(file,
936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); 951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
937 seq_printf(file, 952 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt "); 953 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file, 954 seq_printf(file,
940 "sw_ack recv rtime all "); 955 "sw_ack recv rtime all ");
941 seq_printf(file, 956 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n"); 957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
943 } 960 }
944 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
945 stat = &per_cpu(ptcstats, cpu); 962 stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
947 seq_printf(file, 964 seq_printf(file,
948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
949 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
950 stat->s_ntarguvhub, stat->s_ntarguvhub16, 967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout); 974 stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok, 976 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout, 977 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout, 978 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles); 979 stat->s_busy, stat->s_throttles);
980
959 /* destination side statistics */ 981 /* destination side statistics */
960 seq_printf(file, 982 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", 983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
964 stat->d_requestee, cycles_2_us(stat->d_time), 986 stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
966 stat->d_nomsg, stat->d_retries, stat->d_canceled, 988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets, 989 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled); 990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
969 } 993 }
970 994
971 return 0; 995 return 0;
972} 996}
973 997
974/* 998/*
999 * Display the tunables through debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
975 * -1: reset the statistics 1019 * -1: reset the statistics
976 * 0: display meaning of the statistics 1020 * 0: display meaning of the statistics
977 * >0: maximum concurrent active descriptors per uvhub (throttle)
978 */ 1021 */
979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
980 size_t count, loff_t *data) 1023 size_t count, loff_t *data)
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
983 long input_arg; 1026 long input_arg;
984 char optstr[64]; 1027 char optstr[64];
985 struct ptc_stats *stat; 1028 struct ptc_stats *stat;
986 struct bau_control *bcp;
987 1029
988 if (count == 0 || count > sizeof(optstr)) 1030 if (count == 0 || count > sizeof(optstr))
989 return -EINVAL; 1031 return -EINVAL;
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1059 "reset: number of ipi-style reset requests processed\n"); 1101 "reset: number of ipi-style reset requests processed\n");
1060 printk(KERN_DEBUG 1102 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n"); 1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1062 } else if (input_arg == -1) { 1108 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) { 1109 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu); 1110 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats)); 1111 memset(stat, 0, sizeof(struct ptc_stats));
1066 } 1112 }
1067 } else { 1113 }
1068 uv_bau_max_concurrent = input_arg; 1114
1069 bcp = &per_cpu(bau_control, smp_processor_id()); 1115 return count;
1070 if (uv_bau_max_concurrent < 1 || 1116}
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) { 1117
1072 printk(KERN_DEBUG 1118static int local_atoi(const char *name)
1073 "Error: BAU max concurrent %d; %d is invalid\n", 1119{
1074 bcp->max_concurrent, uv_bau_max_concurrent); 1120 int val = 0;
1075 return -EINVAL; 1121
1076 } 1122 for (;; name++) {
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n", 1123 switch (*name) {
1078 uv_bau_max_concurrent); 1124 case '0' ... '9':
1079 for_each_present_cpu(cpu) { 1125 val = 10*val+(*name-'0');
1080 bcp = &per_cpu(bau_control, cpu); 1126 break;
1081 bcp->max_concurrent = uv_bau_max_concurrent; 1127 default:
1128 return val;
1082 } 1129 }
1083 } 1130 }
1131}
1132
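local_atoi() above accumulates decimal digits until the first non-digit, which for non-negative input matches strtol() in base 10. A quick userspace check of that equivalence; the portable comparison below replaces the gcc case-range extension used in the kernel version:

#include <stdio.h>
#include <stdlib.h>

/* Same digit-accumulation idea as local_atoi(), without the case range. */
static int parse_decimal(const char *s)
{
        int val = 0;

        for (; *s >= '0' && *s <= '9'; s++)
                val = 10 * val + (*s - '0');
        return val;
}

int main(void)
{
        const char *field = "8 10 97";

        /* both stop at the first blank, as the tunables parser expects */
        printf("%d vs %ld\n", parse_decimal(field), strtol(field, NULL, 10));
        return 0;
}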
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1084 1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1085 return count; 1256 return count;
1086} 1257}
1087 1258
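tunables_write() splits the user buffer on whitespace with strspn()/strcspn() and insists on exactly nine fields before it parses anything. A standalone sketch of that counting loop; the WHITESPACE definition here is an assumption, not copied from the kernel header:

#include <stdio.h>
#include <string.h>

#define WHITESPACE " \t\n"      /* assumed; the kernel macro lives elsewhere */

static int count_fields(const char *s)
{
        const char *p = s + strspn(s, WHITESPACE);
        const char *q = p;
        int cnt = 0;

        for (; *p; p = q + strspn(q, WHITESPACE)) {
                q = p + strcspn(p, WHITESPACE);
                cnt++;
                if (q == p)
                        break;
        }
        return cnt;
}

int main(void)
{
        /* nine numbers, like a valid write to the tunables file */
        printf("%d fields\n", count_fields("8 1 1 1 1 1 10 50 20\n"));
        return 0;
}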
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1097 return seq_open(file, &uv_ptc_seq_ops); 1268 return seq_open(file, &uv_ptc_seq_ops);
1098} 1269}
1099 1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1100static const struct file_operations proc_uv_ptc_operations = { 1276static const struct file_operations proc_uv_ptc_operations = {
1101 .open = uv_ptc_proc_open, 1277 .open = uv_ptc_proc_open,
1102 .read = seq_read, 1278 .read = seq_read,
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = {
1105 .release = seq_release, 1281 .release = seq_release,
1106}; 1282};
1107 1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1108static int __init uv_ptc_init(void) 1290static int __init uv_ptc_init(void)
1109{ 1291{
1110 struct proc_dir_entry *proc_uv_ptc; 1292 struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void)
1119 UV_PTC_BASENAME); 1301 UV_PTC_BASENAME);
1120 return -EINVAL; 1302 return -EINVAL;
1121 } 1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1122 return 0; 1318 return 0;
1123} 1319}
1124 1320
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1259} 1455}
1260 1456
1261/* 1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1460 * So the destination timeout period has to be calculated from them.
1461 */
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
1483
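calculate_destination_timeout() multiplies a table-derived base period by two multipliers read from MMRs and converts the result to microseconds. The sketch below only illustrates that arithmetic; the table contents, field positions, and register values are placeholders, not the real UV layout:

#include <stdio.h>

/* Placeholder values -- the real table and MMR field layout are elsewhere. */
static const unsigned long timeout_base_ns[] = {
        0, 20, 40, 80, 160, 320, 640, 1280
};

static unsigned long dest_timeout_us(unsigned int index, unsigned int mult1,
                                     unsigned int mult2)
{
        unsigned long ts_ns = timeout_base_ns[index] * mult1 * mult2;

        return ts_ns / 1000;            /* nanoseconds -> microseconds */
}

int main(void)
{
        /* e.g. a 1280 ns base scaled by multipliers of 10 and 16 -> 204 us */
        printf("%lu us\n", dest_timeout_us(7, 10, 16));
        return 0;
}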
1484/*
1262 * initialize the bau_control structure for each cpu 1485 * initialize the bau_control structure for each cpu
1263 */ 1486 */
1264static void uv_init_per_cpu(int nuvhubs) 1487static void __init uv_init_per_cpu(int nuvhubs)
1265{ 1488{
1266 int i, j, k; 1489 int i;
1267 int cpu; 1490 int cpu;
1268 int pnode; 1491 int pnode;
1269 int uvhub; 1492 int uvhub;
1493 int have_hmaster;
1270 short socket = 0; 1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1271 struct bau_control *bcp; 1497 struct bau_control *bcp;
1272 struct uvhub_desc *bdp; 1498 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp; 1499 struct socket_desc *sdp;
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs)
1278 short cpu_number[16]; 1504 short cpu_number[16];
1279 }; 1505 };
1280 struct uvhub_desc { 1506 struct uvhub_desc {
1281 short num_sockets; 1507 unsigned short socket_mask;
1282 short num_cpus; 1508 short num_cpus;
1283 short uvhub; 1509 short uvhub;
1284 short pnode; 1510 short pnode;
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs)
1286 }; 1512 };
1287 struct uvhub_desc *uvhub_descs; 1513 struct uvhub_desc *uvhub_descs;
1288 1514
1515 timeout_us = calculate_destination_timeout();
1516
1289 uvhub_descs = (struct uvhub_desc *) 1517 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); 1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1292 for_each_present_cpu(cpu) { 1521 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu); 1522 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control)); 1523 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode; 1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1299 bdp = &uvhub_descs[uvhub]; 1527 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++; 1528 bdp->num_cpus++;
1301 bdp->uvhub = uvhub; 1529 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode; 1530 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */ 1531 /* kludge: 'assuming' one node per socket, and assuming that
1304 bcp->timeout_interval = millisec_2_cycles(3); 1532 disabling a socket just leaves a gap in node numbers */
1305 /* kludge: assume uv_hub.h is constant */ 1533 socket = (cpu_to_node(cpu) & 1);
1306 socket = (cpu_physical_id(cpu)>>5)&1; 1534 bdp->socket_mask |= (1 << socket);
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket]; 1535 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu; 1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++; 1537 sdp->num_cpus++;
1312 } 1538 }
1313 socket = 0; 1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1314 for_each_possible_blade(uvhub) { 1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1315 bdp = &uvhub_descs[uvhub]; 1543 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) { 1544 socket_mask = bdp->socket_mask;
1317 sdp = &bdp->socket[i]; 1545 socket = 0;
1318 for (j = 0; j < sdp->num_cpus; j++) { 1546 while (socket_mask) {
1319 cpu = sdp->cpu_number[j]; 1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1320 bcp = &per_cpu(bau_control, cpu); 1552 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu; 1553 bcp->cpu = cpu;
1322 if (j == 0) { 1554 if (i == 0) {
1323 smaster = bcp; 1555 smaster = bcp;
1324 if (i == 0) 1556 if (!have_hmaster) {
1557 have_hmaster++;
1325 hmaster = bcp; 1558 hmaster = bcp;
1559 }
1326 } 1560 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus; 1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus; 1562 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster; 1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1330 bcp->uvhub_master = hmaster; 1565 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++) 1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1332 bcp->socket_acknowledge_count[k] = 0; 1567 blade_processor_id;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 } 1568 }
1569nextsocket:
1336 socket++; 1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1337 } 1572 }
1338 } 1573 }
1339 kfree(uvhub_descs); 1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1340} 1593}
1341 1594
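The reworked per-cpu init tracks populated hubs in a byte-array bitmap (uvhub_mask) and populated sockets per hub in a small bitmask, then walks only the set bits. A compact userspace sketch of both idioms, with arbitrary sizes and values of my own choosing:

#include <stdio.h>

int main(void)
{
        unsigned char hub_map[4] = { 0 };       /* one bit per hub, 32 hubs */
        unsigned short socket_mask = 0x5;       /* sockets 0 and 2 populated */
        int hub;

        /* mark hubs 1 and 9 as populated, the way uvhub_mask is filled */
        hub_map[1 / 8] |= 1 << (1 % 8);
        hub_map[9 / 8] |= 1 << (9 % 8);

        for (hub = 0; hub < 32; hub++) {
                unsigned short mask = socket_mask;
                int socket = 0;

                if (!(hub_map[hub / 8] & (1 << (hub % 8))))
                        continue;               /* skip unpopulated hubs */
                printf("hub %d:", hub);

                /* walk the per-hub socket mask one bit at a time */
                while (mask) {
                        if (mask & 1)
                                printf(" socket %d", socket);
                        socket++;
                        mask >>= 1;
                }
                printf("\n");
        }
        return 0;
}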
1342/* 1595/*
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void)
1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1362 GFP_KERNEL, cpu_to_node(cur_cpu)); 1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1363 1616
1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val; 1617 uv_nshift = uv_hub_info->m_val;
1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1367 nuvhubs = uv_num_possible_blades(); 1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1368 1622
1369 uv_init_per_cpu(nuvhubs); 1623 uv_init_per_cpu(nuvhubs);
1370 1624
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void)
1383 alloc_intr_gate(vector, uv_bau_message_intr1); 1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1384 1638
1385 for_each_possible_blade(uvhub) { 1639 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub); 1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1387 /* INIT the bau */ 1641 pnode = uv_blade_to_pnode(uvhub);
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, 1642 /* INIT the bau */
1389 ((unsigned long)1 << 63)); 1643 uv_write_global_mmr64(pnode,
1390 mmr = 1; /* should be 1 to broadcast to both sockets */ 1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); 1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1392 } 1650 }
1393 1651
1394 return 0; 1652 return 0;
1395} 1653}
1396core_initcall(uv_bau_init); 1654core_initcall(uv_bau_init);
1397core_initcall(uv_ptc_init); 1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP) 393 == NOTIFY_STOP)
394 return; 394 return;
395
395#ifdef CONFIG_X86_LOCAL_APIC 396#ifdef CONFIG_X86_LOCAL_APIC
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP)
399 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
396 /* 402 /*
397 * Ok, so this is none of the documented NMI sources, 403 * Ok, so this is none of the documented NMI sources,
398 * so it must be the NMI watchdog. 404 * so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
400 if (nmi_watchdog_tick(regs, reason)) 406 if (nmi_watchdog_tick(regs, reason))
401 return; 407 return;
402 if (!do_nmi_callback(regs, cpu)) 408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
403 unknown_nmi_error(reason, regs); 410 unknown_nmi_error(reason, regs);
404#else 411#else
405 unknown_nmi_error(reason, regs); 412 unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..ce8e50239332 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc, 752 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 754 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 755 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 756#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
845 844
846static void __init init_tsc_clocksource(void) 845static void __init init_tsc_clocksource(void)
847{ 846{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 847 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 848 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 849 /* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 851 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 852 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 853 }
857 clocksource_register(&clocksource_tsc); 854 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 855}
859 856
860#ifdef CONFIG_X86_64 857#ifdef CONFIG_X86_64
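The removed lines computed clocksource_tsc.mult by hand from tsc_khz and a fixed shift of 22; clocksource_register_khz() now derives both internally. The conversion being encoded is ns = (cycles * mult) >> shift, so mult = (10^6 << shift) / khz. A userspace sketch of that arithmetic with a made-up TSC frequency:

#include <stdint.h>
#include <stdio.h>

/* mult such that ns = (cycles * mult) >> shift for a clock running at khz */
static uint32_t khz2mult(uint32_t khz, uint32_t shift)
{
        uint64_t tmp = (uint64_t)1000000 << shift;

        tmp += khz / 2;                 /* round to nearest */
        return (uint32_t)(tmp / khz);
}

int main(void)
{
        uint32_t khz = 2933437;         /* hypothetical ~2.93 GHz TSC */
        uint32_t shift = 22;            /* the shift the old code hard-coded */
        uint32_t mult = khz2mult(khz, shift);
        uint64_t cycles = 2933437;      /* one millisecond worth of cycles */

        printf("mult=%u  ~%llu ns\n", mult,
               (unsigned long long)((cycles * (uint64_t)mult) >> shift));
        return 0;
}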
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
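vtime() now takes a lockless snapshot of wall_time_sec under the vsyscall seqlock: read the sequence, copy the data, retry if the sequence changed. A toy, single-threaded rendition of the reader side; the seqcount implementation here is mine, not the kernel's:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy seqcount: a writer would make the count odd while updating. */
static _Atomic unsigned int seqcount;
static uint64_t wall_time_sec;          /* stand-in for the protected gtod data */

static unsigned int toy_read_seqbegin(void)
{
        unsigned int seq;

        do {
                seq = atomic_load_explicit(&seqcount, memory_order_acquire);
        } while (seq & 1);              /* writer in progress: spin */
        return seq;
}

static int toy_read_seqretry(unsigned int seq)
{
        atomic_thread_fence(memory_order_acquire);
        return atomic_load_explicit(&seqcount, memory_order_relaxed) != seq;
}

int main(void)
{
        unsigned int seq;
        uint64_t secs;

        do {
                seq = toy_read_seqbegin();
                secs = wall_time_sec;   /* snapshot inside the retry window */
        } while (toy_read_seqretry(seq));

        printf("%llu\n", (unsigned long long)secs);
        return 0;
}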
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24a..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
16 */ 16 */
17u64 pcntxt_mask; 17u64 pcntxt_mask;
18 18
19/*
20 * Represents init state for the supported extended state.
21 */
22static struct xsave_struct *init_xstate_buf;
23
19struct _fpx_sw_bytes fx_sw_reserved; 24struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION 25#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32; 26struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif 27#endif
23 28
29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
30
31/*
32 * If a processor implementation discerns that a processor state component is
33 * in its initialized state, it may modify the corresponding bit in the
34 * xsave_hdr.xstate_bv as '0', without modifying the corresponding memory
35 * layout in the case of xsaveopt. While presenting the xstate information to
36 * the user, we always ensure that the memory layout of a feature will be in
37 * the init state if the corresponding header bit is zero. This is to ensure
38 * that the user doesn't see some stale state in the memory layout during
39 * signal handling, debugging etc.
40 */
41void __sanitize_i387_state(struct task_struct *tsk)
42{
43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
46
47 if (!fx)
48 return;
49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53
54 /*
55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is up to date.
57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return;
60
61 /*
62 * FP is in init state
63 */
64 if (!(xstate_bv & XSTATE_FP)) {
65 fx->cwd = 0x37f;
66 fx->swd = 0;
67 fx->twd = 0;
68 fx->fop = 0;
69 fx->rip = 0;
70 fx->rdp = 0;
71 memset(&fx->st_space[0], 0, 128);
72 }
73
74 /*
75 * SSE is in init state
76 */
77 if (!(xstate_bv & XSTATE_SSE))
78 memset(&fx->xmm_space[0], 0, 256);
79
80 xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
81
82 /*
83 * Update all the other memory layouts for which the corresponding
84 * header bit is in the init state.
85 */
86 while (xstate_bv) {
87 if (xstate_bv & 0x1) {
88 int offset = xstate_offsets[feature_bit];
89 int size = xstate_sizes[feature_bit];
90
91 memcpy(((void *) fx) + offset,
92 ((void *) init_xstate_buf) + offset,
93 size);
94 }
95
96 xstate_bv >>= 1;
97 feature_bit++;
98 }
99}
100
24/* 101/*
25 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 113
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 115 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 116 if (err)
41 return err; 117 return -EFAULT;
42 118
43 /* 119 /*
44 * First Magic check failed. 120 * First Magic check failed.
45 */ 121 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 123 return -EINVAL;
48 124
49 /* 125 /*
50 * Check for error scenarios. 126 * Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 128 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 129 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 131 return -EINVAL;
56 132
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 134 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
60 /* 138 /*
61 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout without copying the extended state information 141 * fpstate layout without copying the extended state information
64 * in the memory layout. 142 * in the memory layout.
65 */ 143 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 144 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 145 return -EFAULT;
68 146
69 return 0; 147 return 0;
70} 148}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
91 return 0; 169 return 0;
92 170
93 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (use_xsave()) 172 if (use_xsave())
103 err = xsave_user(buf); 173 err = xsave_user(buf);
104 else 174 else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 180 stts();
111 } else { 181 } else {
182 sanitize_i387_state(tsk);
112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 184 xstate_size))
114 return -1; 185 return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
184 * init the state skipped by the user. 255 * init the state skipped by the user.
185 */ 256 */
186 mask = pcntxt_mask & ~mask; 257 mask = pcntxt_mask & ~mask;
187 258 if (unlikely(mask))
188 xrstor_state(init_xstate_buf, mask); 259 xrstor_state(init_xstate_buf, mask);
189 260
190 return 0; 261 return 0;
191 262
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
274#endif 345#endif
275} 346}
276 347
277/*
278 * Represents init state for the supported extended state.
279 */
280struct xsave_struct *init_xstate_buf;
281
282#ifdef CONFIG_X86_64 348#ifdef CONFIG_X86_64
283unsigned int sig_xstate_size = sizeof(struct _fpstate); 349unsigned int sig_xstate_size = sizeof(struct _fpstate);
284#endif 350#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
286/* 352/*
287 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
288 */ 354 */
289void __cpuinit xsave_init(void) 355static inline void xstate_enable(void)
290{ 356{
291 if (!cpu_has_xsave)
292 return;
293
294 set_in_cr4(X86_CR4_OSXSAVE); 357 set_in_cr4(X86_CR4_OSXSAVE);
295
296 /*
297 * Enable all the features that the HW is capable of
298 * and the Linux kernel is aware of.
299 */
300 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 358 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
301} 359}
302 360
303/* 361/*
362 * Record the offsets and sizes of different state managed by the xsave
363 * memory layout.
364 */
365static void __init setup_xstate_features(void)
366{
367 int eax, ebx, ecx, edx, leaf = 0x2;
368
369 xstate_features = fls64(pcntxt_mask);
370 xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
371 xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
372
373 do {
374 cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
375
376 if (eax == 0)
377 break;
378
379 xstate_offsets[leaf] = ebx;
380 xstate_sizes[leaf] = eax;
381
382 leaf++;
383 } while (1);
384}
385
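setup_xstate_features() walks CPUID leaf 0xD: for each component sub-leaf, EAX reports the state's size and EBX its offset inside the XSAVE area. The same query can be issued from userspace, assuming an x86 machine and a GCC/Clang toolchain that provides <cpuid.h>; output depends on the CPU:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, leaf;

        /* sub-leaf 0 describes the overall mask; components start at 2 */
        for (leaf = 2; leaf < 8; leaf++) {
                __cpuid_count(0xd, leaf, eax, ebx, ecx, edx);
                if (eax == 0)
                        continue;       /* no component reported here */
                /* EAX = size in bytes, EBX = offset within the XSAVE area */
                printf("xstate component %u: offset %u, size %u\n",
                       leaf, ebx, eax);
        }
        return 0;
}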
386/*
304 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
305 */ 388 */
306static void __init setup_xstate_init(void) 389static void __init setup_xstate_init(void)
307{ 390{
391 setup_xstate_features();
392
393 /*
394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave
396 */
308 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem(xstate_size);
309 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399
400 clts();
401 /*
402 * Init all the features state with header_bv being 0x0
403 */
404 xrstor_state(init_xstate_buf, -1);
405 /*
406 * Dump the init state again. This is to identify the init state
407 * of any feature which is not represented by all zero's.
408 */
409 xsave_state(init_xstate_buf, -1);
410 stts();
310} 411}
311 412
312/* 413/*
313 * Enable and initialize the xsave feature. 414 * Enable and initialize the xsave feature.
314 */ 415 */
315void __ref xsave_cntxt_init(void) 416static void __init xstate_enable_boot_cpu(void)
316{ 417{
317 unsigned int eax, ebx, ecx, edx; 418 unsigned int eax, ebx, ecx, edx;
318 419
319 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 420 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
421 WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
422 return;
423 }
424
425 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
320 pcntxt_mask = eax + ((u64)edx << 32); 426 pcntxt_mask = eax + ((u64)edx << 32);
321 427
322 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 428 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
329 * Support only the state known to OS. 435 * Support only the state known to OS.
330 */ 436 */
331 pcntxt_mask = pcntxt_mask & XCNTXT_MASK; 437 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
332 xsave_init(); 438
439 xstate_enable();
333 440
334 /* 441 /*
335 * Recompute the context size for enabled features 442 * Recompute the context size for enabled features
336 */ 443 */
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 444 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 445 xstate_size = ebx;
339 446
340 update_regset_xstate_info(xstate_size, pcntxt_mask); 447 update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
346 "cntxt size 0x%x\n", 453 "cntxt size 0x%x\n",
347 pcntxt_mask, xstate_size); 454 pcntxt_mask, xstate_size);
348} 455}
456
457/*
458 * For the very first instance, this calls xstate_enable_boot_cpu();
459 * for all subsequent instances, this calls xstate_enable().
460 *
461 * This is somewhat obfuscated due to the lack of powerful enough
462 * overrides for the section checks.
463 */
464void __cpuinit xsave_init(void)
465{
466 static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
467 void (*this_func)(void);
468
469 if (!cpu_has_xsave)
470 return;
471
472 this_func = next_func;
473 next_func = xstate_enable;
474 this_func();
475}
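The new xsave_init() flips a static function pointer after its first call, so the boot CPU runs the full setup and every later CPU only re-enables the feature. A tiny illustration of that one-shot pattern with invented function names; as in CPU bring-up, the calls are assumed to be serialized:

#include <stdio.h>

static void enable_only(void)
{
        printf("enable\n");
}

static void enable_boot_cpu(void)
{
        printf("one-time setup + enable\n");
}

/* First call runs the boot-CPU variant, later calls the cheap variant. */
static void init(void)
{
        static void (*next_func)(void) = enable_boot_cpu;
        void (*this_func)(void);

        this_func = next_func;
        next_func = enable_only;
        this_func();
}

int main(void)
{
        init();         /* "one-time setup + enable" */
        init();         /* "enable" */
        init();         /* "enable" */
        return 0;
}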
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * 13 *
13 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
68#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 69#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */
70#define SrcMask (0xf<<4) 74#define SrcMask (0xf<<4)
71/* Generic ModRM decode. */ 75/* Generic ModRM decode. */
72#define ModRM (1<<8) 76#define ModRM (1<<8)
@@ -88,10 +92,6 @@
88#define Src2CL (1<<29) 92#define Src2CL (1<<29)
89#define Src2ImmByte (2<<29) 93#define Src2ImmByte (2<<29)
90#define Src2One (3<<29) 94#define Src2One (3<<29)
91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
95#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
96 96
97enum { 97enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
124 /* 0x20 - 0x27 */ 124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */ 128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 0, 0, 0, 0, 131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */ 132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 0, 0, 0, 0, 135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */ 136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
170 /* 0x88 - 0x8F */ 170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 DstReg | SrcMem | ModRM | Mov, Group | Group1A, 174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */ 175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */ 177 /* 0x98 - 0x9F */
178 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String, 188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
337 [Group1A*8] = 337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] = 339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, 0, 340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0, 342 0, 0, 0, 0,
343 [Group3*8] = 343 [Group3*8] =
344 DstMem | SrcImm | ModRM, 0, 344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0, 346 0, 0, 0, 0,
347 [Group4*8] = 347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0,
350 [Group5*8] = 350 [Group5*8] =
351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0, 354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] = 355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
576 (_type)_x; \ 576 (_type)_x; \
577}) 577})
578 578
579#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \
582 goto done; \
583 (_eip) += (_size); \
584})
585
579static inline unsigned long ad_mask(struct decode_cache *c) 586static inline unsigned long ad_mask(struct decode_cache *c)
580{ 587{
581 return (1UL << (c->ad_bytes << 3)) - 1; 588 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
617 c->seg_override = seg; 624 c->seg_override = seg;
618} 625}
619 626
620static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 627static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
628 struct x86_emulate_ops *ops, int seg)
621{ 629{
622 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
623 return 0; 631 return 0;
624 632
625 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 633 return ops->get_cached_segment_base(seg, ctxt->vcpu);
626} 634}
627 635
628static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops,
629 struct decode_cache *c) 638 struct decode_cache *c)
630{ 639{
631 if (!c->has_seg_override) 640 if (!c->has_seg_override)
632 return 0; 641 return 0;
633 642
634 return seg_base(ctxt, c->seg_override); 643 return seg_base(ctxt, ops, c->seg_override);
644}
645
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
647 struct x86_emulate_ops *ops)
648{
649 return seg_base(ctxt, ops, VCPU_SREG_ES);
650}
651
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
653 struct x86_emulate_ops *ops)
654{
655 return seg_base(ctxt, ops, VCPU_SREG_SS);
656}
657
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
659 u32 error, bool valid)
660{
661 ctxt->exception = vec;
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665}
666
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
668{
669 emulate_exception(ctxt, GP_VECTOR, err, true);
635} 670}
636 671
637static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
673 int err)
638{ 674{
639 return seg_base(ctxt, VCPU_SREG_ES); 675 ctxt->cr2 = addr;
676 emulate_exception(ctxt, PF_VECTOR, err, true);
640} 677}
641 678
642static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 679static void emulate_ud(struct x86_emulate_ctxt *ctxt)
643{ 680{
644 return seg_base(ctxt, VCPU_SREG_SS); 681 emulate_exception(ctxt, UD_VECTOR, 0, false);
682}
683
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
685{
686 emulate_exception(ctxt, TS_VECTOR, err, true);
645} 687}
646 688
647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
932 /* we cannot decode insn before we complete previous rep insn */ 974 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart); 975 WARN_ON(ctxt->restart);
934 976
935 /* Shadow copy of register state. Committed on successful emulation. */
936 memset(c, 0, sizeof(struct decode_cache));
937 c->eip = ctxt->eip; 977 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip; 978 c->fetch.start = c->fetch.end = c->eip;
939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
941 980
942 switch (mode) { 981 switch (mode) {
943 case X86EMUL_MODE_REAL: 982 case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
1060 set_seg_override(c, VCPU_SREG_DS); 1099 set_seg_override(c, VCPU_SREG_DS);
1061 1100
1062 if (!(!c->twobyte && c->b == 0x8d)) 1101 if (!(!c->twobyte && c->b == 0x8d))
1063 c->modrm_ea += seg_override_base(ctxt, c); 1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1064 1103
1065 if (c->ad_bytes != 8) 1104 if (c->ad_bytes != 8)
1066 c->modrm_ea = (u32)c->modrm_ea; 1105 c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
1148 else 1187 else
1149 c->src.val = insn_fetch(u8, 1, c->eip); 1188 c->src.val = insn_fetch(u8, 1, c->eip);
1150 break; 1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1151 case SrcOne: 1209 case SrcOne:
1152 c->src.bytes = 1; 1210 c->src.bytes = 1;
1153 c->src.val = 1; 1211 c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
1156 c->src.type = OP_MEM; 1214 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *) 1216 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c), 1217 register_address(c, seg_override_base(ctxt, ops, c),
1160 c->regs[VCPU_REGS_RSI]); 1218 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0; 1219 c->src.val = 0;
1162 break; 1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1163 } 1232 }
1164 1233
1165 /* 1234 /*
@@ -1179,22 +1248,10 @@ done_prefixes:
1179 c->src2.bytes = 1; 1248 c->src2.bytes = 1;
1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1181 break; 1250 break;
1182 case Src2Imm16:
1183 c->src2.type = OP_IMM;
1184 c->src2.ptr = (unsigned long *)c->eip;
1185 c->src2.bytes = 2;
1186 c->src2.val = insn_fetch(u16, 2, c->eip);
1187 break;
1188 case Src2One: 1251 case Src2One:
1189 c->src2.bytes = 1; 1252 c->src2.bytes = 1;
1190 c->src2.val = 1; 1253 c->src2.val = 1;
1191 break; 1254 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1198 } 1255 }
1199 1256
1200 /* Decode and fetch the destination operand: register or memory. */ 1257 /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
1253 c->dst.type = OP_MEM; 1310 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *) 1312 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt), 1313 register_address(c, es_base(ctxt, ops),
1257 c->regs[VCPU_REGS_RDI]); 1314 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0; 1315 c->dst.val = 0;
1259 break; 1316 break;
@@ -1263,6 +1320,37 @@ done:
1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1264} 1321}
1265 1322
1323static int read_emulated(struct x86_emulate_ctxt *ctxt,
1324 struct x86_emulate_ops *ops,
1325 unsigned long addr, void *dest, unsigned size)
1326{
1327 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330
1331 while (size) {
1332 int n = min(size, 8u);
1333 size -= n;
1334 if (mc->pos < mc->end)
1335 goto read_cached;
1336
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE)
1342 return rc;
1343 mc->end += n;
1344
1345 read_cached:
1346 memcpy(dest, mc->data + mc->pos, n);
1347 mc->pos += n;
1348 dest += n;
1349 addr += n;
1350 }
1351 return X86EMUL_CONTINUE;
1352}
1353
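read_emulated() above funnels guest reads through a small per-instruction cache: bytes already fetched are replayed from mem_read, and misses are filled in chunks of at most eight bytes. A simplified userspace rendition over an in-memory "guest" buffer; the struct and names are mine, not KVM's:

#include <stdio.h>
#include <string.h>

struct read_cache {
        unsigned char data[1024];
        unsigned int pos, end;
};

/* the "guest memory" the expensive fetch would normally come from */
static const unsigned char guest[64] =
        "0123456789abcdefghijklmnopqrstuvwxyz";

static void cached_read(struct read_cache *mc, unsigned long addr,
                        void *dest, unsigned int size)
{
        while (size) {
                unsigned int n = size < 8 ? size : 8;

                size -= n;
                if (mc->pos >= mc->end) {
                        /* miss: fetch the next chunk into the cache */
                        memcpy(mc->data + mc->end, guest + addr, n);
                        mc->end += n;
                }
                memcpy(dest, mc->data + mc->pos, n);
                mc->pos += n;
                dest = (unsigned char *)dest + n;
                addr += n;
        }
}

int main(void)
{
        struct read_cache mc = { .pos = 0, .end = 0 };
        char buf[16] = { 0 };

        cached_read(&mc, 0, buf, 10);   /* fills the cache */
        mc.pos = 0;                     /* a restarted instruction re-reads */
        cached_read(&mc, 0, buf, 10);   /* served from the cache */
        printf("%s\n", buf);
        return 0;
}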
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops, 1355 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port, 1356 unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331 1419
1332 if (dt.size < index * 8 + 7) { 1420 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1421 emulate_gp(ctxt, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT; 1422 return X86EMUL_PROPAGATE_FAULT;
1335 } 1423 }
1336 addr = dt.address + index * 8; 1424 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1427 emulate_pf(ctxt, addr, err);
1340 1428
1341 return ret; 1429 return ret;
1342} 1430}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356 1444
1357 if (dt.size < index * 8 + 7) { 1445 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1446 emulate_gp(ctxt, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT; 1447 return X86EMUL_PROPAGATE_FAULT;
1360 } 1448 }
1361 1449
1362 addr = dt.address + index * 8; 1450 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1453 emulate_pf(ctxt, addr, err);
1366 1454
1367 return ret; 1455 return ret;
1368} 1456}
@@ -1481,11 +1569,70 @@ load:
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE; 1570 return X86EMUL_CONTINUE;
1483exception: 1571exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1572 emulate_exception(ctxt, err_vec, err_code, true);
1485 return X86EMUL_PROPAGATE_FAULT; 1573 return X86EMUL_PROPAGATE_FAULT;
1486} 1574}
1487 1575
1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1576static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops)
1578{
1579 int rc;
1580 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582
1583 switch (c->dst.type) {
1584 case OP_REG:
1585 /* The 4-byte case *is* correct:
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break;
1603 case OP_MEM:
1604 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr,
1607 &c->dst.orig_val,
1608 &c->dst.val,
1609 c->dst.bytes,
1610 &err,
1611 ctxt->vcpu);
1612 else
1613 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr,
1615 &c->dst.val,
1616 c->dst.bytes,
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE)
1623 return rc;
1624 break;
1625 case OP_NONE:
1626 /* no writeback */
1627 break;
1628 default:
1629 break;
1630 }
1631 return X86EMUL_CONTINUE;
1632}
1633
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1635 struct x86_emulate_ops *ops)
1489{ 1636{
1490 struct decode_cache *c = &ctxt->decode; 1637 struct decode_cache *c = &ctxt->decode;
1491 1638
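As an aside, a minimal user-space sketch of the register writeback rule the writeback() helper above encodes for OP_REG destinations: 8- and 16-bit stores preserve the upper register bits, while a 32-bit store in 64-bit mode zero-extends the whole register. The helper and register values below are illustrative only, not part of the emulator.

#include <stdint.h>
#include <stdio.h>

/* Model a 64-bit guest register and apply a store of 'bytes' width,
 * mirroring the OP_REG cases in writeback() (little-endian x86 view). */
static void reg_writeback(uint64_t *reg, uint64_t val, int bytes)
{
        switch (bytes) {
        case 1: /* like *(u8 *)c->dst.ptr = val: low byte replaced, rest kept */
                *reg = (*reg & ~0xffULL) | (uint8_t)val;
                break;
        case 2: /* like *(u16 *)c->dst.ptr = val */
                *reg = (*reg & ~0xffffULL) | (uint16_t)val;
                break;
        case 4: /* *c->dst.ptr = (u32)val: zero-extends the 64-bit register */
                *reg = (uint32_t)val;
                break;
        case 8:
                *reg = val;
                break;
        }
}

int main(void)
{
        uint64_t rax = 0x1122334455667788ULL;

        reg_writeback(&rax, 0xaabb, 2);
        printf("16-bit store: %#llx\n", (unsigned long long)rax); /* 0x112233445566aabb */

        rax = 0x1122334455667788ULL;
        reg_writeback(&rax, 0xddccbbaa, 4);
        printf("32-bit store: %#llx\n", (unsigned long long)rax); /* 0xddccbbaa */
        return 0;
}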
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1493 c->dst.bytes = c->op_bytes; 1640 c->dst.bytes = c->op_bytes;
1494 c->dst.val = c->src.val; 1641 c->dst.val = c->src.val;
1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1496 c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
1497 c->regs[VCPU_REGS_RSP]); 1644 c->regs[VCPU_REGS_RSP]);
1498} 1645}
1499 1646
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1504 struct decode_cache *c = &ctxt->decode; 1651 struct decode_cache *c = &ctxt->decode;
1505 int rc; 1652 int rc;
1506 1653
1507 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
1508 c->regs[VCPU_REGS_RSP]), 1655 c->regs[VCPU_REGS_RSP]),
1509 dest, len, ctxt->vcpu); 1656 dest, len);
1510 if (rc != X86EMUL_CONTINUE) 1657 if (rc != X86EMUL_CONTINUE)
1511 return rc; 1658 return rc;
1512 1659
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1541 break; 1688 break;
1542 case X86EMUL_MODE_VM86: 1689 case X86EMUL_MODE_VM86:
1543 if (iopl < 3) { 1690 if (iopl < 3) {
1544 kvm_inject_gp(ctxt->vcpu, 0); 1691 emulate_gp(ctxt, 0);
1545 return X86EMUL_PROPAGATE_FAULT; 1692 return X86EMUL_PROPAGATE_FAULT;
1546 } 1693 }
1547 change_mask |= EFLG_IF; 1694 change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1557 return rc; 1704 return rc;
1558} 1705}
1559 1706
1560static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1708 struct x86_emulate_ops *ops, int seg)
1561{ 1709{
1562 struct decode_cache *c = &ctxt->decode; 1710 struct decode_cache *c = &ctxt->decode;
1563 struct kvm_segment segment;
1564 1711
1565 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
1566 1713
1567 c->src.val = segment.selector; 1714 emulate_push(ctxt, ops);
1568 emulate_push(ctxt);
1569} 1715}
1570 1716
1571static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1583 return rc; 1729 return rc;
1584} 1730}
1585 1731
1586static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1733 struct x86_emulate_ops *ops)
1587{ 1734{
1588 struct decode_cache *c = &ctxt->decode; 1735 struct decode_cache *c = &ctxt->decode;
1589 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1737 int rc = X86EMUL_CONTINUE;
1590 int reg = VCPU_REGS_RAX; 1738 int reg = VCPU_REGS_RAX;
1591 1739
1592 while (reg <= VCPU_REGS_RDI) { 1740 while (reg <= VCPU_REGS_RDI) {
1593 (reg == VCPU_REGS_RSP) ? 1741 (reg == VCPU_REGS_RSP) ?
1594 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1595 1743
1596 emulate_push(ctxt); 1744 emulate_push(ctxt, ops);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE)
1748 return rc;
1749
1597 ++reg; 1750 ++reg;
1598 } 1751 }
1752
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc;
1599} 1757}
1600 1758
1601static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1759static int emulate_popa(struct x86_emulate_ctxt *ctxt,
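For reference, a small user-space model of the PUSHA ordering that emulate_pusha() walks (VCPU_REGS_RAX through VCPU_REGS_RDI), including the rule that the stack-pointer slot receives its value from before the first push. The register enum and model stack are assumptions for illustration, not the emulator's layout.

#include <stdint.h>
#include <stdio.h>

enum { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NR_REGS };

/* Push the eight GPRs in PUSHA order onto a model stack; the RSP slot
 * gets the stack pointer as it was before the instruction started. */
static void pusha(uint32_t regs[NR_REGS], uint32_t stack[], int *sp)
{
        uint32_t old_esp = regs[RSP];
        int reg;

        for (reg = RAX; reg <= RDI; reg++) {
                uint32_t val = (reg == RSP) ? old_esp : regs[reg];

                regs[RSP] -= 4;        /* like register_address_increment() */
                stack[--(*sp)] = val;  /* like emulate_push() + writeback() */
        }
}

int main(void)
{
        uint32_t regs[NR_REGS] = { 1, 2, 3, 4, 0x1000, 6, 7, 8 };
        uint32_t stack[8];
        int sp = 8, i;

        pusha(regs, stack, &sp);
        for (i = 0; i < 8; i++)
                printf("stack[%d] = %#x\n", i, stack[i]);
        return 0;
}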
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1695 old_eip = c->eip; 1853 old_eip = c->eip;
1696 c->eip = c->src.val; 1854 c->eip = c->src.val;
1697 c->src.val = old_eip; 1855 c->src.val = old_eip;
1698 emulate_push(ctxt); 1856 emulate_push(ctxt, ops);
1699 break; 1857 break;
1700 } 1858 }
1701 case 4: /* jmp abs */ 1859 case 4: /* jmp abs */
1702 c->eip = c->src.val; 1860 c->eip = c->src.val;
1703 break; 1861 break;
1704 case 6: /* push */ 1862 case 6: /* push */
1705 emulate_push(ctxt); 1863 emulate_push(ctxt, ops);
1706 break; 1864 break;
1707 } 1865 }
1708 return X86EMUL_CONTINUE; 1866 return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1748 return rc; 1906 return rc;
1749} 1907}
1750 1908
1751static inline int writeback(struct x86_emulate_ctxt *ctxt,
1752 struct x86_emulate_ops *ops)
1753{
1754 int rc;
1755 struct decode_cache *c = &ctxt->decode;
1756
1757 switch (c->dst.type) {
1758 case OP_REG:
1759 /* The 4-byte case *is* correct:
1760 * in 64-bit mode we zero-extend.
1761 */
1762 switch (c->dst.bytes) {
1763 case 1:
1764 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1765 break;
1766 case 2:
1767 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1768 break;
1769 case 4:
1770 *c->dst.ptr = (u32)c->dst.val;
1771 break; /* 64b: zero-ext */
1772 case 8:
1773 *c->dst.ptr = c->dst.val;
1774 break;
1775 }
1776 break;
1777 case OP_MEM:
1778 if (c->lock_prefix)
1779 rc = ops->cmpxchg_emulated(
1780 (unsigned long)c->dst.ptr,
1781 &c->dst.orig_val,
1782 &c->dst.val,
1783 c->dst.bytes,
1784 ctxt->vcpu);
1785 else
1786 rc = ops->write_emulated(
1787 (unsigned long)c->dst.ptr,
1788 &c->dst.val,
1789 c->dst.bytes,
1790 ctxt->vcpu);
1791 if (rc != X86EMUL_CONTINUE)
1792 return rc;
1793 break;
1794 case OP_NONE:
1795 /* no writeback */
1796 break;
1797 default:
1798 break;
1799 }
1800 return X86EMUL_CONTINUE;
1801}
1802
1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1804{
1805 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1806 /*
1807 * an sti; sti; sequence only disable interrupts for the first
1808 * instruction. So, if the last instruction, be it emulated or
1809 * not, left the system with the INT_STI flag enabled, it
1810 * means that the last instruction is an sti. We should not
1811 * leave the flag on in this case. The same goes for mov ss
1812 */
1813 if (!(int_shadow & mask))
1814 ctxt->interruptibility = mask;
1815}
1816
1817static inline void 1909static inline void
1818setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1910setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1819 struct kvm_segment *cs, struct kvm_segment *ss) 1911 struct x86_emulate_ops *ops, struct desc_struct *cs,
1912 struct desc_struct *ss)
1820{ 1913{
1821 memset(cs, 0, sizeof(struct kvm_segment)); 1914 memset(cs, 0, sizeof(struct desc_struct));
1822 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1915 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
1823 memset(ss, 0, sizeof(struct kvm_segment)); 1916 memset(ss, 0, sizeof(struct desc_struct));
1824 1917
1825 cs->l = 0; /* will be adjusted later */ 1918 cs->l = 0; /* will be adjusted later */
1826 cs->base = 0; /* flat segment */ 1919 set_desc_base(cs, 0); /* flat segment */
1827 cs->g = 1; /* 4kb granularity */ 1920 cs->g = 1; /* 4kb granularity */
1828 cs->limit = 0xffffffff; /* 4GB limit */ 1921 set_desc_limit(cs, 0xfffff); /* 4GB limit */
1829 cs->type = 0x0b; /* Read, Execute, Accessed */ 1922 cs->type = 0x0b; /* Read, Execute, Accessed */
1830 cs->s = 1; 1923 cs->s = 1;
1831 cs->dpl = 0; /* will be adjusted later */ 1924 cs->dpl = 0; /* will be adjusted later */
1832 cs->present = 1; 1925 cs->p = 1;
1833 cs->db = 1; 1926 cs->d = 1;
1834 1927
1835 ss->unusable = 0; 1928 set_desc_base(ss, 0); /* flat segment */
1836 ss->base = 0; /* flat segment */ 1929 set_desc_limit(ss, 0xfffff); /* 4GB limit */
1837 ss->limit = 0xffffffff; /* 4GB limit */
1838 ss->g = 1; /* 4kb granularity */ 1930 ss->g = 1; /* 4kb granularity */
1839 ss->s = 1; 1931 ss->s = 1;
1840 ss->type = 0x03; /* Read/Write, Accessed */ 1932 ss->type = 0x03; /* Read/Write, Accessed */
1841 ss->db = 1; /* 32bit stack segment */ 1933 ss->d = 1; /* 32bit stack segment */
1842 ss->dpl = 0; 1934 ss->dpl = 0;
1843 ss->present = 1; 1935 ss->p = 1;
1844} 1936}
1845 1937
1846static int 1938static int
1847emulate_syscall(struct x86_emulate_ctxt *ctxt) 1939emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1848{ 1940{
1849 struct decode_cache *c = &ctxt->decode; 1941 struct decode_cache *c = &ctxt->decode;
1850 struct kvm_segment cs, ss; 1942 struct desc_struct cs, ss;
1851 u64 msr_data; 1943 u64 msr_data;
1944 u16 cs_sel, ss_sel;
1852 1945
1853 /* syscall is not available in real mode */ 1946 /* syscall is not available in real mode */
1854 if (ctxt->mode == X86EMUL_MODE_REAL || 1947 if (ctxt->mode == X86EMUL_MODE_REAL ||
1855 ctxt->mode == X86EMUL_MODE_VM86) { 1948 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1949 emulate_ud(ctxt);
1857 return X86EMUL_PROPAGATE_FAULT; 1950 return X86EMUL_PROPAGATE_FAULT;
1858 } 1951 }
1859 1952
1860 setup_syscalls_segments(ctxt, &cs, &ss); 1953 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1861 1954
1862 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1955 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1863 msr_data >>= 32; 1956 msr_data >>= 32;
1864 cs.selector = (u16)(msr_data & 0xfffc); 1957 cs_sel = (u16)(msr_data & 0xfffc);
1865 ss.selector = (u16)(msr_data + 8); 1958 ss_sel = (u16)(msr_data + 8);
1866 1959
1867 if (is_long_mode(ctxt->vcpu)) { 1960 if (is_long_mode(ctxt->vcpu)) {
1868 cs.db = 0; 1961 cs.d = 0;
1869 cs.l = 1; 1962 cs.l = 1;
1870 } 1963 }
1871 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 1964 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1872 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1965 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1966 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1967 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1873 1968
1874 c->regs[VCPU_REGS_RCX] = c->eip; 1969 c->regs[VCPU_REGS_RCX] = c->eip;
1875 if (is_long_mode(ctxt->vcpu)) { 1970 if (is_long_mode(ctxt->vcpu)) {
1876#ifdef CONFIG_X86_64 1971#ifdef CONFIG_X86_64
1877 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1972 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1878 1973
1879 kvm_x86_ops->get_msr(ctxt->vcpu, 1974 ops->get_msr(ctxt->vcpu,
1880 ctxt->mode == X86EMUL_MODE_PROT64 ? 1975 ctxt->mode == X86EMUL_MODE_PROT64 ?
1881 MSR_LSTAR : MSR_CSTAR, &msr_data); 1976 MSR_LSTAR : MSR_CSTAR, &msr_data);
1882 c->eip = msr_data; 1977 c->eip = msr_data;
1883 1978
1884 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1979 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1885 ctxt->eflags &= ~(msr_data | EFLG_RF); 1980 ctxt->eflags &= ~(msr_data | EFLG_RF);
1886#endif 1981#endif
1887 } else { 1982 } else {
1888 /* legacy mode */ 1983 /* legacy mode */
1889 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1984 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1890 c->eip = (u32)msr_data; 1985 c->eip = (u32)msr_data;
1891 1986
1892 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1987 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
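A short sketch of how emulate_syscall() derives the kernel code and stack selectors from MSR_STAR: bits 47:32 give CS with the RPL bits cleared, and SS is that value plus 8. The STAR value below is a made-up example, not a required layout.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical MSR_STAR contents: SYSRET base in 63:48, SYSCALL base in 47:32. */
        uint64_t star = 0x0023001000000000ULL;
        uint64_t syscall_base = star >> 32;

        uint16_t cs_sel = (uint16_t)(syscall_base & 0xfffc); /* clear RPL bits */
        uint16_t ss_sel = (uint16_t)(syscall_base + 8);      /* SS follows CS in the GDT */

        printf("CS=%#x SS=%#x\n", cs_sel, ss_sel);           /* CS=0x10 SS=0x18 */
        return 0;
}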
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1896} 1991}
1897 1992
1898static int 1993static int
1899emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1994emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1900{ 1995{
1901 struct decode_cache *c = &ctxt->decode; 1996 struct decode_cache *c = &ctxt->decode;
1902 struct kvm_segment cs, ss; 1997 struct desc_struct cs, ss;
1903 u64 msr_data; 1998 u64 msr_data;
1999 u16 cs_sel, ss_sel;
1904 2000
1905 /* inject #GP if in real mode */ 2001 /* inject #GP if in real mode */
1906 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 if (ctxt->mode == X86EMUL_MODE_REAL) {
1907 kvm_inject_gp(ctxt->vcpu, 0); 2003 emulate_gp(ctxt, 0);
1908 return X86EMUL_PROPAGATE_FAULT; 2004 return X86EMUL_PROPAGATE_FAULT;
1909 } 2005 }
1910 2006
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1912 * Therefore, we inject an #UD. 2008 * Therefore, we inject an #UD.
1913 */ 2009 */
1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2010 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2011 emulate_ud(ctxt);
1916 return X86EMUL_PROPAGATE_FAULT; 2012 return X86EMUL_PROPAGATE_FAULT;
1917 } 2013 }
1918 2014
1919 setup_syscalls_segments(ctxt, &cs, &ss); 2015 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1920 2016
1921 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2017 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1922 switch (ctxt->mode) { 2018 switch (ctxt->mode) {
1923 case X86EMUL_MODE_PROT32: 2019 case X86EMUL_MODE_PROT32:
1924 if ((msr_data & 0xfffc) == 0x0) { 2020 if ((msr_data & 0xfffc) == 0x0) {
1925 kvm_inject_gp(ctxt->vcpu, 0); 2021 emulate_gp(ctxt, 0);
1926 return X86EMUL_PROPAGATE_FAULT; 2022 return X86EMUL_PROPAGATE_FAULT;
1927 } 2023 }
1928 break; 2024 break;
1929 case X86EMUL_MODE_PROT64: 2025 case X86EMUL_MODE_PROT64:
1930 if (msr_data == 0x0) { 2026 if (msr_data == 0x0) {
1931 kvm_inject_gp(ctxt->vcpu, 0); 2027 emulate_gp(ctxt, 0);
1932 return X86EMUL_PROPAGATE_FAULT; 2028 return X86EMUL_PROPAGATE_FAULT;
1933 } 2029 }
1934 break; 2030 break;
1935 } 2031 }
1936 2032
1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2033 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1938 cs.selector = (u16)msr_data; 2034 cs_sel = (u16)msr_data;
1939 cs.selector &= ~SELECTOR_RPL_MASK; 2035 cs_sel &= ~SELECTOR_RPL_MASK;
1940 ss.selector = cs.selector + 8; 2036 ss_sel = cs_sel + 8;
1941 ss.selector &= ~SELECTOR_RPL_MASK; 2037 ss_sel &= ~SELECTOR_RPL_MASK;
1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2038 if (ctxt->mode == X86EMUL_MODE_PROT64
1943 || is_long_mode(ctxt->vcpu)) { 2039 || is_long_mode(ctxt->vcpu)) {
1944 cs.db = 0; 2040 cs.d = 0;
1945 cs.l = 1; 2041 cs.l = 1;
1946 } 2042 }
1947 2043
1948 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2044 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1949 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2045 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2046 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2047 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1950 2048
1951 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2049 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1952 c->eip = msr_data; 2050 c->eip = msr_data;
1953 2051
1954 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2052 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1955 c->regs[VCPU_REGS_RSP] = msr_data; 2053 c->regs[VCPU_REGS_RSP] = msr_data;
1956 2054
1957 return X86EMUL_CONTINUE; 2055 return X86EMUL_CONTINUE;
1958} 2056}
1959 2057
1960static int 2058static int
1961emulate_sysexit(struct x86_emulate_ctxt *ctxt) 2059emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1962{ 2060{
1963 struct decode_cache *c = &ctxt->decode; 2061 struct decode_cache *c = &ctxt->decode;
1964 struct kvm_segment cs, ss; 2062 struct desc_struct cs, ss;
1965 u64 msr_data; 2063 u64 msr_data;
1966 int usermode; 2064 int usermode;
2065 u16 cs_sel, ss_sel;
1967 2066
1968 /* inject #GP if in real mode or Virtual 8086 mode */ 2067 /* inject #GP if in real mode or Virtual 8086 mode */
1969 if (ctxt->mode == X86EMUL_MODE_REAL || 2068 if (ctxt->mode == X86EMUL_MODE_REAL ||
1970 ctxt->mode == X86EMUL_MODE_VM86) { 2069 ctxt->mode == X86EMUL_MODE_VM86) {
1971 kvm_inject_gp(ctxt->vcpu, 0); 2070 emulate_gp(ctxt, 0);
1972 return X86EMUL_PROPAGATE_FAULT; 2071 return X86EMUL_PROPAGATE_FAULT;
1973 } 2072 }
1974 2073
1975 setup_syscalls_segments(ctxt, &cs, &ss); 2074 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1976 2075
1977 if ((c->rex_prefix & 0x8) != 0x0) 2076 if ((c->rex_prefix & 0x8) != 0x0)
1978 usermode = X86EMUL_MODE_PROT64; 2077 usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1981 2080
1982 cs.dpl = 3; 2081 cs.dpl = 3;
1983 ss.dpl = 3; 2082 ss.dpl = 3;
1984 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2083 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1985 switch (usermode) { 2084 switch (usermode) {
1986 case X86EMUL_MODE_PROT32: 2085 case X86EMUL_MODE_PROT32:
1987 cs.selector = (u16)(msr_data + 16); 2086 cs_sel = (u16)(msr_data + 16);
1988 if ((msr_data & 0xfffc) == 0x0) { 2087 if ((msr_data & 0xfffc) == 0x0) {
1989 kvm_inject_gp(ctxt->vcpu, 0); 2088 emulate_gp(ctxt, 0);
1990 return X86EMUL_PROPAGATE_FAULT; 2089 return X86EMUL_PROPAGATE_FAULT;
1991 } 2090 }
1992 ss.selector = (u16)(msr_data + 24); 2091 ss_sel = (u16)(msr_data + 24);
1993 break; 2092 break;
1994 case X86EMUL_MODE_PROT64: 2093 case X86EMUL_MODE_PROT64:
1995 cs.selector = (u16)(msr_data + 32); 2094 cs_sel = (u16)(msr_data + 32);
1996 if (msr_data == 0x0) { 2095 if (msr_data == 0x0) {
1997 kvm_inject_gp(ctxt->vcpu, 0); 2096 emulate_gp(ctxt, 0);
1998 return X86EMUL_PROPAGATE_FAULT; 2097 return X86EMUL_PROPAGATE_FAULT;
1999 } 2098 }
2000 ss.selector = cs.selector + 8; 2099 ss_sel = cs_sel + 8;
2001 cs.db = 0; 2100 cs.d = 0;
2002 cs.l = 1; 2101 cs.l = 1;
2003 break; 2102 break;
2004 } 2103 }
2005 cs.selector |= SELECTOR_RPL_MASK; 2104 cs_sel |= SELECTOR_RPL_MASK;
2006 ss.selector |= SELECTOR_RPL_MASK; 2105 ss_sel |= SELECTOR_RPL_MASK;
2007 2106
2008 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2107 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
2009 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2108 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2109 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2110 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2010 2111
2011 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2112 c->eip = c->regs[VCPU_REGS_RDX];
2012 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2113 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
2013 2114
2014 return X86EMUL_CONTINUE; 2115 return X86EMUL_CONTINUE;
2015} 2116}
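Similarly, a tiny sketch of the emulate_sysexit() selector arithmetic: the return selectors are fixed offsets from IA32_SYSENTER_CS (+16/+24 for a 32-bit exit, +32/+40 for a 64-bit one) with RPL forced to 3. The sample MSR value is arbitrary.

#include <stdint.h>
#include <stdio.h>

static void sysexit_selectors(uint64_t sysenter_cs, int to_64bit,
                              uint16_t *cs_sel, uint16_t *ss_sel)
{
        if (to_64bit) {
                *cs_sel = (uint16_t)(sysenter_cs + 32);
                *ss_sel = (uint16_t)(*cs_sel + 8);
        } else {
                *cs_sel = (uint16_t)(sysenter_cs + 16);
                *ss_sel = (uint16_t)(sysenter_cs + 24);
        }
        *cs_sel |= 3;   /* SELECTOR_RPL_MASK: return to CPL 3 */
        *ss_sel |= 3;
}

int main(void)
{
        uint16_t cs, ss;

        sysexit_selectors(0x10, 0, &cs, &ss);
        printf("32-bit exit: CS=%#x SS=%#x\n", cs, ss); /* 0x23 / 0x2b */

        sysexit_selectors(0x10, 1, &cs, &ss);
        printf("64-bit exit: CS=%#x SS=%#x\n", cs, ss); /* 0x33 / 0x3b */
        return 0;
}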
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2030 struct x86_emulate_ops *ops, 2131 struct x86_emulate_ops *ops,
2031 u16 port, u16 len) 2132 u16 port, u16 len)
2032{ 2133{
2033 struct kvm_segment tr_seg; 2134 struct desc_struct tr_seg;
2034 int r; 2135 int r;
2035 u16 io_bitmap_ptr; 2136 u16 io_bitmap_ptr;
2036 u8 perm, bit_idx = port & 0x7; 2137 u8 perm, bit_idx = port & 0x7;
2037 unsigned mask = (1 << len) - 1; 2138 unsigned mask = (1 << len) - 1;
2038 2139
2039 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2140 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
2040 if (tr_seg.unusable) 2141 if (!tr_seg.p)
2041 return false; 2142 return false;
2042 if (tr_seg.limit < 103) 2143 if (desc_limit_scaled(&tr_seg) < 103)
2043 return false; 2144 return false;
2044 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2145 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
2045 NULL); 2146 ctxt->vcpu, NULL);
2046 if (r != X86EMUL_CONTINUE) 2147 if (r != X86EMUL_CONTINUE)
2047 return false; 2148 return false;
2048 if (io_bitmap_ptr + port/8 > tr_seg.limit) 2149 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2049 return false; 2150 return false;
2050 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2151 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
2051 ctxt->vcpu, NULL); 2152 &perm, 1, ctxt->vcpu, NULL);
2052 if (r != X86EMUL_CONTINUE) 2153 if (r != X86EMUL_CONTINUE)
2053 return false; 2154 return false;
2054 if ((perm >> bit_idx) & mask) 2155 if ((perm >> bit_idx) & mask)
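A user-space sketch of the TSS I/O-permission-bitmap test that emulator_io_port_access_allowed() performs: the byte at offset port/8 inside the bitmap is fetched and the 'len' bits starting at port%8 must all be clear. The bitmap contents below are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Return true if an access of 'len' bytes (1, 2 or 4) to 'port' is allowed:
 * every bit covered by the access must be 0 in the permission bitmap. */
static bool io_permitted(const uint8_t *bitmap, uint16_t port, uint16_t len)
{
        uint8_t bit_idx = port & 0x7;
        unsigned mask = (1u << len) - 1;
        uint8_t perm = bitmap[port / 8];

        return ((perm >> bit_idx) & mask) == 0;
}

int main(void)
{
        uint8_t bitmap[8192] = { 0 };

        bitmap[0x60 / 8] |= 1 << (0x60 & 7);    /* deny port 0x60 */

        printf("port 0x60: %s\n", io_permitted(bitmap, 0x60, 1) ? "ok" : "#GP");
        printf("port 0x61: %s\n", io_permitted(bitmap, 0x61, 1) ? "ok" : "#GP");
        return 0;
}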
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2066 return true; 2167 return true;
2067} 2168}
2068 2169
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2170static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops, 2171 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss) 2172 struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2165 &err); 2255 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2256 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */ 2257 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2258 emulate_pf(ctxt, old_tss_base, err);
2169 return ret; 2259 return ret;
2170 } 2260 }
2171 2261
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2175 &err); 2265 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2266 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */ 2267 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2268 emulate_pf(ctxt, old_tss_base, err);
2179 return ret; 2269 return ret;
2180 } 2270 }
2181 2271
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2183 &err); 2273 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2274 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */ 2275 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2276 emulate_pf(ctxt, new_tss_base, err);
2187 return ret; 2277 return ret;
2188 } 2278 }
2189 2279
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2196 ctxt->vcpu, &err); 2286 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2287 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */ 2288 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2289 emulate_pf(ctxt, new_tss_base, err);
2200 return ret; 2290 return ret;
2201 } 2291 }
2202 } 2292 }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2238 struct decode_cache *c = &ctxt->decode; 2328 struct decode_cache *c = &ctxt->decode;
2239 int ret; 2329 int ret;
2240 2330
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu); 2331 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
2332 emulate_gp(ctxt, 0);
2333 return X86EMUL_PROPAGATE_FAULT;
2334 }
2242 c->eip = tss->eip; 2335 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2; 2336 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax; 2337 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2304 &err); 2397 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2398 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */ 2399 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2400 emulate_pf(ctxt, old_tss_base, err);
2308 return ret; 2401 return ret;
2309 } 2402 }
2310 2403
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2314 &err); 2407 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2408 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */ 2409 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2410 emulate_pf(ctxt, old_tss_base, err);
2318 return ret; 2411 return ret;
2319 } 2412 }
2320 2413
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2322 &err); 2415 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2416 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */ 2417 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2418 emulate_pf(ctxt, new_tss_base, err);
2326 return ret; 2419 return ret;
2327 } 2420 }
2328 2421
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2335 ctxt->vcpu, &err); 2428 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2429 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */ 2430 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2431 emulate_pf(ctxt, new_tss_base, err);
2339 return ret; 2432 return ret;
2340 } 2433 }
2341 } 2434 }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2352 int ret; 2445 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2446 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base = 2447 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2448 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
2356 u32 desc_limit; 2449 u32 desc_limit;
2357 2450
2358 /* FIXME: old_tss_base == ~0 ? */ 2451 /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2369 if (reason != TASK_SWITCH_IRET) { 2462 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2463 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2464 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0); 2465 emulate_gp(ctxt, 0);
2373 return X86EMUL_PROPAGATE_FAULT; 2466 return X86EMUL_PROPAGATE_FAULT;
2374 } 2467 }
2375 } 2468 }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 if (!next_tss_desc.p || 2471 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2472 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) { 2473 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2474 emulate_ts(ctxt, tss_selector & 0xfffc);
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT; 2475 return X86EMUL_PROPAGATE_FAULT;
2384 } 2476 }
2385 2477
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2517 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0; 2518 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code; 2519 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt); 2520 emulate_push(ctxt, ops);
2429 } 2521 }
2430 2522
2431 return ret; 2523 return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2439 struct decode_cache *c = &ctxt->decode; 2531 struct decode_cache *c = &ctxt->decode;
2440 int rc; 2532 int rc;
2441 2533
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip; 2534 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE; 2535 c->dst.type = OP_NONE;
2446 2536
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2537 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code); 2538 has_error_code, error_code);
2449 2539
2450 if (rc == X86EMUL_CONTINUE) { 2540 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops); 2541 rc = writeback(ctxt, ops);
2542 if (rc == X86EMUL_CONTINUE)
2543 ctxt->eip = c->eip;
2454 } 2544 }
2455 2545
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2546 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2474 int rc = X86EMUL_CONTINUE; 2564 int rc = X86EMUL_CONTINUE;
2475 int saved_dst_type = c->dst.type; 2565 int saved_dst_type = c->dst.type;
2476 2566
2477 ctxt->interruptibility = 0; 2567 ctxt->decode.mem_read.pos = 0;
2478
2479 /* Shadow copy of register state. Committed on successful emulation.
2480 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
2481 * modify them.
2482 */
2483
2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2485 2568
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2569 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2570 emulate_ud(ctxt);
2488 goto done; 2571 goto done;
2489 } 2572 }
2490 2573
2491 /* LOCK prefix is allowed only with some instructions */ 2574 /* LOCK prefix is allowed only with some instructions */
2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2575 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2576 emulate_ud(ctxt);
2494 goto done; 2577 goto done;
2495 } 2578 }
2496 2579
2497 /* Privileged instruction can be executed only in CPL=0 */ 2580 /* Privileged instruction can be executed only in CPL=0 */
2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2581 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2499 kvm_inject_gp(ctxt->vcpu, 0); 2582 emulate_gp(ctxt, 0);
2500 goto done; 2583 goto done;
2501 } 2584 }
2502 2585
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2589 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done: 2590 string_done:
2508 ctxt->restart = false; 2591 ctxt->restart = false;
2509 kvm_rip_write(ctxt->vcpu, c->eip); 2592 ctxt->eip = c->eip;
2510 goto done; 2593 goto done;
2511 } 2594 }
2512 /* The second termination condition only applies for REPE 2595 /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2529 } 2612 }
2530 2613
2531 if (c->src.type == OP_MEM) { 2614 if (c->src.type == OP_MEM) {
2532 rc = ops->read_emulated((unsigned long)c->src.ptr, 2615 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
2533 &c->src.val, 2616 c->src.valptr, c->src.bytes);
2534 c->src.bytes,
2535 ctxt->vcpu);
2536 if (rc != X86EMUL_CONTINUE) 2617 if (rc != X86EMUL_CONTINUE)
2537 goto done; 2618 goto done;
2538 c->src.orig_val = c->src.val; 2619 c->src.orig_val = c->src.val;
2539 } 2620 }
2540 2621
2541 if (c->src2.type == OP_MEM) { 2622 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr, 2623 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
2543 &c->src2.val, 2624 &c->src2.val, c->src2.bytes);
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE) 2625 if (rc != X86EMUL_CONTINUE)
2547 goto done; 2626 goto done;
2548 } 2627 }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2553 2632
2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2633 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2555 /* optimisation - avoid slow emulated read if Mov */ 2634 /* optimisation - avoid slow emulated read if Mov */
2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2635 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
2557 c->dst.bytes, ctxt->vcpu); 2636 &c->dst.val, c->dst.bytes);
2558 if (rc != X86EMUL_CONTINUE) 2637 if (rc != X86EMUL_CONTINUE)
2559 goto done; 2638 goto done;
2560 } 2639 }
@@ -2571,7 +2650,7 @@ special_insn:
2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2650 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2572 break; 2651 break;
2573 case 0x06: /* push es */ 2652 case 0x06: /* push es */
2574 emulate_push_sreg(ctxt, VCPU_SREG_ES); 2653 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2575 break; 2654 break;
2576 case 0x07: /* pop es */ 2655 case 0x07: /* pop es */
2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2656 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2662 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2584 break; 2663 break;
2585 case 0x0e: /* push cs */ 2664 case 0x0e: /* push cs */
2586 emulate_push_sreg(ctxt, VCPU_SREG_CS); 2665 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2587 break; 2666 break;
2588 case 0x10 ... 0x15: 2667 case 0x10 ... 0x15:
2589 adc: /* adc */ 2668 adc: /* adc */
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2669 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 break; 2670 break;
2592 case 0x16: /* push ss */ 2671 case 0x16: /* push ss */
2593 emulate_push_sreg(ctxt, VCPU_SREG_SS); 2672 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2594 break; 2673 break;
2595 case 0x17: /* pop ss */ 2674 case 0x17: /* pop ss */
2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2675 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2681 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2603 break; 2682 break;
2604 case 0x1e: /* push ds */ 2683 case 0x1e: /* push ds */
2605 emulate_push_sreg(ctxt, VCPU_SREG_DS); 2684 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2606 break; 2685 break;
2607 case 0x1f: /* pop ds */ 2686 case 0x1f: /* pop ds */
2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2687 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
2632 emulate_1op("dec", c->dst, ctxt->eflags); 2711 emulate_1op("dec", c->dst, ctxt->eflags);
2633 break; 2712 break;
2634 case 0x50 ... 0x57: /* push reg */ 2713 case 0x50 ... 0x57: /* push reg */
2635 emulate_push(ctxt); 2714 emulate_push(ctxt, ops);
2636 break; 2715 break;
2637 case 0x58 ... 0x5f: /* pop reg */ 2716 case 0x58 ... 0x5f: /* pop reg */
2638 pop_instruction: 2717 pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
2641 goto done; 2720 goto done;
2642 break; 2721 break;
2643 case 0x60: /* pusha */ 2722 case 0x60: /* pusha */
2644 emulate_pusha(ctxt); 2723 rc = emulate_pusha(ctxt, ops);
2724 if (rc != X86EMUL_CONTINUE)
2725 goto done;
2645 break; 2726 break;
2646 case 0x61: /* popa */ 2727 case 0x61: /* popa */
2647 rc = emulate_popa(ctxt, ops); 2728 rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
2655 break; 2736 break;
2656 case 0x68: /* push imm */ 2737 case 0x68: /* push imm */
2657 case 0x6a: /* push imm8 */ 2738 case 0x6a: /* push imm8 */
2658 emulate_push(ctxt); 2739 emulate_push(ctxt, ops);
2659 break; 2740 break;
2660 case 0x6c: /* insb */ 2741 case 0x6c: /* insb */
2661 case 0x6d: /* insw/insd */ 2742 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u); 2743 c->dst.bytes = min(c->dst.bytes, 4u);
2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2744 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2664 c->dst.bytes)) { 2745 c->dst.bytes)) {
2665 kvm_inject_gp(ctxt->vcpu, 0); 2746 emulate_gp(ctxt, 0);
2666 goto done; 2747 goto done;
2667 } 2748 }
2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, 2749 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
2674 c->src.bytes = min(c->src.bytes, 4u); 2755 c->src.bytes = min(c->src.bytes, 4u);
2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2756 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2676 c->src.bytes)) { 2757 c->src.bytes)) {
2677 kvm_inject_gp(ctxt->vcpu, 0); 2758 emulate_gp(ctxt, 0);
2678 goto done; 2759 goto done;
2679 } 2760 }
2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], 2761 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
2707 } 2788 }
2708 break; 2789 break;
2709 case 0x84 ... 0x85: 2790 case 0x84 ... 0x85:
2791 test:
2710 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
2711 break; 2793 break;
2712 case 0x86 ... 0x87: /* xchg */ 2794 case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
2735 break; 2817 break;
2736 case 0x88 ... 0x8b: /* mov */ 2818 case 0x88 ... 0x8b: /* mov */
2737 goto mov; 2819 goto mov;
2738 case 0x8c: { /* mov r/m, sreg */ 2820 case 0x8c: /* mov r/m, sreg */
2739 struct kvm_segment segreg; 2821 if (c->modrm_reg > VCPU_SREG_GS) {
2740 2822 emulate_ud(ctxt);
2741 if (c->modrm_reg <= VCPU_SREG_GS)
2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2743 else {
2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2745 goto done; 2823 goto done;
2746 } 2824 }
2747 c->dst.val = segreg.selector; 2825 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2748 break; 2826 break;
2749 }
2750 case 0x8d: /* lea r16/r32, m */ 2827 case 0x8d: /* lea r16/r32, m */
2751 c->dst.val = c->modrm_ea; 2828 c->dst.val = c->modrm_ea;
2752 break; 2829 break;
@@ -2757,12 +2834,12 @@ special_insn:
2757 2834
2758 if (c->modrm_reg == VCPU_SREG_CS || 2835 if (c->modrm_reg == VCPU_SREG_CS ||
2759 c->modrm_reg > VCPU_SREG_GS) { 2836 c->modrm_reg > VCPU_SREG_GS) {
2760 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2837 emulate_ud(ctxt);
2761 goto done; 2838 goto done;
2762 } 2839 }
2763 2840
2764 if (c->modrm_reg == VCPU_SREG_SS) 2841 if (c->modrm_reg == VCPU_SREG_SS)
2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2842 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2766 2843
2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2844 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2768 2845
@@ -2775,19 +2852,19 @@ special_insn:
2775 goto done; 2852 goto done;
2776 break; 2853 break;
2777 case 0x90: /* nop / xchg r8,rax */ 2854 case 0x90: /* nop / xchg r8,rax */
2778 if (!(c->rex_prefix & 1)) { /* nop */ 2855 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
2779 c->dst.type = OP_NONE; 2856 c->dst.type = OP_NONE; /* nop */
2780 break; 2857 break;
2781 } 2858 }
2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2859 case 0x91 ... 0x97: /* xchg reg,rax */
2783 c->src.type = c->dst.type = OP_REG; 2860 c->src.type = OP_REG;
2784 c->src.bytes = c->dst.bytes = c->op_bytes; 2861 c->src.bytes = c->op_bytes;
2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2862 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2786 c->src.val = *(c->src.ptr); 2863 c->src.val = *(c->src.ptr);
2787 goto xchg; 2864 goto xchg;
2788 case 0x9c: /* pushf */ 2865 case 0x9c: /* pushf */
2789 c->src.val = (unsigned long) ctxt->eflags; 2866 c->src.val = (unsigned long) ctxt->eflags;
2790 emulate_push(ctxt); 2867 emulate_push(ctxt, ops);
2791 break; 2868 break;
2792 case 0x9d: /* popf */ 2869 case 0x9d: /* popf */
2793 c->dst.type = OP_REG; 2870 c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
2797 if (rc != X86EMUL_CONTINUE) 2874 if (rc != X86EMUL_CONTINUE)
2798 goto done; 2875 goto done;
2799 break; 2876 break;
2800 case 0xa0 ... 0xa1: /* mov */ 2877 case 0xa0 ... 0xa3: /* mov */
2801 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2802 c->dst.val = c->src.val;
2803 break;
2804 case 0xa2 ... 0xa3: /* mov */
2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2806 break;
2807 case 0xa4 ... 0xa5: /* movs */ 2878 case 0xa4 ... 0xa5: /* movs */
2808 goto mov; 2879 goto mov;
2809 case 0xa6 ... 0xa7: /* cmps */ 2880 case 0xa6 ... 0xa7: /* cmps */
2810 c->dst.type = OP_NONE; /* Disable writeback. */ 2881 c->dst.type = OP_NONE; /* Disable writeback. */
2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2882 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2812 goto cmp; 2883 goto cmp;
2884 case 0xa8 ... 0xa9: /* test ax, imm */
2885 goto test;
2813 case 0xaa ... 0xab: /* stos */ 2886 case 0xaa ... 0xab: /* stos */
2814 c->dst.val = c->regs[VCPU_REGS_RAX]; 2887 c->dst.val = c->regs[VCPU_REGS_RAX];
2815 break; 2888 break;
@@ -2855,19 +2928,23 @@ special_insn:
2855 long int rel = c->src.val; 2928 long int rel = c->src.val;
2856 c->src.val = (unsigned long) c->eip; 2929 c->src.val = (unsigned long) c->eip;
2857 jmp_rel(c, rel); 2930 jmp_rel(c, rel);
2858 emulate_push(ctxt); 2931 emulate_push(ctxt, ops);
2859 break; 2932 break;
2860 } 2933 }
2861 case 0xe9: /* jmp rel */ 2934 case 0xe9: /* jmp rel */
2862 goto jmp; 2935 goto jmp;
2863 case 0xea: /* jmp far */ 2936 case 0xea: { /* jmp far */
2937 unsigned short sel;
2864 jump_far: 2938 jump_far:
2865 if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2866 VCPU_SREG_CS)) 2940
2941 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2867 goto done; 2942 goto done;
2868 2943
2869 c->eip = c->src.val; 2944 c->eip = 0;
2945 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2870 break; 2946 break;
2947 }
2871 case 0xeb: 2948 case 0xeb:
2872 jmp: /* jmp rel short */ 2949 jmp: /* jmp rel short */
2873 jmp_rel(c, c->src.val); 2950 jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
2879 do_io_in: 2956 do_io_in:
2880 c->dst.bytes = min(c->dst.bytes, 4u); 2957 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2958 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0); 2959 emulate_gp(ctxt, 0);
2883 goto done; 2960 goto done;
2884 } 2961 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2962 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val)) 2963 &c->dst.val))
2887 goto done; /* IO is needed */ 2964 goto done; /* IO is needed */
2888 break; 2965 break;
2889 case 0xee: /* out al,dx */ 2966 case 0xee: /* out dx,al */
2890 case 0xef: /* out (e/r)ax,dx */ 2967 case 0xef: /* out dx,(e/r)ax */
2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2968 c->src.val = c->regs[VCPU_REGS_RDX];
2892 do_io_out: 2969 do_io_out:
2893 c->dst.bytes = min(c->dst.bytes, 4u); 2970 c->dst.bytes = min(c->dst.bytes, 4u);
2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2971 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2895 kvm_inject_gp(ctxt->vcpu, 0); 2972 emulate_gp(ctxt, 0);
2896 goto done; 2973 goto done;
2897 } 2974 }
2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 2975 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2993 c->dst.type = OP_NONE; /* Disable writeback. */
2917 break; 2994 break;
2918 case 0xfa: /* cli */ 2995 case 0xfa: /* cli */
2919 if (emulator_bad_iopl(ctxt, ops)) 2996 if (emulator_bad_iopl(ctxt, ops)) {
2920 kvm_inject_gp(ctxt->vcpu, 0); 2997 emulate_gp(ctxt, 0);
2921 else { 2998 goto done;
2999 } else {
2922 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 ctxt->eflags &= ~X86_EFLAGS_IF;
2923 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 c->dst.type = OP_NONE; /* Disable writeback. */
2924 } 3002 }
2925 break; 3003 break;
2926 case 0xfb: /* sti */ 3004 case 0xfb: /* sti */
2927 if (emulator_bad_iopl(ctxt, ops)) 3005 if (emulator_bad_iopl(ctxt, ops)) {
2928 kvm_inject_gp(ctxt->vcpu, 0); 3006 emulate_gp(ctxt, 0);
2929 else { 3007 goto done;
2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 3008 } else {
3009 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2931 ctxt->eflags |= X86_EFLAGS_IF; 3010 ctxt->eflags |= X86_EFLAGS_IF;
2932 c->dst.type = OP_NONE; /* Disable writeback. */ 3011 c->dst.type = OP_NONE; /* Disable writeback. */
2933 } 3012 }
@@ -2964,11 +3043,12 @@ writeback:
2964 c->dst.type = saved_dst_type; 3043 c->dst.type = saved_dst_type;
2965 3044
2966 if ((c->d & SrcMask) == SrcSI) 3045 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3046 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
2968 &c->src); 3047 VCPU_REGS_RSI, &c->src);
2969 3048
2970 if ((c->d & DstMask) == DstDI) 3049 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 3050 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
3051 &c->dst);
2972 3052
2973 if (c->rep_prefix && (c->d & String)) { 3053 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read; 3054 struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
2981 (rc->end != 0 && rc->end == rc->pos)) 3061 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false; 3062 ctxt->restart = false;
2983 } 3063 }
2984 3064 /*
2985	/* Commit shadow register state. */	3065	 * reset read cache here in case string instruction is restarted
2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3066 * without decoding
2987 kvm_rip_write(ctxt->vcpu, c->eip); 3067 */
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags); 3068 ctxt->decode.mem_read.end = 0;
3069 ctxt->eip = c->eip;
2989 3070
2990done: 3071done:
2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3072 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
3051 c->dst.type = OP_NONE; 3132 c->dst.type = OP_NONE;
3052 break; 3133 break;
3053 case 5: /* not defined */ 3134 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3135 emulate_ud(ctxt);
3055 goto done; 3136 goto done;
3056 case 7: /* invlpg*/ 3137 case 7: /* invlpg*/
3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3138 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
3063 } 3144 }
3064 break; 3145 break;
3065 case 0x05: /* syscall */ 3146 case 0x05: /* syscall */
3066 rc = emulate_syscall(ctxt); 3147 rc = emulate_syscall(ctxt, ops);
3067 if (rc != X86EMUL_CONTINUE) 3148 if (rc != X86EMUL_CONTINUE)
3068 goto done; 3149 goto done;
3069 else 3150 else
@@ -3073,8 +3154,11 @@ twobyte_insn:
3073 emulate_clts(ctxt->vcpu); 3154 emulate_clts(ctxt->vcpu);
3074 c->dst.type = OP_NONE; 3155 c->dst.type = OP_NONE;
3075 break; 3156 break;
3076 case 0x08: /* invd */
3077 case 0x09: /* wbinvd */ 3157 case 0x09: /* wbinvd */
3158 kvm_emulate_wbinvd(ctxt->vcpu);
3159 c->dst.type = OP_NONE;
3160 break;
3161 case 0x08: /* invd */
3078 case 0x0d: /* GrpP (prefetch) */ 3162 case 0x0d: /* GrpP (prefetch) */
3079 case 0x18: /* Grp16 (prefetch/nop) */ 3163 case 0x18: /* Grp16 (prefetch/nop) */
3080 c->dst.type = OP_NONE; 3164 c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
3084 case 1: 3168 case 1:
3085 case 5 ... 7: 3169 case 5 ... 7:
3086 case 9 ... 15: 3170 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3171 emulate_ud(ctxt);
3088 goto done; 3172 goto done;
3089 } 3173 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3174 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
3093 case 0x21: /* mov from dr to reg */ 3177 case 0x21: /* mov from dr to reg */
3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3178 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3179 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3180 emulate_ud(ctxt);
3097 goto done; 3181 goto done;
3098 } 3182 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3183 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3100 c->dst.type = OP_NONE; /* no writeback */ 3184 c->dst.type = OP_NONE; /* no writeback */
3101 break; 3185 break;
3102 case 0x22: /* mov reg, cr */ 3186 case 0x22: /* mov reg, cr */
3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3187 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
3188 emulate_gp(ctxt, 0);
3189 goto done;
3190 }
3104 c->dst.type = OP_NONE; 3191 c->dst.type = OP_NONE;
3105 break; 3192 break;
3106 case 0x23: /* mov from reg to dr */ 3193 case 0x23: /* mov from reg to dr */
3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3194 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3195 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3196 emulate_ud(ctxt);
3197 goto done;
3198 }
3199
3200 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3201 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3202 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3203 /* #UD condition is already handled by the code above */
3204 emulate_gp(ctxt, 0);
3110 goto done; 3205 goto done;
3111 } 3206 }
3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3207
3113 c->dst.type = OP_NONE; /* no writeback */ 3208 c->dst.type = OP_NONE; /* no writeback */
3114 break; 3209 break;
3115 case 0x30: 3210 case 0x30:
3116 /* wrmsr */ 3211 /* wrmsr */
3117 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3212 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3213 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3214 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3120 kvm_inject_gp(ctxt->vcpu, 0); 3215 emulate_gp(ctxt, 0);
3121 goto done; 3216 goto done;
3122 } 3217 }
3123 rc = X86EMUL_CONTINUE; 3218 rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
3125 break; 3220 break;
3126 case 0x32: 3221 case 0x32:
3127 /* rdmsr */ 3222 /* rdmsr */
3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3223 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3129 kvm_inject_gp(ctxt->vcpu, 0); 3224 emulate_gp(ctxt, 0);
3130 goto done; 3225 goto done;
3131 } else { 3226 } else {
3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3227 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
3136 c->dst.type = OP_NONE; 3231 c->dst.type = OP_NONE;
3137 break; 3232 break;
3138 case 0x34: /* sysenter */ 3233 case 0x34: /* sysenter */
3139 rc = emulate_sysenter(ctxt); 3234 rc = emulate_sysenter(ctxt, ops);
3140 if (rc != X86EMUL_CONTINUE) 3235 if (rc != X86EMUL_CONTINUE)
3141 goto done; 3236 goto done;
3142 else 3237 else
3143 goto writeback; 3238 goto writeback;
3144 break; 3239 break;
3145 case 0x35: /* sysexit */ 3240 case 0x35: /* sysexit */
3146 rc = emulate_sysexit(ctxt); 3241 rc = emulate_sysexit(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE) 3242 if (rc != X86EMUL_CONTINUE)
3148 goto done; 3243 goto done;
3149 else 3244 else
@@ -3160,7 +3255,7 @@ twobyte_insn:
3160 c->dst.type = OP_NONE; 3255 c->dst.type = OP_NONE;
3161 break; 3256 break;
3162 case 0xa0: /* push fs */ 3257 case 0xa0: /* push fs */
3163 emulate_push_sreg(ctxt, VCPU_SREG_FS); 3258 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3164 break; 3259 break;
3165 case 0xa1: /* pop fs */ 3260 case 0xa1: /* pop fs */
3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3261 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3274 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3180 break; 3275 break;
3181 case 0xa8: /* push gs */ 3276 case 0xa8: /* push gs */
3182 emulate_push_sreg(ctxt, VCPU_SREG_GS); 3277 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3183 break; 3278 break;
3184 case 0xa9: /* pop gs */ 3279 case 0xa9: /* pop gs */
3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3280 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
		8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
33 34
34#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
36 38
37#include "irq.h" 39#include "irq.h"
38#include "i8254.h" 40#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
243{ 245{
244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
245 irq_ack_notifier); 247 irq_ack_notifier);
246 raw_spin_lock(&ps->inject_lock); 248 int value;
247 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 249
250 spin_lock(&ps->inject_lock);
251 value = atomic_dec_return(&ps->pit_timer.pending);
252 if (value < 0)
253 /* spurious acks can be generated if, for example, the
254 * PIC is being reset. Handle it gracefully here
255 */
248 atomic_inc(&ps->pit_timer.pending); 256 atomic_inc(&ps->pit_timer.pending);
257 else if (value > 0)
258 /* in this case, we had multiple outstanding pit interrupts
259 * that we needed to inject. Reinject
260 */
261 queue_work(ps->pit->wq, &ps->pit->expired);
249 ps->irq_ack = 1; 262 ps->irq_ack = 1;
250 raw_spin_unlock(&ps->inject_lock); 263 spin_unlock(&ps->inject_lock);
251} 264}
252 265
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 266void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 276 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 277}
265 278
266static void destroy_pit_timer(struct kvm_timer *pt) 279static void destroy_pit_timer(struct kvm_pit *pit)
267{ 280{
268 pr_debug("execute del timer!\n"); 281 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
269 hrtimer_cancel(&pt->timer); 282 cancel_work_sync(&pit->expired);
270} 283}
271 284
272static bool kpit_is_periodic(struct kvm_timer *ktimer) 285static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
280 .is_periodic = kpit_is_periodic, 293 .is_periodic = kpit_is_periodic,
281}; 294};
282 295
296static void pit_do_work(struct work_struct *work)
297{
298 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
299 struct kvm *kvm = pit->kvm;
300 struct kvm_vcpu *vcpu;
301 int i;
302 struct kvm_kpit_state *ps = &pit->pit_state;
303 int inject = 0;
304
305 /* Try to inject pending interrupts when
306 * last one has been acked.
307 */
308 spin_lock(&ps->inject_lock);
309 if (ps->irq_ack) {
310 ps->irq_ack = 0;
311 inject = 1;
312 }
313 spin_unlock(&ps->inject_lock);
314 if (inject) {
315 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
316 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
317
318 /*
319 * Provides NMI watchdog support via Virtual Wire mode.
320 * The route is: PIT -> PIC -> LVT0 in NMI mode.
321 *
322 * Note: Our Virtual Wire implementation is simplified, only
323 * propagating PIT interrupts to all VCPUs when they have set
324 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
325 * VCPU0, and only if its LVT0 is in EXTINT mode.
326 */
327 if (kvm->arch.vapics_in_nmi_mode > 0)
328 kvm_for_each_vcpu(i, vcpu, kvm)
329 kvm_apic_nmi_wd_deliver(vcpu);
330 }
331}
332
333static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
334{
335 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
336 struct kvm_pit *pt = ktimer->kvm->arch.vpit;
337
338 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
339 atomic_inc(&ktimer->pending);
340 queue_work(pt->wq, &pt->expired);
341 }
342
343 if (ktimer->t_ops->is_periodic(ktimer)) {
344 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
345 return HRTIMER_RESTART;
346 } else
347 return HRTIMER_NORESTART;
348}
349
283static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 350static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284{ 351{
285 struct kvm_timer *pt = &ps->pit_timer; 352 struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
291 358
292 /* TODO The new value only affected after the retriggered */ 359 /* TODO The new value only affected after the retriggered */
293 hrtimer_cancel(&pt->timer); 360 hrtimer_cancel(&pt->timer);
361 cancel_work_sync(&ps->pit->expired);
294 pt->period = interval; 362 pt->period = interval;
295 ps->is_periodic = is_period; 363 ps->is_periodic = is_period;
296 364
297 pt->timer.function = kvm_timer_fn; 365 pt->timer.function = pit_timer_fn;
298 pt->t_ops = &kpit_ops; 366 pt->t_ops = &kpit_ops;
299 pt->kvm = ps->pit->kvm; 367 pt->kvm = ps->pit->kvm;
300 pt->vcpu = pt->kvm->bsp_vcpu;
301 368
302 atomic_set(&pt->pending, 0); 369 atomic_set(&pt->pending, 0);
303 ps->irq_ack = 1; 370 ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
346 } 413 }
347 break; 414 break;
348 default: 415 default:
349 destroy_pit_timer(&ps->pit_timer); 416 destroy_pit_timer(kvm->arch.vpit);
350 } 417 }
351} 418}
352 419
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
625 692
626 mutex_init(&pit->pit_state.lock); 693 mutex_init(&pit->pit_state.lock);
627 mutex_lock(&pit->pit_state.lock); 694 mutex_lock(&pit->pit_state.lock);
628 raw_spin_lock_init(&pit->pit_state.inject_lock); 695 spin_lock_init(&pit->pit_state.inject_lock);
696
697 pit->wq = create_singlethread_workqueue("kvm-pit-wq");
698 if (!pit->wq) {
699 mutex_unlock(&pit->pit_state.lock);
700 kfree(pit);
701 return NULL;
702 }
703 INIT_WORK(&pit->expired, pit_do_work);
629 704
630 kvm->arch.vpit = pit; 705 kvm->arch.vpit = pit;
631 pit->kvm = kvm; 706 pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
677 struct hrtimer *timer; 752 struct hrtimer *timer;
678 753
679 if (kvm->arch.vpit) { 754 if (kvm->arch.vpit) {
755 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
756 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
757 &kvm->arch.vpit->speaker_dev);
680 kvm_unregister_irq_mask_notifier(kvm, 0, 758 kvm_unregister_irq_mask_notifier(kvm, 0,
681 &kvm->arch.vpit->mask_notifier); 759 &kvm->arch.vpit->mask_notifier);
682 kvm_unregister_irq_ack_notifier(kvm, 760 kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 762 mutex_lock(&kvm->arch.vpit->pit_state.lock);
685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 763 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
686 hrtimer_cancel(timer); 764 hrtimer_cancel(timer);
765 cancel_work_sync(&kvm->arch.vpit->expired);
687 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
688 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 767 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
768 destroy_workqueue(kvm->arch.vpit->wq);
689 kfree(kvm->arch.vpit); 769 kfree(kvm->arch.vpit);
690 } 770 }
691} 771}
692
693static void __inject_pit_timer_intr(struct kvm *kvm)
694{
695 struct kvm_vcpu *vcpu;
696 int i;
697
698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
700
701 /*
702 * Provides NMI watchdog support via Virtual Wire mode.
703 * The route is: PIT -> PIC -> LVT0 in NMI mode.
704 *
705 * Note: Our Virtual Wire implementation is simplified, only
706 * propagating PIT interrupts to all VCPUs when they have set
707 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
708 * VCPU0, and only if its LVT0 is in EXTINT mode.
709 */
710 if (kvm->arch.vapics_in_nmi_mode > 0)
711 kvm_for_each_vcpu(i, vcpu, kvm)
712 kvm_apic_nmi_wd_deliver(vcpu);
713}
714
715void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
716{
717 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
718 struct kvm *kvm = vcpu->kvm;
719 struct kvm_kpit_state *ps;
720
721 if (pit) {
722 int inject = 0;
723 ps = &pit->pit_state;
724
725 /* Try to inject pending interrupts when
726 * last one has been acked.
727 */
728 raw_spin_lock(&ps->inject_lock);
729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
730 ps->irq_ack = 0;
731 inject = 1;
732 }
733 raw_spin_unlock(&ps->inject_lock);
734 if (inject)
735 __inject_pit_timer_intr(kvm);
736 }
737}
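The i8254.c hunks above move PIT interrupt injection off the vcpu entry path: the hrtimer callback now only bumps the pending count and queues work on the PIT's single-threaded workqueue, and the work handler performs the actual kvm_set_irq() calls in process context. The stand-alone sketch below is not part of the patch; all demo_* names, the "demo-wq" workqueue name, and the 100 ms period are invented for illustration. It shows the same hrtimer-plus-workqueue deferral pattern in a minimal kernel module, using the era's hrtimer/workqueue API as the diff itself does.

/*
 * Illustrative only: hrtimer callback defers the heavy work to a
 * workqueue, mirroring the pit_timer_fn()/pit_do_work() split above.
 * All demo_* identifiers are hypothetical.
 */
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/workqueue.h>
#include <linux/ktime.h>

static struct workqueue_struct *demo_wq;	/* plays the role of pit->wq */
static struct work_struct demo_expired;		/* plays the role of pit->expired */
static struct hrtimer demo_timer;

/* Runs in process context: free to sleep, take mutexes, raise interrupts. */
static void demo_do_work(struct work_struct *work)
{
	pr_info("timer expired, heavy lifting done in the workqueue\n");
}

/* Runs in hard-irq context: only queue the work and rearm the timer. */
static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
	queue_work(demo_wq, &demo_expired);
	hrtimer_forward_now(timer, ktime_set(0, 100 * NSEC_PER_MSEC));
	return HRTIMER_RESTART;
}

static int __init demo_init(void)
{
	demo_wq = create_singlethread_workqueue("demo-wq");
	if (!demo_wq)
		return -ENOMEM;
	INIT_WORK(&demo_expired, demo_do_work);
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Same teardown order the patch uses in kvm_free_pit(). */
	hrtimer_cancel(&demo_timer);
	cancel_work_sync(&demo_expired);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");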
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 raw_spinlock_t inject_lock; 30 spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
@@ -40,6 +40,8 @@ struct kvm_pit {
40 struct kvm_kpit_state pit_state; 40 struct kvm_kpit_state pit_state;
41 int irq_source_id; 41 int irq_source_id;
42 struct kvm_irq_mask_notifier mask_notifier; 42 struct kvm_irq_mask_notifier mask_notifier;
43 struct workqueue_struct *wq;
44 struct work_struct expired;
43}; 45};
44 46
45#define KVM_PIT_BASE_ADDRESS 0x40 47#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
   6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
6 * 7 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
33#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
34#include "trace.h" 35#include "trace.h"
35 36
37static void pic_irq_request(struct kvm *kvm, int level);
38
36static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock) 40 __acquires(&s->lock)
38{ 41{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock) 46 __releases(&s->lock)
44{ 47{
45 bool wakeup = s->wakeup_needed; 48 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu; 49 struct kvm_vcpu *vcpu, *found = NULL;
50 int i;
47 51
48 s->wakeup_needed = false; 52 s->wakeup_needed = false;
49 53
50 raw_spin_unlock(&s->lock); 54 raw_spin_unlock(&s->lock);
51 55
52 if (wakeup) { 56 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu; 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
54 if (vcpu) 58 if (kvm_apic_accept_pic_intr(vcpu)) {
55 kvm_vcpu_kick(vcpu); 59 found = vcpu;
60 break;
61 }
62 }
63
64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 kvm_vcpu_kick(found);
56 } 68 }
57} 69}
58 70
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
173 pic_set_irq1(&s->pics[0], 2, 0); 185 pic_set_irq1(&s->pics[0], 2, 0);
174 } 186 }
175 irq = pic_get_irq(&s->pics[0]); 187 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) 188 pic_irq_request(s->kvm, irq >= 0);
177 s->irq_request(s->irq_request_opaque, 1);
178 else
179 s->irq_request(s->irq_request_opaque, 0);
180} 189}
181 190
182void kvm_pic_update_irq(struct kvm_pic *s) 191void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
261void kvm_pic_reset(struct kvm_kpic_state *s) 270void kvm_pic_reset(struct kvm_kpic_state *s)
262{ 271{
263 int irq; 272 int irq;
264 struct kvm *kvm = s->pics_state->irq_request_opaque; 273 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
265 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
266 u8 irr = s->irr, isr = s->imr; 274 u8 irr = s->irr, isr = s->imr;
267 275
268 s->last_irr = 0; 276 s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
301 /* 309 /*
302 * deassert a pending interrupt 310 * deassert a pending interrupt
303 */ 311 */
304 s->pics_state->irq_request(s->pics_state-> 312 pic_irq_request(s->pics_state->kvm, 0);
305 irq_request_opaque, 0);
306 s->init_state = 1; 313 s->init_state = 1;
307 s->init4 = val & 1; 314 s->init4 = val & 1;
308 if (val & 0x02) 315 if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
356 } 363 }
357 } else 364 } else
358 switch (s->init_state) { 365 switch (s->init_state) {
359 case 0: /* normal mode */ 366 case 0: { /* normal mode */
367 u8 imr_diff = s->imr ^ val,
368 off = (s == &s->pics_state->pics[0]) ? 0 : 8;
360 s->imr = val; 369 s->imr = val;
370 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
371 if (imr_diff & (1 << irq))
372 kvm_fire_mask_notifiers(
373 s->pics_state->kvm,
374 SELECT_PIC(irq + off),
375 irq + off,
376 !!(s->imr & (1 << irq)));
361 pic_update_irq(s->pics_state); 377 pic_update_irq(s->pics_state);
362 break; 378 break;
379 }
363 case 1: 380 case 1:
364 s->irq_base = val & 0xf8; 381 s->irq_base = val & 0xf8;
365 s->init_state = 2; 382 s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
518/* 535/*
519 * callback when PIC0 irq status changed 536 * callback when PIC0 irq status changed
520 */ 537 */
521static void pic_irq_request(void *opaque, int level) 538static void pic_irq_request(struct kvm *kvm, int level)
522{ 539{
523 struct kvm *kvm = opaque;
524 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 540 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
525 struct kvm_pic *s = pic_irqchip(kvm); 541 struct kvm_pic *s = pic_irqchip(kvm);
526 int irq = pic_get_irq(&s->pics[0]); 542 int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
549 s->kvm = kvm; 565 s->kvm = kvm;
550 s->pics[0].elcr_mask = 0xf8; 566 s->pics[0].elcr_mask = 0xf8;
551 s->pics[1].elcr_mask = 0xde; 567 s->pics[1].elcr_mask = 0xde;
552 s->irq_request = pic_irq_request;
553 s->irq_request_opaque = kvm;
554 s->pics[0].pics_state = s; 568 s->pics[0].pics_state = s;
555 s->pics[1].pics_state = s; 569 s->pics[1].pics_state = s;
556 570
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
   4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
89void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 90void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
90{ 91{
91 kvm_inject_apic_timer_irqs(vcpu); 92 kvm_inject_apic_timer_irqs(vcpu);
92 kvm_inject_pit_timer_irqs(vcpu);
93 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
94} 94}
95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
38struct kvm; 38struct kvm;
39struct kvm_vcpu; 39struct kvm_vcpu;
40 40
41typedef void irq_request_func(void *opaque, int level);
42
43struct kvm_kpic_state { 41struct kvm_kpic_state {
44 u8 last_irr; /* edge detection */ 42 u8 last_irr; /* edge detection */
45 u8 irr; /* interrupt request register */ 43 u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
67 unsigned pending_acks; 65 unsigned pending_acks;
68 struct kvm *kvm; 66 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
70 irq_request_func *irq_request;
71 void *irq_request_opaque;
72 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
73 struct kvm_io_device dev; 69 struct kvm_io_device dev;
74 void (*ack_notifier)(void *opaque, int irq); 70 void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
36 36
37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38{ 38{
39 might_sleep(); /* on svm */
40
39 if (!test_bit(VCPU_EXREG_PDPTR, 41 if (!test_bit(VCPU_EXREG_PDPTR,
40 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
41 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
69 return kvm_read_cr4_bits(vcpu, ~0UL); 71 return kvm_read_cr4_bits(vcpu, ~0UL);
70} 72}
71 73
74static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
75{
76 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78}
79
72#endif 80#endif
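The new kvm_read_edx_eax() helper added above packs the low 32 bits of RAX and RDX into a single 64-bit value, the usual EDX:EAX convention for RDMSR/RDTSC results. A tiny user-space illustration follows; it is not from the patch, and the example value is made up, but it shows the same split-and-rebuild arithmetic.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t msr = 0x123456789abcdef0ULL;		/* arbitrary example value */
	uint32_t eax = (uint32_t)(msr & 0xffffffffu);	/* low half  */
	uint32_t edx = (uint32_t)(msr >> 32);		/* high half */

	/* Same combination kvm_read_edx_eax() performs on the guest registers. */
	uint64_t rebuilt = (uint64_t)eax | ((uint64_t)edx << 32);

	printf("round-trips: %s\n", rebuilt == msr ? "yes" : "no");
	return 0;
}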
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
   8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
328 "dest_mode 0x%x, short_hand 0x%x\n", 329 "dest_mode 0x%x, short_hand 0x%x\n",
329 target, source, dest, dest_mode, short_hand); 330 target, source, dest, dest_mode, short_hand);
330 331
331 ASSERT(!target); 332 ASSERT(target);
332 switch (short_hand) { 333 switch (short_hand) {
333 case APIC_DEST_NOSHORT: 334 case APIC_DEST_NOSHORT:
334 if (dest_mode == 0) 335 if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
533 struct kvm_vcpu *vcpu = apic->vcpu; 534 struct kvm_vcpu *vcpu = apic->vcpu;
534 struct kvm_run *run = vcpu->run; 535 struct kvm_run *run = vcpu->run;
535 536
536 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 537 kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
537 run->tpr_access.rip = kvm_rip_read(vcpu); 538 run->tpr_access.rip = kvm_rip_read(vcpu);
538 run->tpr_access.is_write = write; 539 run->tpr_access.is_write = write;
539} 540}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1107 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1107 int r = 0; 1108 int r = 0;
1108 1109
1109 if (kvm_vcpu_is_bsp(vcpu)) { 1110 if (!apic_hw_enabled(vcpu->arch.apic))
1110 if (!apic_hw_enabled(vcpu->arch.apic)) 1111 r = 1;
1111 r = 1; 1112 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1112 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1114 r = 1;
1114 r = 1;
1115 }
1116 return r; 1115 return r;
1117} 1116}
1118 1117
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..311f6dad8951 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
  10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
32#include <linux/compiler.h> 33#include <linux/compiler.h>
33#include <linux/srcu.h> 34#include <linux/srcu.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/uaccess.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/cmpxchg.h> 39#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
90#define PT_FIRST_AVAIL_BITS_SHIFT 9 92#define PT_FIRST_AVAIL_BITS_SHIFT 9
91#define PT64_SECOND_AVAIL_BITS_SHIFT 52 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
92 94
93#define VALID_PAGE(x) ((x) != INVALID_PAGE)
94
95#define PT64_LEVEL_BITS 9 95#define PT64_LEVEL_BITS 9
96 96
97#define PT64_LEVEL_SHIFT(level) \ 97#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
173 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
174 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
175 175
176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177 177
178static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)
281 281
282static void __set_spte(u64 *sptep, u64 spte) 282static void __set_spte(u64 *sptep, u64 spte)
283{ 283{
284 set_64bit(sptep, spte);
285}
286
287static u64 __xchg_spte(u64 *sptep, u64 new_spte)
288{
284#ifdef CONFIG_X86_64 289#ifdef CONFIG_X86_64
285 set_64bit((unsigned long *)sptep, spte); 290 return xchg(sptep, new_spte);
286#else 291#else
287 set_64bit((unsigned long long *)sptep, spte); 292 u64 old_spte;
293
294 do {
295 old_spte = *sptep;
296 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
297
298 return old_spte;
288#endif 299#endif
289} 300}
290 301
302static void update_spte(u64 *sptep, u64 new_spte)
303{
304 u64 old_spte;
305
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte);
309 else {
310 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask)
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
313 }
314}
315
291static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292 struct kmem_cache *base_cache, int min) 317 struct kmem_cache *base_cache, int min)
293{ 318{
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
304 return 0; 329 return 0;
305} 330}
306 331
307static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 332static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
333 struct kmem_cache *cache)
308{ 334{
309 while (mc->nobjs) 335 while (mc->nobjs)
310 kfree(mc->objects[--mc->nobjs]); 336 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
311} 337}
312 338
313static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +381,11 @@ out:
355 381
356static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 382static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
357{ 383{
358 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 384 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
359 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 385 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 386 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
361 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 387 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
388 mmu_page_header_cache);
362} 389}
363 390
364static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 391static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 406
380static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 407static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
381{ 408{
382 kfree(pc); 409 kmem_cache_free(pte_chain_cache, pc);
383} 410}
384 411
385static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 412static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 417
391static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 418static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
392{ 419{
393 kfree(rd); 420 kmem_cache_free(rmap_desc_cache, rd);
421}
422
423static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
424{
425 if (!sp->role.direct)
426 return sp->gfns[index];
427
428 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
429}
430
431static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
432{
433 if (sp->role.direct)
434 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
435 else
436 sp->gfns[index] = gfn;
394} 437}
395 438
396/* 439/*
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,
403{ 446{
404 unsigned long idx; 447 unsigned long idx;
405 448
406 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
407 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
408 return &slot->lpage_info[level - 2][idx].write_count; 451 return &slot->lpage_info[level - 2][idx].write_count;
409} 452}
410 453
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
414 int *write_count; 457 int *write_count;
415 int i; 458 int i;
416 459
417 gfn = unalias_gfn(kvm, gfn); 460 slot = gfn_to_memslot(kvm, gfn);
418
419 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 for (i = PT_DIRECTORY_LEVEL; 461 for (i = PT_DIRECTORY_LEVEL;
421 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
422 write_count = slot_largepage_idx(gfn, slot, i); 463 write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
430 int *write_count; 471 int *write_count;
431 int i; 472 int i;
432 473
433 gfn = unalias_gfn(kvm, gfn); 474 slot = gfn_to_memslot(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
435 for (i = PT_DIRECTORY_LEVEL; 475 for (i = PT_DIRECTORY_LEVEL;
436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
437 write_count = slot_largepage_idx(gfn, slot, i); 477 write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,
447 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
448 int *largepage_idx; 488 int *largepage_idx;
449 489
450 gfn = unalias_gfn(kvm, gfn); 490 slot = gfn_to_memslot(kvm, gfn);
451 slot = gfn_to_memslot_unaliased(kvm, gfn);
452 if (slot) { 491 if (slot) {
453 largepage_idx = slot_largepage_idx(gfn, slot, level); 492 largepage_idx = slot_largepage_idx(gfn, slot, level);
454 return *largepage_idx; 493 return *largepage_idx;
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
501 540
502/* 541/*
503 * Take gfn and return the reverse mapping to it. 542 * Take gfn and return the reverse mapping to it.
504 * Note: gfn must be unaliased before this function get called
505 */ 543 */
506 544
507static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 551 if (likely(level == PT_PAGE_TABLE_LEVEL))
514 return &slot->rmap[gfn - slot->base_gfn]; 552 return &slot->rmap[gfn - slot->base_gfn];
515 553
516 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
517 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
518 556
519 return &slot->lpage_info[level - 2][idx].rmap_pde; 557 return &slot->lpage_info[level - 2][idx].rmap_pde;
520} 558}
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
541 579
542 if (!is_rmap_spte(*spte)) 580 if (!is_rmap_spte(*spte))
543 return count; 581 return count;
544 gfn = unalias_gfn(vcpu->kvm, gfn);
545 sp = page_header(__pa(spte)); 582 sp = page_header(__pa(spte));
546 sp->gfns[spte - sp->spt] = gfn; 583 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
547 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 584 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
548 if (!*rmapp) { 585 if (!*rmapp) {
549 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 586 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
600 struct kvm_rmap_desc *desc; 637 struct kvm_rmap_desc *desc;
601 struct kvm_rmap_desc *prev_desc; 638 struct kvm_rmap_desc *prev_desc;
602 struct kvm_mmu_page *sp; 639 struct kvm_mmu_page *sp;
603 pfn_t pfn; 640 gfn_t gfn;
604 unsigned long *rmapp; 641 unsigned long *rmapp;
605 int i; 642 int i;
606 643
607 if (!is_rmap_spte(*spte))
608 return;
609 sp = page_header(__pa(spte)); 644 sp = page_header(__pa(spte));
610 pfn = spte_to_pfn(*spte); 645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
611 if (*spte & shadow_accessed_mask) 646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
612 kvm_set_pfn_accessed(pfn);
613 if (is_writable_pte(*spte))
614 kvm_set_pfn_dirty(pfn);
615 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
616 if (!*rmapp) { 647 if (!*rmapp) {
617 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
618 BUG(); 649 BUG();
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
644 } 675 }
645} 676}
646 677
678static void set_spte_track_bits(u64 *sptep, u64 new_spte)
679{
680 pfn_t pfn;
681 u64 old_spte = *sptep;
682
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte);
686 } else
687 old_spte = __xchg_spte(sptep, new_spte);
688
689 if (!is_rmap_spte(old_spte))
690 return;
691 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte))
695 kvm_set_pfn_dirty(pfn);
696}
697
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{
700 set_spte_track_bits(sptep, new_spte);
701 rmap_remove(kvm, sptep);
702}
703
647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
648{ 705{
649 struct kvm_rmap_desc *desc; 706 struct kvm_rmap_desc *desc;
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
676 u64 *spte; 733 u64 *spte;
677 int i, write_protected = 0; 734 int i, write_protected = 0;
678 735
679 gfn = unalias_gfn(kvm, gfn);
680 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 736 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
681 737
682 spte = rmap_next(kvm, rmapp, NULL); 738 spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
685 BUG_ON(!(*spte & PT_PRESENT_MASK)); 741 BUG_ON(!(*spte & PT_PRESENT_MASK));
686 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 742 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
687 if (is_writable_pte(*spte)) { 743 if (is_writable_pte(*spte)) {
688 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 744 update_spte(spte, *spte & ~PT_WRITABLE_MASK);
689 write_protected = 1; 745 write_protected = 1;
690 } 746 }
691 spte = rmap_next(kvm, rmapp, spte); 747 spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
709 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 765 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
710 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 766 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
711 if (is_writable_pte(*spte)) { 767 if (is_writable_pte(*spte)) {
712 rmap_remove(kvm, spte); 768 drop_spte(kvm, spte,
769 shadow_trap_nonpresent_pte);
713 --kvm->stat.lpages; 770 --kvm->stat.lpages;
714 __set_spte(spte, shadow_trap_nonpresent_pte);
715 spte = NULL; 771 spte = NULL;
716 write_protected = 1; 772 write_protected = 1;
717 } 773 }
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
731 while ((spte = rmap_next(kvm, rmapp, NULL))) { 787 while ((spte = rmap_next(kvm, rmapp, NULL))) {
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 788 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 789 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
734 rmap_remove(kvm, spte); 790 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
735 __set_spte(spte, shadow_trap_nonpresent_pte);
736 need_tlb_flush = 1; 791 need_tlb_flush = 1;
737 } 792 }
738 return need_tlb_flush; 793 return need_tlb_flush;
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
754 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 809 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
755 need_flush = 1; 810 need_flush = 1;
756 if (pte_write(*ptep)) { 811 if (pte_write(*ptep)) {
757 rmap_remove(kvm, spte); 812 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
758 __set_spte(spte, shadow_trap_nonpresent_pte);
759 spte = rmap_next(kvm, rmapp, NULL); 813 spte = rmap_next(kvm, rmapp, NULL);
760 } else { 814 } else {
761 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 815 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
763 817
764 new_spte &= ~PT_WRITABLE_MASK; 818 new_spte &= ~PT_WRITABLE_MASK;
765 new_spte &= ~SPTE_HOST_WRITEABLE; 819 new_spte &= ~SPTE_HOST_WRITEABLE;
766 if (is_writable_pte(*spte)) 820 new_spte &= ~shadow_accessed_mask;
767 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 821 set_spte_track_bits(spte, new_spte);
768 __set_spte(spte, new_spte);
769 spte = rmap_next(kvm, rmapp, spte); 822 spte = rmap_next(kvm, rmapp, spte);
770 } 823 }
771 } 824 }
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
799 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 852 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
800 853
801 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
802 int idx = gfn_offset; 855 unsigned long idx;
803 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 856 int sh;
857
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) -
860 (memslot->base_gfn >> sh);
804 ret |= handler(kvm, 861 ret |= handler(kvm,
805 &memslot->lpage_info[j][idx].rmap_pde, 862 &memslot->lpage_info[j][idx].rmap_pde,
806 data); 863 data);
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
863 920
864 sp = page_header(__pa(spte)); 921 sp = page_header(__pa(spte));
865 922
866 gfn = unalias_gfn(vcpu->kvm, gfn);
867 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 923 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
868 924
869 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 925 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)
894static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895{ 951{
896 ASSERT(is_empty_shadow_page(sp->spt)); 952 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link);
897 list_del(&sp->link); 954 list_del(&sp->link);
898 __free_page(virt_to_page(sp->spt)); 955 __free_page(virt_to_page(sp->spt));
899 __free_page(virt_to_page(sp->gfns)); 956 if (!sp->role.direct)
900 kfree(sp); 957 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp);
901 ++kvm->arch.n_free_mmu_pages; 959 ++kvm->arch.n_free_mmu_pages;
902} 960}
903 961
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
907} 965}
908 966
909static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 967static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
910 u64 *parent_pte) 968 u64 *parent_pte, int direct)
911{ 969{
912 struct kvm_mmu_page *sp; 970 struct kvm_mmu_page *sp;
913 971
914 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 972 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
915 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 973 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 974 if (!direct)
975 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
976 PAGE_SIZE);
917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 977 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 978 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
998 BUG(); 1058 BUG();
999} 1059}
1000 1060
1001
1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1061static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1003{ 1062{
1004 struct kvm_pte_chain *pte_chain; 1063 struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1008 1067
1009 if (!sp->multimapped && sp->parent_pte) { 1068 if (!sp->multimapped && sp->parent_pte) {
1010 parent_sp = page_header(__pa(sp->parent_pte)); 1069 parent_sp = page_header(__pa(sp->parent_pte));
1011 fn(parent_sp); 1070 fn(parent_sp, sp->parent_pte);
1012 mmu_parent_walk(parent_sp, fn);
1013 return; 1071 return;
1014 } 1072 }
1073
1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1074 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1016 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1075 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1017 if (!pte_chain->parent_ptes[i]) 1076 u64 *spte = pte_chain->parent_ptes[i];
1077
1078 if (!spte)
1018 break; 1079 break;
1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1080 parent_sp = page_header(__pa(spte));
1020 fn(parent_sp); 1081 fn(parent_sp, spte);
1021 mmu_parent_walk(parent_sp, fn);
1022 } 1082 }
1023} 1083}
1024 1084
1025static void kvm_mmu_update_unsync_bitmap(u64 *spte) 1085static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1086static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1026{ 1087{
1027 unsigned int index; 1088 mmu_parent_walk(sp, mark_unsync);
1028 struct kvm_mmu_page *sp = page_header(__pa(spte));
1029
1030 index = spte - sp->spt;
1031 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
1032 sp->unsync_children++;
1033 WARN_ON(!sp->unsync_children);
1034} 1089}
1035 1090
1036static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 1091static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1037{ 1092{
1038 struct kvm_pte_chain *pte_chain; 1093 unsigned int index;
1039 struct hlist_node *node;
1040 int i;
1041 1094
1042 if (!sp->parent_pte) 1095 index = spte - sp->spt;
1096 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1043 return; 1097 return;
1044 1098 if (sp->unsync_children++)
1045 if (!sp->multimapped) {
1046 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
1047 return; 1099 return;
1048 } 1100 kvm_mmu_mark_parents_unsync(sp);
1049
1050 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1051 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1052 if (!pte_chain->parent_ptes[i])
1053 break;
1054 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
1055 }
1056}
1057
1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1059{
1060 kvm_mmu_update_parents_unsync(sp);
1061 return 1;
1062}
1063
1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1065{
1066 mmu_parent_walk(sp, unsync_walk_fn);
1067 kvm_mmu_update_parents_unsync(sp);
1068} 1101}
1069 1102
1070static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1103static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1077} 1110}
1078 1111
1079static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1080 struct kvm_mmu_page *sp) 1113 struct kvm_mmu_page *sp, bool clear_unsync)
1081{ 1114{
1082 return 1; 1115 return 1;
1083} 1116}
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1123 int i, ret, nr_unsync_leaf = 0; 1156 int i, ret, nr_unsync_leaf = 0;
1124 1157
1125 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1158 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1159 struct kvm_mmu_page *child;
1126 u64 ent = sp->spt[i]; 1160 u64 ent = sp->spt[i];
1127 1161
1128 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { 1162 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1129 struct kvm_mmu_page *child; 1163 goto clear_child_bitmap;
1130 child = page_header(ent & PT64_BASE_ADDR_MASK); 1164
1131 1165 child = page_header(ent & PT64_BASE_ADDR_MASK);
1132 if (child->unsync_children) { 1166
1133 if (mmu_pages_add(pvec, child, i)) 1167 if (child->unsync_children) {
1134 return -ENOSPC; 1168 if (mmu_pages_add(pvec, child, i))
1135 1169 return -ENOSPC;
1136 ret = __mmu_unsync_walk(child, pvec); 1170
1137 if (!ret) 1171 ret = __mmu_unsync_walk(child, pvec);
1138 __clear_bit(i, sp->unsync_child_bitmap); 1172 if (!ret)
1139 else if (ret > 0) 1173 goto clear_child_bitmap;
1140 nr_unsync_leaf += ret; 1174 else if (ret > 0)
1141 else 1175 nr_unsync_leaf += ret;
1142 return ret; 1176 else
1143 } 1177 return ret;
1178 } else if (child->unsync) {
1179 nr_unsync_leaf++;
1180 if (mmu_pages_add(pvec, child, i))
1181 return -ENOSPC;
1182 } else
1183 goto clear_child_bitmap;
1144 1184
1145 if (child->unsync) { 1185 continue;
1146 nr_unsync_leaf++; 1186
1147 if (mmu_pages_add(pvec, child, i)) 1187clear_child_bitmap:
1148 return -ENOSPC; 1188 __clear_bit(i, sp->unsync_child_bitmap);
1149 } 1189 sp->unsync_children--;
1150 } 1190 WARN_ON((int)sp->unsync_children < 0);
1151 } 1191 }
1152 1192
1153 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
1154 sp->unsync_children = 0;
1155 1193
1156 return nr_unsync_leaf; 1194 return nr_unsync_leaf;
1157} 1195}
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1166 return __mmu_unsync_walk(sp, pvec); 1204 return __mmu_unsync_walk(sp, pvec);
1167} 1205}
1168 1206
1169static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1170{
1171 unsigned index;
1172 struct hlist_head *bucket;
1173 struct kvm_mmu_page *sp;
1174 struct hlist_node *node;
1175
1176 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1177 index = kvm_page_table_hashfn(gfn);
1178 bucket = &kvm->arch.mmu_page_hash[index];
1179 hlist_for_each_entry(sp, node, bucket, hash_link)
1180 if (sp->gfn == gfn && !sp->role.direct
1181 && !sp->role.invalid) {
1182 pgprintk("%s: found role %x\n",
1183 __func__, sp->role.word);
1184 return sp;
1185 }
1186 return NULL;
1187}
1188
1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1207static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1190{ 1208{
1191 WARN_ON(!sp->unsync); 1209 WARN_ON(!sp->unsync);
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1194 --kvm->stat.mmu_unsync; 1212 --kvm->stat.mmu_unsync;
1195} 1213}
1196 1214
1197static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); 1215static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1216 struct list_head *invalid_list);
1217static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1218 struct list_head *invalid_list);
1198 1219
1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1220#define for_each_gfn_sp(kvm, sp, gfn, pos) \
1221 hlist_for_each_entry(sp, pos, \
1222 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1223 if ((sp)->gfn != (gfn)) {} else
1224
1225#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1226 hlist_for_each_entry(sp, pos, \
1227 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1228 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1229 (sp)->role.invalid) {} else
1230
1231/* @sp->gfn should be write-protected at the call site */
1232static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1233 struct list_head *invalid_list, bool clear_unsync)
1200{ 1234{
1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1235 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1202 kvm_mmu_zap_page(vcpu->kvm, sp); 1236 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1203 return 1; 1237 return 1;
1204 } 1238 }
1205 1239
1206 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1240 if (clear_unsync)
1207 kvm_flush_remote_tlbs(vcpu->kvm); 1241 kvm_unlink_unsync_page(vcpu->kvm, sp);
1208 kvm_unlink_unsync_page(vcpu->kvm, sp); 1242
1209 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1210 kvm_mmu_zap_page(vcpu->kvm, sp); 1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1211 return 1; 1245 return 1;
1212 } 1246 }
1213 1247
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1215 return 0; 1249 return 0;
1216} 1250}
1217 1251
1252static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1253 struct kvm_mmu_page *sp)
1254{
1255 LIST_HEAD(invalid_list);
1256 int ret;
1257
1258 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1259 if (ret)
1260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1261
1262 return ret;
1263}
1264
1265static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1266 struct list_head *invalid_list)
1267{
1268 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1269}
1270
1271/* @gfn should be write-protected at the call site */
1272static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1273{
1274 struct kvm_mmu_page *s;
1275 struct hlist_node *node;
1276 LIST_HEAD(invalid_list);
1277 bool flush = false;
1278
1279 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1280 if (!s->unsync)
1281 continue;
1282
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue;
1288 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true;
1291 }
1292
1293 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1294 if (flush)
1295 kvm_mmu_flush_tlb(vcpu);
1296}
1297
1218struct mmu_page_path { 1298struct mmu_page_path {
1219 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1299 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1220 unsigned int idx[PT64_ROOT_LEVEL-1]; 1300 unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1281 struct kvm_mmu_page *sp; 1361 struct kvm_mmu_page *sp;
1282 struct mmu_page_path parents; 1362 struct mmu_page_path parents;
1283 struct kvm_mmu_pages pages; 1363 struct kvm_mmu_pages pages;
1364 LIST_HEAD(invalid_list);
1284 1365
1285 kvm_mmu_pages_init(parent, &parents, &pages); 1366 kvm_mmu_pages_init(parent, &parents, &pages);
1286 while (mmu_unsync_walk(parent, &pages)) { 1367 while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1293 kvm_flush_remote_tlbs(vcpu->kvm); 1374 kvm_flush_remote_tlbs(vcpu->kvm);
1294 1375
1295 for_each_sp(pages, sp, parents, i) { 1376 for_each_sp(pages, sp, parents, i) {
1296 kvm_sync_page(vcpu, sp); 1377 kvm_sync_page(vcpu, sp, &invalid_list);
1297 mmu_pages_clear_parents(&parents); 1378 mmu_pages_clear_parents(&parents);
1298 } 1379 }
1380 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1299 cond_resched_lock(&vcpu->kvm->mmu_lock); 1381 cond_resched_lock(&vcpu->kvm->mmu_lock);
1300 kvm_mmu_pages_init(parent, &parents, &pages); 1382 kvm_mmu_pages_init(parent, &parents, &pages);
1301 } 1383 }
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1310 u64 *parent_pte) 1392 u64 *parent_pte)
1311{ 1393{
1312 union kvm_mmu_page_role role; 1394 union kvm_mmu_page_role role;
1313 unsigned index;
1314 unsigned quadrant; 1395 unsigned quadrant;
1315 struct hlist_head *bucket;
1316 struct kvm_mmu_page *sp; 1396 struct kvm_mmu_page *sp;
1317 struct hlist_node *node, *tmp; 1397 struct hlist_node *node;
1398 bool need_sync = false;
1318 1399
1319 role = vcpu->arch.mmu.base_role; 1400 role = vcpu->arch.mmu.base_role;
1320 role.level = level; 1401 role.level = level;
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1322 if (role.direct) 1403 if (role.direct)
1323 role.cr4_pae = 0; 1404 role.cr4_pae = 0;
1324 role.access = access; 1405 role.access = access;
1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1327 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1328 role.quadrant = quadrant; 1409 role.quadrant = quadrant;
1329 } 1410 }
1330 index = kvm_page_table_hashfn(gfn); 1411 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1331 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1412 if (!need_sync && sp->unsync)
1332 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1413 need_sync = true;
1333 if (sp->gfn == gfn) {
1334 if (sp->unsync)
1335 if (kvm_sync_page(vcpu, sp))
1336 continue;
1337 1414
1338 if (sp->role.word != role.word) 1415 if (sp->role.word != role.word)
1339 continue; 1416 continue;
1340 1417
1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1418 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1342 if (sp->unsync_children) { 1419 break;
1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1420
1344 kvm_mmu_mark_parents_unsync(sp); 1421 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1345 } 1422 if (sp->unsync_children) {
1346 trace_kvm_mmu_get_page(sp, false); 1423 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1347 return sp; 1424 kvm_mmu_mark_parents_unsync(sp);
1348 } 1425 } else if (sp->unsync)
1426 kvm_mmu_mark_parents_unsync(sp);
1427
1428 trace_kvm_mmu_get_page(sp, false);
1429 return sp;
1430 }
1349 ++vcpu->kvm->stat.mmu_cache_miss; 1431 ++vcpu->kvm->stat.mmu_cache_miss;
1350 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1432 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1351 if (!sp) 1433 if (!sp)
1352 return sp; 1434 return sp;
1353 sp->gfn = gfn; 1435 sp->gfn = gfn;
1354 sp->role = role; 1436 sp->role = role;
1355 hlist_add_head(&sp->hash_link, bucket); 1437 hlist_add_head(&sp->hash_link,
1438 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1356 if (!direct) { 1439 if (!direct) {
1357 if (rmap_write_protect(vcpu->kvm, gfn)) 1440 if (rmap_write_protect(vcpu->kvm, gfn))
1358 kvm_flush_remote_tlbs(vcpu->kvm); 1441 kvm_flush_remote_tlbs(vcpu->kvm);
1442 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1443 kvm_sync_pages(vcpu, gfn);
1444
1359 account_shadowed(vcpu->kvm, gfn); 1445 account_shadowed(vcpu->kvm, gfn);
1360 } 1446 }
1361 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1447 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1402 --iterator->level; 1488 --iterator->level;
1403} 1489}
1404 1490
1491static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1492{
1493 u64 spte;
1494
1495 spte = __pa(sp->spt)
1496 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1497 | PT_WRITABLE_MASK | PT_USER_MASK;
1498 __set_spte(sptep, spte);
1499}
1500
1501static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1502{
1503 if (is_large_pte(*sptep)) {
1504 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1505 kvm_flush_remote_tlbs(vcpu->kvm);
1506 }
1507}
1508
1509static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1510 unsigned direct_access)
1511{
1512 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1513 struct kvm_mmu_page *child;
1514
1515 /*
1516 * For the direct sp, if the guest pte's dirty bit
1517		 * changed from clean to dirty, it will corrupt the
1518 * sp's access: allow writable in the read-only sp,
1519 * so we should update the spte at this point to get
1520 * a new sp with the correct access.
1521 */
1522 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1523 if (child->role.access == direct_access)
1524 return;
1525
1526 mmu_page_remove_parent_pte(child, sptep);
1527 __set_spte(sptep, shadow_trap_nonpresent_pte);
1528 kvm_flush_remote_tlbs(vcpu->kvm);
1529 }
1530}
1531
1405static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1532static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1406 struct kvm_mmu_page *sp) 1533 struct kvm_mmu_page *sp)
1407{ 1534{
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1422 } else { 1549 } else {
1423 if (is_large_pte(ent)) 1550 if (is_large_pte(ent))
1424 --kvm->stat.lpages; 1551 --kvm->stat.lpages;
1425 rmap_remove(kvm, &pt[i]); 1552 drop_spte(kvm, &pt[i],
1553 shadow_trap_nonpresent_pte);
1426 } 1554 }
1427 } 1555 }
1428 pt[i] = shadow_trap_nonpresent_pte; 1556 pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1464} 1592}
1465 1593
1466static int mmu_zap_unsync_children(struct kvm *kvm, 1594static int mmu_zap_unsync_children(struct kvm *kvm,
1467 struct kvm_mmu_page *parent) 1595 struct kvm_mmu_page *parent,
1596 struct list_head *invalid_list)
1468{ 1597{
1469 int i, zapped = 0; 1598 int i, zapped = 0;
1470 struct mmu_page_path parents; 1599 struct mmu_page_path parents;
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1478 struct kvm_mmu_page *sp; 1607 struct kvm_mmu_page *sp;
1479 1608
1480 for_each_sp(pages, sp, parents, i) { 1609 for_each_sp(pages, sp, parents, i) {
1481 kvm_mmu_zap_page(kvm, sp); 1610 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1482 mmu_pages_clear_parents(&parents); 1611 mmu_pages_clear_parents(&parents);
1483 zapped++; 1612 zapped++;
1484 } 1613 }
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1488 return zapped; 1617 return zapped;
1489} 1618}
1490 1619
1491static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1620static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1621 struct list_head *invalid_list)
1492{ 1622{
1493 int ret; 1623 int ret;
1494 1624
1495 trace_kvm_mmu_zap_page(sp); 1625 trace_kvm_mmu_prepare_zap_page(sp);
1496 ++kvm->stat.mmu_shadow_zapped; 1626 ++kvm->stat.mmu_shadow_zapped;
1497 ret = mmu_zap_unsync_children(kvm, sp); 1627 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1498 kvm_mmu_page_unlink_children(kvm, sp); 1628 kvm_mmu_page_unlink_children(kvm, sp);
1499 kvm_mmu_unlink_parents(kvm, sp); 1629 kvm_mmu_unlink_parents(kvm, sp);
1500 kvm_flush_remote_tlbs(kvm);
1501 if (!sp->role.invalid && !sp->role.direct) 1630 if (!sp->role.invalid && !sp->role.direct)
1502 unaccount_shadowed(kvm, sp->gfn); 1631 unaccount_shadowed(kvm, sp->gfn);
1503 if (sp->unsync) 1632 if (sp->unsync)
1504 kvm_unlink_unsync_page(kvm, sp); 1633 kvm_unlink_unsync_page(kvm, sp);
1505 if (!sp->root_count) { 1634 if (!sp->root_count) {
1506 hlist_del(&sp->hash_link); 1635 /* Count self */
1507 kvm_mmu_free_page(kvm, sp); 1636 ret++;
1637 list_move(&sp->link, invalid_list);
1508 } else { 1638 } else {
1509 sp->role.invalid = 1;
1510 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1639 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1511 kvm_reload_remote_mmus(kvm); 1640 kvm_reload_remote_mmus(kvm);
1512 } 1641 }
1642
1643 sp->role.invalid = 1;
1513 kvm_mmu_reset_last_pte_updated(kvm); 1644 kvm_mmu_reset_last_pte_updated(kvm);
1514 return ret; 1645 return ret;
1515} 1646}
1516 1647
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649 struct list_head *invalid_list)
1650{
1651 struct kvm_mmu_page *sp;
1652
1653 if (list_empty(invalid_list))
1654 return;
1655
1656 kvm_flush_remote_tlbs(kvm);
1657
1658 do {
1659 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1660 WARN_ON(!sp->role.invalid || sp->root_count);
1661 kvm_mmu_free_page(kvm, sp);
1662 } while (!list_empty(invalid_list));
1663
1664}
1665
1517/* 1666/*
1518 * Changing the number of mmu pages allocated to the vm 1667 * Changing the number of mmu pages allocated to the vm
1519 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1521void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1522{ 1671{
1523 int used_pages; 1672 int used_pages;
1673 LIST_HEAD(invalid_list);
1524 1674
1525 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1526 used_pages = max(0, used_pages); 1676 used_pages = max(0, used_pages);
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1538 1688
1539 page = container_of(kvm->arch.active_mmu_pages.prev, 1689 page = container_of(kvm->arch.active_mmu_pages.prev,
1540 struct kvm_mmu_page, link); 1690 struct kvm_mmu_page, link);
1541 used_pages -= kvm_mmu_zap_page(kvm, page); 1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1542 used_pages--; 1692 &invalid_list);
1543 } 1693 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1544 kvm_nr_mmu_pages = used_pages; 1695 kvm_nr_mmu_pages = used_pages;
1545 kvm->arch.n_free_mmu_pages = 0; 1696 kvm->arch.n_free_mmu_pages = 0;
1546 } 1697 }
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1553 1704
1554static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1555{ 1706{
1556 unsigned index;
1557 struct hlist_head *bucket;
1558 struct kvm_mmu_page *sp; 1707 struct kvm_mmu_page *sp;
1559 struct hlist_node *node, *n; 1708 struct hlist_node *node;
1709 LIST_HEAD(invalid_list);
1560 int r; 1710 int r;
1561 1711
1562 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1563 r = 0; 1713 r = 0;
1564 index = kvm_page_table_hashfn(gfn); 1714
1565 bucket = &kvm->arch.mmu_page_hash[index]; 1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1566restart: 1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1717 sp->role.word);
1568 if (sp->gfn == gfn && !sp->role.direct) { 1718 r = 1;
1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1570 sp->role.word); 1720 }
1571 r = 1; 1721 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1572 if (kvm_mmu_zap_page(kvm, sp))
1573 goto restart;
1574 }
1575 return r; 1722 return r;
1576} 1723}
1577 1724
1578static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1725static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1579{ 1726{
1580 unsigned index;
1581 struct hlist_head *bucket;
1582 struct kvm_mmu_page *sp; 1727 struct kvm_mmu_page *sp;
1583 struct hlist_node *node, *nn; 1728 struct hlist_node *node;
1729 LIST_HEAD(invalid_list);
1584 1730
1585 index = kvm_page_table_hashfn(gfn); 1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1586 bucket = &kvm->arch.mmu_page_hash[index]; 1732 pgprintk("%s: zap %lx %x\n",
1587restart: 1733 __func__, gfn, sp->role.word);
1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1589 if (sp->gfn == gfn && !sp->role.direct
1590 && !sp->role.invalid) {
1591 pgprintk("%s: zap %lx %x\n",
1592 __func__, gfn, sp->role.word);
1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1595 }
1596 } 1735 }
1736 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1597} 1737}
1598 1738
1599static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1739static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1723} 1863}
1724EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1864EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1725 1865
1726static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1866static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1727{ 1867{
1728 unsigned index;
1729 struct hlist_head *bucket;
1730 struct kvm_mmu_page *s;
1731 struct hlist_node *node, *n;
1732
1733 index = kvm_page_table_hashfn(sp->gfn);
1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1735 /* don't unsync if pagetable is shadowed with multiple roles */
1736 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1737 if (s->gfn != sp->gfn || s->role.direct)
1738 continue;
1739 if (s->role.word != sp->role.word)
1740 return 1;
1741 }
1742 trace_kvm_mmu_unsync_page(sp); 1868 trace_kvm_mmu_unsync_page(sp);
1743 ++vcpu->kvm->stat.mmu_unsync; 1869 ++vcpu->kvm->stat.mmu_unsync;
1744 sp->unsync = 1; 1870 sp->unsync = 1;
1745 1871
1746 kvm_mmu_mark_parents_unsync(sp); 1872 kvm_mmu_mark_parents_unsync(sp);
1747
1748 mmu_convert_notrap(sp); 1873 mmu_convert_notrap(sp);
1749 return 0; 1874}
1875
1876static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1877{
1878 struct kvm_mmu_page *s;
1879 struct hlist_node *node;
1880
1881 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1882 if (s->unsync)
1883 continue;
1884 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1885 __kvm_unsync_page(vcpu, s);
1886 }
1750} 1887}
1751 1888
1752static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1889static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1753 bool can_unsync) 1890 bool can_unsync)
1754{ 1891{
1755 struct kvm_mmu_page *shadow; 1892 struct kvm_mmu_page *s;
1893 struct hlist_node *node;
1894 bool need_unsync = false;
1756 1895
1757 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1896 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1758 if (shadow) { 1897 if (!can_unsync)
1759 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1760 return 1; 1898 return 1;
1761 if (shadow->unsync) 1899
1762 return 0; 1900 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1763 if (can_unsync && oos_shadow) 1901 return 1;
1764 return kvm_unsync_page(vcpu, shadow); 1902
1765 return 1; 1903 if (!need_unsync && !s->unsync) {
1904 if (!oos_shadow)
1905 return 1;
1906 need_unsync = true;
1907 }
1766 } 1908 }
1909 if (need_unsync)
1910 kvm_unsync_pages(vcpu, gfn);
1767 return 0; 1911 return 0;
1768} 1912}
1769 1913
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1804 spte |= (u64)pfn << PAGE_SHIFT; 1948 spte |= (u64)pfn << PAGE_SHIFT;
1805 1949
1806 if ((pte_access & ACC_WRITE_MASK) 1950 if ((pte_access & ACC_WRITE_MASK)
1807 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1952 && !user_fault)) {
1808 1953
1809 if (level > PT_PAGE_TABLE_LEVEL && 1954 if (level > PT_PAGE_TABLE_LEVEL &&
1810 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1955 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1811 ret = 1; 1956 ret = 1;
1812 spte = shadow_trap_nonpresent_pte; 1957 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1813 goto set_pte; 1958 goto done;
1814 } 1959 }
1815 1960
1816 spte |= PT_WRITABLE_MASK; 1961 spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1841 mark_page_dirty(vcpu->kvm, gfn); 1986 mark_page_dirty(vcpu->kvm, gfn);
1842 1987
1843set_pte: 1988set_pte:
1844 __set_spte(sptep, spte); 1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte);
1992done:
1845 return ret; 1993 return ret;
1846} 1994}
1847 1995
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1853 bool reset_host_protection) 2001 bool reset_host_protection)
1854{ 2002{
1855 int was_rmapped = 0; 2003 int was_rmapped = 0;
1856 int was_writable = is_writable_pte(*sptep);
1857 int rmap_count; 2004 int rmap_count;
1858 2005
1859 pgprintk("%s: spte %llx access %x write_fault %d" 2006 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1878 } else if (pfn != spte_to_pfn(*sptep)) { 2025 } else if (pfn != spte_to_pfn(*sptep)) {
1879 pgprintk("hfn old %lx new %lx\n", 2026 pgprintk("hfn old %lx new %lx\n",
1880 spte_to_pfn(*sptep), pfn); 2027 spte_to_pfn(*sptep), pfn);
1881 rmap_remove(vcpu->kvm, sptep); 2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1882 __set_spte(sptep, shadow_trap_nonpresent_pte);
1883 kvm_flush_remote_tlbs(vcpu->kvm); 2029 kvm_flush_remote_tlbs(vcpu->kvm);
1884 } else 2030 } else
1885 was_rmapped = 1; 2031 was_rmapped = 1;
@@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1890 reset_host_protection)) { 2036 reset_host_protection)) {
1891 if (write_fault) 2037 if (write_fault)
1892 *ptwrite = 1; 2038 *ptwrite = 1;
1893 kvm_x86_ops->tlb_flush(vcpu); 2039 kvm_mmu_flush_tlb(vcpu);
1894 } 2040 }
1895 2041
1896 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1904 page_header_update_slot(vcpu->kvm, sptep, gfn); 2050 page_header_update_slot(vcpu->kvm, sptep, gfn);
1905 if (!was_rmapped) { 2051 if (!was_rmapped) {
1906 rmap_count = rmap_add(vcpu, sptep, gfn); 2052 rmap_count = rmap_add(vcpu, sptep, gfn);
1907 kvm_release_pfn_clean(pfn);
1908 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2053 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1909 rmap_recycle(vcpu, sptep, gfn); 2054 rmap_recycle(vcpu, sptep, gfn);
1910 } else {
1911 if (was_writable)
1912 kvm_release_pfn_dirty(pfn);
1913 else
1914 kvm_release_pfn_clean(pfn);
1915 } 2055 }
2056 kvm_release_pfn_clean(pfn);
1916 if (speculative) { 2057 if (speculative) {
1917 vcpu->arch.last_pte_updated = sptep; 2058 vcpu->arch.last_pte_updated = sptep;
1918 vcpu->arch.last_pte_gfn = gfn; 2059 vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1941 } 2082 }
1942 2083
1943 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2084 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1944 pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 2085 u64 base_addr = iterator.addr;
2086
2087 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2088 pseudo_gfn = base_addr >> PAGE_SHIFT;
1945 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2089 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1946 iterator.level - 1, 2090 iterator.level - 1,
1947 1, ACC_ALL, iterator.sptep); 2091 1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1960 return pt_write; 2104 return pt_write;
1961} 2105}
1962 2106
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2108{
2109 char buf[1];
2110 void __user *hva;
2111 int r;
2112
2113 /* Touch the page so that a SIGBUS is delivered for the poisoned page */
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116}
2117
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{
2120 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn);
2123 return 0;
2124 } else if (is_fault_pfn(pfn))
2125 return -EFAULT;
2126
2127 return 1;
2128}
2129
1963static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1964{ 2131{
1965 int r; 2132 int r;
@@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1983 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2150 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1984 2151
1985 /* mmio */ 2152 /* mmio */
1986 if (is_error_pfn(pfn)) { 2153 if (is_error_pfn(pfn))
1987 kvm_release_pfn_clean(pfn); 2154 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
1988 return 1;
1989 }
1990 2155
1991 spin_lock(&vcpu->kvm->mmu_lock); 2156 spin_lock(&vcpu->kvm->mmu_lock);
1992 if (mmu_notifier_retry(vcpu, mmu_seq)) 2157 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2009{ 2174{
2010 int i; 2175 int i;
2011 struct kvm_mmu_page *sp; 2176 struct kvm_mmu_page *sp;
2177 LIST_HEAD(invalid_list);
2012 2178
2013 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2014 return; 2180 return;
@@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2018 2184
2019 sp = page_header(root); 2185 sp = page_header(root);
2020 --sp->root_count; 2186 --sp->root_count;
2021 if (!sp->root_count && sp->role.invalid) 2187 if (!sp->root_count && sp->role.invalid) {
2022 kvm_mmu_zap_page(vcpu->kvm, sp); 2188 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2189 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2190 }
2023 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2191 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2024 spin_unlock(&vcpu->kvm->mmu_lock); 2192 spin_unlock(&vcpu->kvm->mmu_lock);
2025 return; 2193 return;
@@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2032 sp = page_header(root); 2200 sp = page_header(root);
2033 --sp->root_count; 2201 --sp->root_count;
2034 if (!sp->root_count && sp->role.invalid) 2202 if (!sp->root_count && sp->role.invalid)
2035 kvm_mmu_zap_page(vcpu->kvm, sp); 2203 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2204 &invalid_list);
2036 } 2205 }
2037 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2206 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2038 } 2207 }
2208 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2039 spin_unlock(&vcpu->kvm->mmu_lock); 2209 spin_unlock(&vcpu->kvm->mmu_lock);
2040 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2210 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2041} 2211}
@@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2045 int ret = 0; 2215 int ret = 0;
2046 2216
2047 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2217 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2048 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2218 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2049 ret = 1; 2219 ret = 1;
2050 } 2220 }
2051 2221
@@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2073 root_gfn = 0; 2243 root_gfn = 0;
2074 } 2244 }
2075 spin_lock(&vcpu->kvm->mmu_lock); 2245 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu);
2076 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2077 PT64_ROOT_LEVEL, direct, 2248 PT64_ROOT_LEVEL, direct,
2078 ACC_ALL, NULL); 2249 ACC_ALL, NULL);
@@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2103 root_gfn = i << 30; 2274 root_gfn = i << 30;
2104 } 2275 }
2105 spin_lock(&vcpu->kvm->mmu_lock); 2276 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu);
2106 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2107 PT32_ROOT_LEVEL, direct, 2279 PT32_ROOT_LEVEL, direct,
2108 ACC_ALL, NULL); 2280 ACC_ALL, NULL);
@@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2198 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2370 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2199 smp_rmb(); 2371 smp_rmb();
2200 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2372 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2201 if (is_error_pfn(pfn)) { 2373 if (is_error_pfn(pfn))
2202 kvm_release_pfn_clean(pfn); 2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2203 return 1;
2204 }
2205 spin_lock(&vcpu->kvm->mmu_lock); 2375 spin_lock(&vcpu->kvm->mmu_lock);
2206 if (mmu_notifier_retry(vcpu, mmu_seq)) 2376 if (mmu_notifier_retry(vcpu, mmu_seq))
2207 goto out_unlock; 2377 goto out_unlock;
@@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2243void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2413void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2244{ 2414{
2245 ++vcpu->stat.tlb_flush; 2415 ++vcpu->stat.tlb_flush;
2246 kvm_x86_ops->tlb_flush(vcpu); 2416 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2247} 2417}
2248 2418
2249static void paging_new_cr3(struct kvm_vcpu *vcpu) 2419static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2457static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2627static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2458{ 2628{
2459 ASSERT(vcpu); 2629 ASSERT(vcpu);
2460 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 2630 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2631 /* mmu.free() should set root_hpa = INVALID_PAGE */
2461 vcpu->arch.mmu.free(vcpu); 2632 vcpu->arch.mmu.free(vcpu);
2462 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2463 }
2464} 2633}
2465 2634
2466int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2635int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2477 r = mmu_topup_memory_caches(vcpu); 2646 r = mmu_topup_memory_caches(vcpu);
2478 if (r) 2647 if (r)
2479 goto out; 2648 goto out;
2480 spin_lock(&vcpu->kvm->mmu_lock);
2481 kvm_mmu_free_some_pages(vcpu);
2482 spin_unlock(&vcpu->kvm->mmu_lock);
2483 r = mmu_alloc_roots(vcpu); 2649 r = mmu_alloc_roots(vcpu);
2484 spin_lock(&vcpu->kvm->mmu_lock); 2650 spin_lock(&vcpu->kvm->mmu_lock);
2485 mmu_sync_roots(vcpu); 2651 mmu_sync_roots(vcpu);
@@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2508 pte = *spte; 2674 pte = *spte;
2509 if (is_shadow_present_pte(pte)) { 2675 if (is_shadow_present_pte(pte)) {
2510 if (is_last_spte(pte, sp->role.level)) 2676 if (is_last_spte(pte, sp->role.level))
2511 rmap_remove(vcpu->kvm, spte); 2677 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2512 else { 2678 else {
2513 child = page_header(pte & PT64_BASE_ADDR_MASK); 2679 child = page_header(pte & PT64_BASE_ADDR_MASK);
2514 mmu_page_remove_parent_pte(child, spte); 2680 mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2529 return; 2695 return;
2530 } 2696 }
2531 2697
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2532 ++vcpu->kvm->stat.mmu_pte_updated; 2701 ++vcpu->kvm->stat.mmu_pte_updated;
2533 if (!sp->role.cr4_pae) 2702 if (!sp->role.cr4_pae)
2534 paging32_update_pte(vcpu, sp, spte, new); 2703 paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)
2549 return (old & ~new & PT64_PERM_MASK) != 0; 2718 return (old & ~new & PT64_PERM_MASK) != 0;
2550} 2719}
2551 2720
2552static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) 2721static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2722 bool remote_flush, bool local_flush)
2553{ 2723{
2554 if (need_remote_flush(old, new)) 2724 if (zap_page)
2725 return;
2726
2727 if (remote_flush)
2555 kvm_flush_remote_tlbs(vcpu->kvm); 2728 kvm_flush_remote_tlbs(vcpu->kvm);
2556 else 2729 else if (local_flush)
2557 kvm_mmu_flush_tlb(vcpu); 2730 kvm_mmu_flush_tlb(vcpu);
2558} 2731}
2559 2732
@@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2603 bool guest_initiated) 2776 bool guest_initiated)
2604{ 2777{
2605 gfn_t gfn = gpa >> PAGE_SHIFT; 2778 gfn_t gfn = gpa >> PAGE_SHIFT;
2779 union kvm_mmu_page_role mask = { .word = 0 };
2606 struct kvm_mmu_page *sp; 2780 struct kvm_mmu_page *sp;
2607 struct hlist_node *node, *n; 2781 struct hlist_node *node;
2608 struct hlist_head *bucket; 2782 LIST_HEAD(invalid_list);
2609 unsigned index;
2610 u64 entry, gentry; 2783 u64 entry, gentry;
2611 u64 *spte; 2784 u64 *spte;
2612 unsigned offset = offset_in_page(gpa); 2785 unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2619 int npte; 2792 int npte;
2620 int r; 2793 int r;
2621 int invlpg_counter; 2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page;
2796
2797 zap_page = remote_flush = local_flush = false;
2622 2798
2623 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2624 2800
@@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2674 vcpu->arch.last_pte_updated = NULL; 2850 vcpu->arch.last_pte_updated = NULL;
2675 } 2851 }
2676 } 2852 }
2677 index = kvm_page_table_hashfn(gfn);
2678 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2679 2853
2680restart: 2854 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
2681 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2855 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2682 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2683 continue;
2684 pte_size = sp->role.cr4_pae ? 8 : 4; 2856 pte_size = sp->role.cr4_pae ? 8 : 4;
2685 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2857 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2686 misaligned |= bytes < 4; 2858 misaligned |= bytes < 4;
@@ -2697,8 +2869,8 @@ restart:
2697 */ 2869 */
2698 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2870 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2699 gpa, bytes, sp->role.word); 2871 gpa, bytes, sp->role.word);
2700 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2872 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2701 goto restart; 2873 &invalid_list);
2702 ++vcpu->kvm->stat.mmu_flooded; 2874 ++vcpu->kvm->stat.mmu_flooded;
2703 continue; 2875 continue;
2704 } 2876 }
@@ -2722,16 +2894,22 @@ restart:
2722 if (quadrant != sp->role.quadrant) 2894 if (quadrant != sp->role.quadrant)
2723 continue; 2895 continue;
2724 } 2896 }
2897 local_flush = true;
2725 spte = &sp->spt[page_offset / sizeof(*spte)]; 2898 spte = &sp->spt[page_offset / sizeof(*spte)];
2726 while (npte--) { 2899 while (npte--) {
2727 entry = *spte; 2900 entry = *spte;
2728 mmu_pte_write_zap_pte(vcpu, sp, spte); 2901 mmu_pte_write_zap_pte(vcpu, sp, spte);
2729 if (gentry) 2902 if (gentry &&
2903 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
2904 & mask.word))
2730 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2905 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2731 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2906 if (!remote_flush && need_remote_flush(entry, *spte))
2907 remote_flush = true;
2732 ++spte; 2908 ++spte;
2733 } 2909 }
2734 } 2910 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2735 kvm_mmu_audit(vcpu, "post pte write"); 2913 kvm_mmu_audit(vcpu, "post pte write");
2736 spin_unlock(&vcpu->kvm->mmu_lock); 2914 spin_unlock(&vcpu->kvm->mmu_lock);
2737 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2759 2937
2760void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2761{ 2939{
2762 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && 2940 int free_pages;
2941 LIST_HEAD(invalid_list);
2942
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2944 while (free_pages < KVM_REFILL_PAGES &&
2763 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2764 struct kvm_mmu_page *sp; 2946 struct kvm_mmu_page *sp;
2765 2947
2766 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2767 struct kvm_mmu_page, link); 2949 struct kvm_mmu_page, link);
2768 kvm_mmu_zap_page(vcpu->kvm, sp); 2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2951 &invalid_list);
2769 ++vcpu->kvm->stat.mmu_recycled; 2952 ++vcpu->kvm->stat.mmu_recycled;
2770 } 2953 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2771} 2955}
2772 2956
2773int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2795 return 1; 2979 return 1;
2796 case EMULATE_DO_MMIO: 2980 case EMULATE_DO_MMIO:
2797 ++vcpu->stat.mmio_exits; 2981 ++vcpu->stat.mmio_exits;
2798 return 0; 2982 /* fall through */
2799 case EMULATE_FAIL: 2983 case EMULATE_FAIL:
2800 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2801 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2802 vcpu->run->internal.ndata = 0;
2803 return 0; 2984 return 0;
2804 default: 2985 default:
2805 BUG(); 2986 BUG();
@@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2896 pt = sp->spt; 3077 pt = sp->spt;
2897 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2898 /* avoid RMW */ 3079 /* avoid RMW */
2899 if (pt[i] & PT_WRITABLE_MASK) 3080 if (is_writable_pte(pt[i]))
2900 pt[i] &= ~PT_WRITABLE_MASK; 3081 pt[i] &= ~PT_WRITABLE_MASK;
2901 } 3082 }
2902 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2905void kvm_mmu_zap_all(struct kvm *kvm) 3086void kvm_mmu_zap_all(struct kvm *kvm)
2906{ 3087{
2907 struct kvm_mmu_page *sp, *node; 3088 struct kvm_mmu_page *sp, *node;
3089 LIST_HEAD(invalid_list);
2908 3090
2909 spin_lock(&kvm->mmu_lock); 3091 spin_lock(&kvm->mmu_lock);
2910restart: 3092restart:
2911 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3093 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2912 if (kvm_mmu_zap_page(kvm, sp)) 3094 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
2913 goto restart; 3095 goto restart;
2914 3096
3097 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2915 spin_unlock(&kvm->mmu_lock); 3098 spin_unlock(&kvm->mmu_lock);
2916
2917 kvm_flush_remote_tlbs(kvm);
2918} 3099}
2919 3100
2920static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) 3101static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3102 struct list_head *invalid_list)
2921{ 3103{
2922 struct kvm_mmu_page *page; 3104 struct kvm_mmu_page *page;
2923 3105
2924 page = container_of(kvm->arch.active_mmu_pages.prev, 3106 page = container_of(kvm->arch.active_mmu_pages.prev,
2925 struct kvm_mmu_page, link); 3107 struct kvm_mmu_page, link);
2926 return kvm_mmu_zap_page(kvm, page) + 1; 3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
2927} 3109}
2928 3110
2929static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2936 3118
2937 list_for_each_entry(kvm, &vm_list, vm_list) { 3119 list_for_each_entry(kvm, &vm_list, vm_list) {
2938 int npages, idx, freed_pages; 3120 int npages, idx, freed_pages;
3121 LIST_HEAD(invalid_list);
2939 3122
2940 idx = srcu_read_lock(&kvm->srcu); 3123 idx = srcu_read_lock(&kvm->srcu);
2941 spin_lock(&kvm->mmu_lock); 3124 spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2943 kvm->arch.n_free_mmu_pages; 3126 kvm->arch.n_free_mmu_pages;
2944 cache_count += npages; 3127 cache_count += npages;
2945 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2946 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); 3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list);
2947 cache_count -= freed_pages; 3131 cache_count -= freed_pages;
2948 kvm_freed = kvm; 3132 kvm_freed = kvm;
2949 } 3133 }
2950 nr_to_scan--; 3134 nr_to_scan--;
2951 3135
3136 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2952 spin_unlock(&kvm->mmu_lock); 3137 spin_unlock(&kvm->mmu_lock);
2953 srcu_read_unlock(&kvm->srcu, idx); 3138 srcu_read_unlock(&kvm->srcu, idx);
2954 } 3139 }
@@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3074 3259
3075static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3076{ 3261{
3077 kvm_set_cr3(vcpu, vcpu->arch.cr3); 3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3078 return 1; 3263 return 1;
3079} 3264}
3080 3265
@@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3331 struct kvm_mmu_page *rev_sp; 3516 struct kvm_mmu_page *rev_sp;
3332 gfn_t gfn; 3517 gfn_t gfn;
3333 3518
3334 if (*sptep & PT_WRITABLE_MASK) { 3519 if (is_writable_pte(*sptep)) {
3335 rev_sp = page_header(__pa(sptep)); 3520 rev_sp = page_header(__pa(sptep));
3336 gfn = rev_sp->gfns[sptep - rev_sp->spt]; 3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3337 3522
3338 if (!gfn_to_memslot(kvm, gfn)) { 3523 if (!gfn_to_memslot(kvm, gfn)) {
3339 if (!printk_ratelimit()) 3524 if (!printk_ratelimit())
@@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3347 return; 3532 return;
3348 } 3533 }
3349 3534
3350 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3351 rev_sp->role.level);
3352 if (!*rmapp) { 3536 if (!*rmapp) {
3353 if (!printk_ratelimit()) 3537 if (!printk_ratelimit())
3354 return; 3538 return;
@@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3381 3565
3382 if (!(ent & PT_PRESENT_MASK)) 3566 if (!(ent & PT_PRESENT_MASK))
3383 continue; 3567 continue;
3384 if (!(ent & PT_WRITABLE_MASK)) 3568 if (!is_writable_pte(ent))
3385 continue; 3569 continue;
3386 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3387 } 3571 }
@@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3409 if (sp->unsync) 3593 if (sp->unsync)
3410 continue; 3594 continue;
3411 3595
3412 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3413 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3414 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3415 3598
3416 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3417 while (spte) { 3600 while (spte) {
3418 if (*spte & PT_WRITABLE_MASK) 3601 if (is_writable_pte(*spte))
3419 printk(KERN_ERR "%s: (%s) shadow page has " 3602 printk(KERN_ERR "%s: (%s) shadow page has "
3420 "writable mappings: gfn %lx role %x\n", 3603 "writable mappings: gfn %lx role %x\n",
3421 __func__, audit_msg, sp->gfn, 3604 __func__, audit_msg, sp->gfn,
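
The change that runs through all of the mmu.c hunks above is the split of the old kvm_mmu_zap_page() into a prepare/commit pair: callers queue doomed shadow pages on a local invalid_list while holding mmu_lock, and kvm_mmu_commit_zap_page() then issues one remote TLB flush before freeing the whole batch. The stand-alone sketch below only illustrates that batching idea; the structure, list handling and printouts are simplified placeholders, not the kernel's actual types or locking.

/* prepare/commit batching sketch (illustrative only, not the kernel code) */
#include <stdio.h>
#include <stdlib.h>

struct shadow_page {
	int id;
	struct shadow_page *next;	/* link on the local invalid list */
};

/* "prepare": detach the page and queue it; report how many pages were queued */
static int prepare_zap(struct shadow_page *sp, struct shadow_page **invalid_list)
{
	sp->next = *invalid_list;
	*invalid_list = sp;
	return 1;
}

/* "commit": one flush for the whole batch, then free everything */
static void commit_zap(struct shadow_page **invalid_list)
{
	if (!*invalid_list)
		return;
	printf("flush remote TLBs once for the whole batch\n");
	while (*invalid_list) {
		struct shadow_page *sp = *invalid_list;

		*invalid_list = sp->next;
		printf("freeing shadow page %d\n", sp->id);
		free(sp);
	}
}

int main(void)
{
	struct shadow_page *invalid_list = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		struct shadow_page *sp = malloc(sizeof(*sp));

		sp->id = i;
		prepare_zap(sp, &invalid_list);
	}
	commit_zap(&invalid_list);	/* single flush instead of one per page */
	return 0;
}

Several callers above (kvm_mmu_unprotect_page(), mmu_unshadow(), kvm_mmu_pte_write()) rely on this split to walk an entire hash bucket before any page is freed or any TLB flush is issued.
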
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
190 TP_ARGS(sp) 190 TP_ARGS(sp)
191); 191);
192 192
193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 TP_PROTO(struct kvm_mmu_page *sp), 194 TP_PROTO(struct kvm_mmu_page *sp),
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
118{ 119{
119 pt_element_t pte; 120 pt_element_t pte;
120 gfn_t table_gfn; 121 gfn_t table_gfn;
121 unsigned index, pt_access, pte_access; 122 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 123 gpa_t pte_gpa;
123 int rsvd_fault = 0; 124 bool eperm, present, rsvd_fault;
124 125
125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault); 127 fetch_fault);
127walk: 128walk:
129 present = true;
130 eperm = rsvd_fault = false;
128 walker->level = vcpu->arch.mmu.root_level; 131 walker->level = vcpu->arch.mmu.root_level;
129 pte = vcpu->arch.cr3; 132 pte = vcpu->arch.cr3;
130#if PTTYPE == 64 133#if PTTYPE == 64
131 if (!is_long_mode(vcpu)) { 134 if (!is_long_mode(vcpu)) {
132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
133 trace_kvm_mmu_paging_element(pte, walker->level); 136 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte)) 137 if (!is_present_gpte(pte)) {
135 goto not_present; 138 present = false;
139 goto error;
140 }
136 --walker->level; 141 --walker->level;
137 } 142 }
138#endif 143#endif
@@ -150,37 +155,42 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 155 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 156 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 157
153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
154 goto not_present; 159 present = false;
160 break;
161 }
155 162
156 trace_kvm_mmu_paging_element(pte, walker->level); 163 trace_kvm_mmu_paging_element(pte, walker->level);
157 164
158 if (!is_present_gpte(pte)) 165 if (!is_present_gpte(pte)) {
159 goto not_present; 166 present = false;
167 break;
168 }
160 169
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
162 if (rsvd_fault) 171 rsvd_fault = true;
163 goto access_error; 172 break;
173 }
164 174
165 if (write_fault && !is_writable_pte(pte)) 175 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 176 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 177 eperm = true;
168 178
169 if (user_fault && !(pte & PT_USER_MASK)) 179 if (user_fault && !(pte & PT_USER_MASK))
170 goto access_error; 180 eperm = true;
171 181
172#if PTTYPE == 64 182#if PTTYPE == 64
173 if (fetch_fault && (pte & PT64_NX_MASK)) 183 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 184 eperm = true;
175#endif 185#endif
176 186
177 if (!(pte & PT_ACCESSED_MASK)) { 187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
178 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 188 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
179 sizeof(pte)); 189 sizeof(pte));
180 mark_page_dirty(vcpu->kvm, table_gfn);
181 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
182 index, pte, pte|PT_ACCESSED_MASK)) 191 index, pte, pte|PT_ACCESSED_MASK))
183 goto walk; 192 goto walk;
193 mark_page_dirty(vcpu->kvm, table_gfn);
184 pte |= PT_ACCESSED_MASK; 194 pte |= PT_ACCESSED_MASK;
185 } 195 }
186 196
@@ -213,15 +223,18 @@ walk:
213 --walker->level; 223 --walker->level;
214 } 224 }
215 225
226 if (!present || eperm || rsvd_fault)
227 goto error;
228
216 if (write_fault && !is_dirty_gpte(pte)) { 229 if (write_fault && !is_dirty_gpte(pte)) {
217 bool ret; 230 bool ret;
218 231
219 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
220 mark_page_dirty(vcpu->kvm, table_gfn);
221 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
222 pte|PT_DIRTY_MASK); 234 pte|PT_DIRTY_MASK);
223 if (ret) 235 if (ret)
224 goto walk; 236 goto walk;
237 mark_page_dirty(vcpu->kvm, table_gfn);
225 pte |= PT_DIRTY_MASK; 238 pte |= PT_DIRTY_MASK;
226 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
227 } 240 }
@@ -229,22 +242,18 @@ walk:
229 walker->pt_access = pt_access; 242 walker->pt_access = pt_access;
230 walker->pte_access = pte_access; 243 walker->pte_access = pte_access;
231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 244 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
232 __func__, (u64)pte, pt_access, pte_access); 245 __func__, (u64)pte, pte_access, pt_access);
233 return 1; 246 return 1;
234 247
235not_present: 248error:
236 walker->error_code = 0; 249 walker->error_code = 0;
237 goto err; 250 if (present)
238 251 walker->error_code |= PFERR_PRESENT_MASK;
239access_error:
240 walker->error_code = PFERR_PRESENT_MASK;
241
242err:
243 if (write_fault) 252 if (write_fault)
244 walker->error_code |= PFERR_WRITE_MASK; 253 walker->error_code |= PFERR_WRITE_MASK;
245 if (user_fault) 254 if (user_fault)
246 walker->error_code |= PFERR_USER_MASK; 255 walker->error_code |= PFERR_USER_MASK;
247 if (fetch_fault) 256 if (fetch_fault && is_nx(vcpu))
248 walker->error_code |= PFERR_FETCH_MASK; 257 walker->error_code |= PFERR_FETCH_MASK;
249 if (rsvd_fault) 258 if (rsvd_fault)
250 walker->error_code |= PFERR_RSVD_MASK; 259 walker->error_code |= PFERR_RSVD_MASK;
@@ -252,7 +261,7 @@ err:
252 return 0; 261 return 0;
253} 262}
254 263
255static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
256 u64 *spte, const void *pte) 265 u64 *spte, const void *pte)
257{ 266{
258 pt_element_t gpte; 267 pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 gpte = *(const pt_element_t *)pte; 272 gpte = *(const pt_element_t *)pte;
264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
265 if (!is_present_gpte(gpte)) { 274 if (!is_present_gpte(gpte)) {
266 if (page->unsync) 275 if (sp->unsync)
267 new_spte = shadow_trap_nonpresent_pte; 276 new_spte = shadow_trap_nonpresent_pte;
268 else 277 else
269 new_spte = shadow_notrap_nonpresent_pte; 278 new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272 return; 281 return;
273 } 282 }
274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
275 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
277 return; 286 return;
278 pfn = vcpu->arch.update_pte.pfn; 287 pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
285 * we call mmu_set_spte() with reset_host_protection = true because 294
286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
287 */ 296 */
288 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
289 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 298 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
290 gpte_to_gfn(gpte), pfn, true, true); 299 gpte_to_gfn(gpte), pfn, true, true);
291} 300}
292 301
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level)
304{
305 int r;
306 pt_element_t curr_pte;
307
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
309 &curr_pte, sizeof(curr_pte));
310 return r || curr_pte != gw->ptes[level - 1];
311}
312
293/* 313/*
294 * Fetch a shadow pte for a specific level in the paging hierarchy. 314 * Fetch a shadow pte for a specific level in the paging hierarchy.
295 */ 315 */
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
299 int *ptwrite, pfn_t pfn) 319 int *ptwrite, pfn_t pfn)
300{ 320{
301 unsigned access = gw->pt_access; 321 unsigned access = gw->pt_access;
302 struct kvm_mmu_page *shadow_page; 322 struct kvm_mmu_page *sp = NULL;
303 u64 spte, *sptep = NULL; 323 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
304 int direct; 324 int top_level;
305 gfn_t table_gfn; 325 unsigned direct_access;
306 int r; 326 struct kvm_shadow_walk_iterator it;
307 int level;
308 pt_element_t curr_pte;
309 struct kvm_shadow_walk_iterator iterator;
310 327
311 if (!is_present_gpte(gw->ptes[gw->level - 1])) 328 if (!is_present_gpte(gw->ptes[gw->level - 1]))
312 return NULL; 329 return NULL;
313 330
314 for_each_shadow_entry(vcpu, addr, iterator) { 331 direct_access = gw->pt_access & gw->pte_access;
315 level = iterator.level; 332 if (!dirty)
316 sptep = iterator.sptep; 333 direct_access &= ~ACC_WRITE_MASK;
317 if (iterator.level == hlevel) {
318 mmu_set_spte(vcpu, sptep, access,
319 gw->pte_access & access,
320 user_fault, write_fault,
321 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
322 ptwrite, level,
323 gw->gfn, pfn, false, true);
324 break;
325 }
326 334
327 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 335 top_level = vcpu->arch.mmu.root_level;
328 continue; 336 if (top_level == PT32E_ROOT_LEVEL)
337 top_level = PT32_ROOT_LEVEL;
338 /*
339 * Verify that the top-level gpte is still there. Since the page
340 * is a root page, it is either write protected (and cannot be
341 * changed from now on) or it is invalid (in which case, we don't
342 * really care if it changes underneath us after this point).
343 */
344 if (FNAME(gpte_changed)(vcpu, gw, top_level))
345 goto out_gpte_changed;
329 346
330 if (is_large_pte(*sptep)) { 347 for (shadow_walk_init(&it, vcpu, addr);
331 rmap_remove(vcpu->kvm, sptep); 348 shadow_walk_okay(&it) && it.level > gw->level;
332 __set_spte(sptep, shadow_trap_nonpresent_pte); 349 shadow_walk_next(&it)) {
333 kvm_flush_remote_tlbs(vcpu->kvm); 350 gfn_t table_gfn;
334 }
335 351
336 if (level <= gw->level) { 352 drop_large_spte(vcpu, it.sptep);
337 int delta = level - gw->level + 1; 353
338 direct = 1; 354 sp = NULL;
339 if (!is_dirty_gpte(gw->ptes[level - delta])) 355 if (!is_shadow_present_pte(*it.sptep)) {
340 access &= ~ACC_WRITE_MASK; 356 table_gfn = gw->table_gfn[it.level - 2];
341 table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 357 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
342 /* advance table_gfn when emulating 1gb pages with 4k */ 358 false, access, it.sptep);
343 if (delta == 0)
344 table_gfn += PT_INDEX(addr, level);
345 access &= gw->pte_access;
346 } else {
347 direct = 0;
348 table_gfn = gw->table_gfn[level - 2];
349 }
350 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
351 direct, access, sptep);
352 if (!direct) {
353 r = kvm_read_guest_atomic(vcpu->kvm,
354 gw->pte_gpa[level - 2],
355 &curr_pte, sizeof(curr_pte));
356 if (r || curr_pte != gw->ptes[level - 2]) {
357 kvm_mmu_put_page(shadow_page, sptep);
358 kvm_release_pfn_clean(pfn);
359 sptep = NULL;
360 break;
361 }
362 } 359 }
363 360
364 spte = __pa(shadow_page->spt) 361 /*
365 | PT_PRESENT_MASK | PT_ACCESSED_MASK 362 * Verify that the gpte in the page we've just write
366 | PT_WRITABLE_MASK | PT_USER_MASK; 363 * protected is still there.
367 *sptep = spte; 364 */
365 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
366 goto out_gpte_changed;
367
368 if (sp)
369 link_shadow_page(it.sptep, sp);
368 } 370 }
369 371
370 return sptep; 372 for (;
373 shadow_walk_okay(&it) && it.level > hlevel;
374 shadow_walk_next(&it)) {
375 gfn_t direct_gfn;
376
377 validate_direct_spte(vcpu, it.sptep, direct_access);
378
379 drop_large_spte(vcpu, it.sptep);
380
381 if (is_shadow_present_pte(*it.sptep))
382 continue;
383
384 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
385
386 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
387 true, direct_access, it.sptep);
388 link_shadow_page(it.sptep, sp);
389 }
390
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true);
394
395 return it.sptep;
396
397out_gpte_changed:
398 if (sp)
399 kvm_mmu_put_page(sp, it.sptep);
400 kvm_release_pfn_clean(pfn);
401 return NULL;
371} 402}
372 403
373/* 404/*
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
432 463
433 /* mmio */ 464 /* mmio */
434 if (is_error_pfn(pfn)) { 465 if (is_error_pfn(pfn))
435 pgprintk("gfn %lx is mmio\n", walker.gfn); 466 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
436 kvm_release_pfn_clean(pfn);
437 return 1;
438 }
439 467
440 spin_lock(&vcpu->kvm->mmu_lock); 468 spin_lock(&vcpu->kvm->mmu_lock);
441 if (mmu_notifier_retry(vcpu, mmu_seq)) 469 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
443 kvm_mmu_free_some_pages(vcpu); 471 kvm_mmu_free_some_pages(vcpu);
444 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
445 level, &write_pt, pfn); 473 level, &write_pt, pfn);
474 (void)sptep;
446 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
447 sptep, *sptep, write_pt); 476 sptep, *sptep, write_pt);
448 477
@@ -464,6 +493,7 @@ out_unlock:
464static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 493static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
465{ 494{
466 struct kvm_shadow_walk_iterator iterator; 495 struct kvm_shadow_walk_iterator iterator;
496 struct kvm_mmu_page *sp;
467 gpa_t pte_gpa = -1; 497 gpa_t pte_gpa = -1;
468 int level; 498 int level;
469 u64 *sptep; 499 u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
475 level = iterator.level; 505 level = iterator.level;
476 sptep = iterator.sptep; 506 sptep = iterator.sptep;
477 507
508 sp = page_header(__pa(sptep));
478 if (is_last_spte(*sptep, level)) { 509 if (is_last_spte(*sptep, level)) {
479 struct kvm_mmu_page *sp = page_header(__pa(sptep));
480 int offset, shift; 510 int offset, shift;
481 511
512 if (!sp->unsync)
513 break;
514
482 shift = PAGE_SHIFT - 515 shift = PAGE_SHIFT -
483 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; 516 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
484 offset = sp->role.quadrant << shift; 517 offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 520 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
488 521
489 if (is_shadow_present_pte(*sptep)) { 522 if (is_shadow_present_pte(*sptep)) {
490 rmap_remove(vcpu->kvm, sptep);
491 if (is_large_pte(*sptep)) 523 if (is_large_pte(*sptep))
492 --vcpu->kvm->stat.lpages; 524 --vcpu->kvm->stat.lpages;
525 drop_spte(vcpu->kvm, sptep,
526 shadow_trap_nonpresent_pte);
493 need_flush = 1; 527 need_flush = 1;
494 } 528 } else
495 __set_spte(sptep, shadow_trap_nonpresent_pte); 529 __set_spte(sptep, shadow_trap_nonpresent_pte);
496 break; 530 break;
497 } 531 }
498 532
499 if (!is_shadow_present_pte(*sptep)) 533 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
500 break; 534 break;
501 } 535 }
502 536
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
570 * Using the cached information from sp->gfns is safe because: 604 * Using the cached information from sp->gfns is safe because:
571 * - The spte has a reference to the struct page, so the pfn for a given gfn 605 * - The spte has a reference to the struct page, so the pfn for a given gfn
572 * can't change unless all sptes pointing to it are nuked first. 606 * can't change unless all sptes pointing to it are nuked first.
573 * - Alias changes zap the entire shadow cache.
574 */ 607 */
575static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
609 bool clear_unsync)
576{ 610{
577 int i, offset, nr_present; 611 int i, offset, nr_present;
578 bool reset_host_protection; 612 bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 614
581 offset = nr_present = 0; 615 offset = nr_present = 0;
582 616
617 /* A direct kvm_mmu_page cannot be unsync. */
618 BUG_ON(sp->role.direct);
619
583 if (PTTYPE == 32) 620 if (PTTYPE == 32)
584 offset = sp->role.quadrant << PT64_LEVEL_BITS; 621 offset = sp->role.quadrant << PT64_LEVEL_BITS;
585 622
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
589 unsigned pte_access; 626 unsigned pte_access;
590 pt_element_t gpte; 627 pt_element_t gpte;
591 gpa_t pte_gpa; 628 gpa_t pte_gpa;
592 gfn_t gfn = sp->gfns[i]; 629 gfn_t gfn;
593 630
594 if (!is_shadow_present_pte(sp->spt[i])) 631 if (!is_shadow_present_pte(sp->spt[i]))
595 continue; 632 continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
600 sizeof(pt_element_t))) 637 sizeof(pt_element_t)))
601 return -EINVAL; 638 return -EINVAL;
602 639
603 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 640 gfn = gpte_to_gfn(gpte);
604 !(gpte & PT_ACCESSED_MASK)) { 641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
605 u64 nonpresent; 644 u64 nonpresent;
606 645
607 rmap_remove(vcpu->kvm, &sp->spt[i]); 646 if (is_present_gpte(gpte) || !clear_unsync)
608 if (is_present_gpte(gpte))
609 nonpresent = shadow_trap_nonpresent_pte; 647 nonpresent = shadow_trap_nonpresent_pte;
610 else 648 else
611 nonpresent = shadow_notrap_nonpresent_pte; 649 nonpresent = shadow_notrap_nonpresent_pte;
612 __set_spte(&sp->spt[i], nonpresent); 650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
613 continue; 651 continue;
614 } 652 }
615 653
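
The walk_addr() rework above drops the per-cause not_present/access_error labels: the walker now records present, eperm and rsvd_fault as it descends and converts them into a single page-fault error code at the error: exit. A small stand-alone sketch of that final accumulation step follows; the PFERR_* names here are local stand-ins that mirror, but are not, the kernel's PFERR_*_MASK definitions.

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-ins for the PFERR_*_MASK bits used in the hunk above */
#define PFERR_PRESENT	(1u << 0)
#define PFERR_WRITE	(1u << 1)
#define PFERR_USER	(1u << 2)
#define PFERR_RSVD	(1u << 3)
#define PFERR_FETCH	(1u << 4)

static unsigned int walk_error_code(bool present, bool rsvd_fault,
				    bool write_fault, bool user_fault,
				    bool fetch_fault, bool nx_enabled)
{
	unsigned int ec = 0;

	if (present)			/* fault on a present entry: a permission fault */
		ec |= PFERR_PRESENT;
	if (write_fault)
		ec |= PFERR_WRITE;
	if (user_fault)
		ec |= PFERR_USER;
	if (fetch_fault && nx_enabled)	/* fetch faults are only reported when NX is on */
		ec |= PFERR_FETCH;
	if (rsvd_fault)
		ec |= PFERR_RSVD;
	return ec;
}

int main(void)
{
	/* e.g. a user-mode write that hit a present but write-protected gpte */
	printf("error code: %#x\n",
	       walk_error_code(true, false, true, true, false, true));
	return 0;
}

Accumulating the flags first is also what allows the accessed-bit update in the walker to be skipped once a fault is already known (the !eperm && !rsvd_fault check above).
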
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..bc5b9b8d4a33 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7 * 8 *
8 * Authors: 9 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {
130 u32 index; /* Index of the MSR */ 131 u32 index; /* Index of the MSR */
131 bool always; /* True if intercept is always on */ 132 bool always; /* True if intercept is always on */
132} direct_access_msrs[] = { 133} direct_access_msrs[] = {
133 { .index = MSR_K6_STAR, .always = true }, 134 { .index = MSR_STAR, .always = true },
134 { .index = MSR_IA32_SYSENTER_CS, .always = true }, 135 { .index = MSR_IA32_SYSENTER_CS, .always = true },
135#ifdef CONFIG_X86_64 136#ifdef CONFIG_X86_64
136 { .index = MSR_GS_BASE, .always = true }, 137 { .index = MSR_GS_BASE, .always = true },
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
285 286
286static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
287{ 288{
289 vcpu->arch.efer = efer;
288 if (!npt_enabled && !(efer & EFER_LMA)) 290 if (!npt_enabled && !(efer & EFER_LMA))
289 efer &= ~EFER_LME; 291 efer &= ~EFER_LME;
290 292
291 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
292 vcpu->arch.efer = efer;
293} 294}
294 295
295static int is_external_interrupt(u32 info) 296static int is_external_interrupt(u32 info)
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)
383 int err; 384 int err;
384 u64 val; 385 u64 val;
385 386
386 /* Only Fam10h is affected */ 387 if (!cpu_has_amd_erratum(amd_erratum_383))
387 if (boot_cpu_data.x86 != 0x10)
388 return; 388 return;
389 389
390 /* Use _safe variants to not break nested virtualization */ 390 /* Use _safe variants to not break nested virtualization */
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
640 640
641 if (nested) { 641 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME); 643 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
644 } 644 }
645 645
646 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)
806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
807 */ 807 */
808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
809 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 809 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
810 810
811 save->cr4 = X86_CR4_PAE; 811 save->cr4 = X86_CR4_PAE;
812 /* rdx = ?? */ 812 /* rdx = ?? */
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
903 svm->asid_generation = 0; 903 svm->asid_generation = 0;
904 init_vmcb(svm); 904 init_vmcb(svm);
905 905
906 fx_init(&svm->vcpu); 906 err = fx_init(&svm->vcpu);
907 if (err)
908 goto free_page4;
909
907 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 910 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
908 if (kvm_vcpu_is_bsp(&svm->vcpu)) 911 if (kvm_vcpu_is_bsp(&svm->vcpu))
909 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 912 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
910 913
911 return &svm->vcpu; 914 return &svm->vcpu;
912 915
916free_page4:
917 __free_page(hsave_page);
913free_page3: 918free_page3:
914 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 919 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
915free_page2: 920free_page2:
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
1488 */ 1493 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1494 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490 1495
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1496 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1492 1497
1493 return; 1498 return;
1494 } 1499 }
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)
1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1540 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1541 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in) 1542 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1543 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
1539 1544
1540 port = io_info >> 16; 1545 port = io_info >> 16;
1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1546 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1962 svm->vmcb->save.cr3 = hsave->save.cr3;
1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1963 svm->vcpu.arch.cr3 = hsave->save.cr3;
1959 } else { 1964 } else {
1960 kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1965 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1961 } 1966 }
1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1967 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 1968 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2085 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2086 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2082 } else 2087 } else
2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2088 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2084 2089
2085 /* Guest paging mode is active - reset mmu */ 2090 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu); 2091 kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)
2386 2391
2387static int invlpg_interception(struct vcpu_svm *svm) 2392static int invlpg_interception(struct vcpu_svm *svm)
2388{ 2393{
2389 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2394 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2390 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2391 return 1;
2392} 2395}
2393 2396
2394static int emulate_on_interception(struct vcpu_svm *svm) 2397static int emulate_on_interception(struct vcpu_svm *svm)
2395{ 2398{
2396 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2397 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2398 return 1;
2399} 2400}
2400 2401
2401static int cr8_write_interception(struct vcpu_svm *svm) 2402static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2431 *data = tsc_offset + native_read_tsc(); 2432 *data = tsc_offset + native_read_tsc();
2432 break; 2433 break;
2433 } 2434 }
2434 case MSR_K6_STAR: 2435 case MSR_STAR:
2435 *data = svm->vmcb->save.star; 2436 *data = svm->vmcb->save.star;
2436 break; 2437 break;
2437#ifdef CONFIG_X86_64 2438#ifdef CONFIG_X86_64
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2555 2556
2556 break; 2557 break;
2557 } 2558 }
2558 case MSR_K6_STAR: 2559 case MSR_STAR:
2559 svm->vmcb->save.star = data; 2560 svm->vmcb->save.star = data;
2560 break; 2561 break;
2561#ifdef CONFIG_X86_64 2562#ifdef CONFIG_X86_64
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2726 [SVM_EXIT_NPF] = pf_interception, 2727 [SVM_EXIT_NPF] = pf_interception,
2727}; 2728};
2728 2729
2730void dump_vmcb(struct kvm_vcpu *vcpu)
2731{
2732 struct vcpu_svm *svm = to_svm(vcpu);
2733 struct vmcb_control_area *control = &svm->vmcb->control;
2734 struct vmcb_save_area *save = &svm->vmcb->save;
2735
2736 pr_err("VMCB Control Area:\n");
2737 pr_err("cr_read: %04x\n", control->intercept_cr_read);
2738 pr_err("cr_write: %04x\n", control->intercept_cr_write);
2739 pr_err("dr_read: %04x\n", control->intercept_dr_read);
2740 pr_err("dr_write: %04x\n", control->intercept_dr_write);
2741 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2742 pr_err("intercepts: %016llx\n", control->intercept);
2743 pr_err("pause filter count: %d\n", control->pause_filter_count);
2744 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
2745 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
2746 pr_err("tsc_offset: %016llx\n", control->tsc_offset);
2747 pr_err("asid: %d\n", control->asid);
2748 pr_err("tlb_ctl: %d\n", control->tlb_ctl);
2749 pr_err("int_ctl: %08x\n", control->int_ctl);
2750 pr_err("int_vector: %08x\n", control->int_vector);
2751 pr_err("int_state: %08x\n", control->int_state);
2752 pr_err("exit_code: %08x\n", control->exit_code);
2753 pr_err("exit_info1: %016llx\n", control->exit_info_1);
2754 pr_err("exit_info2: %016llx\n", control->exit_info_2);
2755 pr_err("exit_int_info: %08x\n", control->exit_int_info);
2756 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
2757 pr_err("nested_ctl: %lld\n", control->nested_ctl);
2758 pr_err("nested_cr3: %016llx\n", control->nested_cr3);
2759 pr_err("event_inj: %08x\n", control->event_inj);
2760 pr_err("event_inj_err: %08x\n", control->event_inj_err);
2761 pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
2762 pr_err("next_rip: %016llx\n", control->next_rip);
2763 pr_err("VMCB State Save Area:\n");
2764 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
2765 save->es.selector, save->es.attrib,
2766 save->es.limit, save->es.base);
2767 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
2768 save->cs.selector, save->cs.attrib,
2769 save->cs.limit, save->cs.base);
2770 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
2771 save->ss.selector, save->ss.attrib,
2772 save->ss.limit, save->ss.base);
2773 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
2774 save->ds.selector, save->ds.attrib,
2775 save->ds.limit, save->ds.base);
2776 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
2777 save->fs.selector, save->fs.attrib,
2778 save->fs.limit, save->fs.base);
2779 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
2780 save->gs.selector, save->gs.attrib,
2781 save->gs.limit, save->gs.base);
2782 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
2783 save->gdtr.selector, save->gdtr.attrib,
2784 save->gdtr.limit, save->gdtr.base);
2785 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
2786 save->ldtr.selector, save->ldtr.attrib,
2787 save->ldtr.limit, save->ldtr.base);
2788 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
2789 save->idtr.selector, save->idtr.attrib,
2790 save->idtr.limit, save->idtr.base);
2791 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
2792 save->tr.selector, save->tr.attrib,
2793 save->tr.limit, save->tr.base);
2794 pr_err("cpl: %d efer: %016llx\n",
2795 save->cpl, save->efer);
2796 pr_err("cr0: %016llx cr2: %016llx\n",
2797 save->cr0, save->cr2);
2798 pr_err("cr3: %016llx cr4: %016llx\n",
2799 save->cr3, save->cr4);
2800 pr_err("dr6: %016llx dr7: %016llx\n",
2801 save->dr6, save->dr7);
2802 pr_err("rip: %016llx rflags: %016llx\n",
2803 save->rip, save->rflags);
2804 pr_err("rsp: %016llx rax: %016llx\n",
2805 save->rsp, save->rax);
2806 pr_err("star: %016llx lstar: %016llx\n",
2807 save->star, save->lstar);
2808 pr_err("cstar: %016llx sfmask: %016llx\n",
2809 save->cstar, save->sfmask);
2810 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
2811 save->kernel_gs_base, save->sysenter_cs);
2812 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
2813 save->sysenter_esp, save->sysenter_eip);
2814 pr_err("gpat: %016llx dbgctl: %016llx\n",
2815 save->g_pat, save->dbgctl);
2816 pr_err("br_from: %016llx br_to: %016llx\n",
2817 save->br_from, save->br_to);
2818 pr_err("excp_from: %016llx excp_to: %016llx\n",
2819 save->last_excp_from, save->last_excp_to);
2820
2821}
2822
2729static int handle_exit(struct kvm_vcpu *vcpu) 2823static int handle_exit(struct kvm_vcpu *vcpu)
2730{ 2824{
2731 struct vcpu_svm *svm = to_svm(vcpu); 2825 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2864 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2771 kvm_run->fail_entry.hardware_entry_failure_reason 2865 kvm_run->fail_entry.hardware_entry_failure_reason
2772 = svm->vmcb->control.exit_code; 2866 = svm->vmcb->control.exit_code;
2867 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
2868 dump_vmcb(vcpu);
2773 return 0; 2869 return 0;
2774 } 2870 }
2775 2871
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2826{ 2922{
2827 struct vmcb_control_area *control; 2923 struct vmcb_control_area *control;
2828 2924
2829 trace_kvm_inj_virq(irq);
2830
2831 ++svm->vcpu.stat.irq_injections;
2832 control = &svm->vmcb->control; 2925 control = &svm->vmcb->control;
2833 control->int_vector = irq; 2926 control->int_vector = irq;
2834 control->int_ctl &= ~V_INTR_PRIO_MASK; 2927 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
2842 2935
2843 BUG_ON(!(gif_set(svm))); 2936 BUG_ON(!(gif_set(svm)));
2844 2937
2938 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
2939 ++vcpu->stat.irq_injections;
2940
2845 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2941 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2846 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 2942 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2847} 2943}
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)
3327 return false; 3423 return false;
3328} 3424}
3329 3425
3426static bool svm_has_wbinvd_exit(void)
3427{
3428 return true;
3429}
3430
3330static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3431static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3331{ 3432{
3332 struct vcpu_svm *svm = to_svm(vcpu); 3433 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
3411 .rdtscp_supported = svm_rdtscp_supported, 3512 .rdtscp_supported = svm_rdtscp_supported,
3412 3513
3413 .set_supported_cpuid = svm_set_supported_cpuid, 3514 .set_supported_cpuid = svm_set_supported_cpuid,
3515
3516 .has_wbinvd_exit = svm_has_wbinvd_exit,
3414}; 3517};
3415 3518
3416static int __init svm_init(void) 3519static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
1#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
2#include <linux/kvm.h> 16#include <linux/kvm.h>
3#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 32 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
19 atomic_inc(&ktimer->pending); 33 atomic_inc(&ktimer->pending);
20 /* FIXME: this code should not know anything about vcpus */ 34 /* FIXME: this code should not know anything about vcpus */
21 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 35 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
22 } 36 }
23 37
24 if (waitqueue_active(q)) 38 if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..49b25eee25ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
36#include <asm/vmx.h> 37#include <asm/vmx.h>
37#include <asm/virtext.h> 38#include <asm/virtext.h>
38#include <asm/mce.h> 39#include <asm/mce.h>
40#include <asm/i387.h>
41#include <asm/xcr.h>
39 42
40#include "trace.h" 43#include "trace.h"
41 44
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
63static int __read_mostly emulate_invalid_guest_state = 0; 66static int __read_mostly emulate_invalid_guest_state = 0;
64module_param(emulate_invalid_guest_state, bool, S_IRUGO); 67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
65 68
69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO);
71
66#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
67 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
68#define KVM_GUEST_CR0_MASK \ 74#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
173 179
174static int init_rmode(struct kvm *kvm); 180static int init_rmode(struct kvm *kvm);
175static u64 construct_eptp(unsigned long root_hpa); 181static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void);
176 184
177static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
178static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
179static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
180 189
181static unsigned long *vmx_io_bitmap_a; 190static unsigned long *vmx_io_bitmap_a;
182static unsigned long *vmx_io_bitmap_b; 191static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +240,14 @@ static u64 host_efer;
231static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 240static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
232 241
233/* 242/*
234 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 243 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
235 * away by decrementing the array size. 244 * away by decrementing the array size.
236 */ 245 */
237static const u32 vmx_msr_index[] = { 246static const u32 vmx_msr_index[] = {
238#ifdef CONFIG_X86_64 247#ifdef CONFIG_X86_64
239 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 248 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
240#endif 249#endif
241 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, 250 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
242}; 251};
243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 252#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
244 253
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 343 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
335} 344}
336 345
346static inline bool cpu_has_vmx_ept_4levels(void)
347{
348 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
349}
350
337static inline bool cpu_has_vmx_invept_individual_addr(void) 351static inline bool cpu_has_vmx_invept_individual_addr(void)
338{ 352{
339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 353 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 363 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
350} 364}
351 365
366static inline bool cpu_has_vmx_invvpid_single(void)
367{
368 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
369}
370
371static inline bool cpu_has_vmx_invvpid_global(void)
372{
373 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
374}
375
352static inline bool cpu_has_vmx_ept(void) 376static inline bool cpu_has_vmx_ept(void)
353{ 377{
354 return vmcs_config.cpu_based_2nd_exec_ctrl & 378 return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 413 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
390} 414}
391 415
416static inline bool cpu_has_vmx_wbinvd_exit(void)
417{
418 return vmcs_config.cpu_based_2nd_exec_ctrl &
419 SECONDARY_EXEC_WBINVD_EXITING;
420}
421
392static inline bool report_flexpriority(void) 422static inline bool report_flexpriority(void)
393{ 423{
394 return flexpriority_enabled; 424 return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
453 vmcs, phys_addr); 483 vmcs, phys_addr);
454} 484}
455 485
486static void vmcs_load(struct vmcs *vmcs)
487{
488 u64 phys_addr = __pa(vmcs);
489 u8 error;
490
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory");
494 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
496 vmcs, phys_addr);
497}
498
456static void __vcpu_clear(void *arg) 499static void __vcpu_clear(void *arg)
457{ 500{
458 struct vcpu_vmx *vmx = arg; 501 struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 518 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
476} 519}
477 520
478static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 521static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
479{ 522{
480 if (vmx->vpid == 0) 523 if (vmx->vpid == 0)
481 return; 524 return;
482 525
483 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 526 if (cpu_has_vmx_invvpid_single())
527 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
528}
529
530static inline void vpid_sync_vcpu_global(void)
531{
532 if (cpu_has_vmx_invvpid_global())
533 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
534}
535
536static inline void vpid_sync_context(struct vcpu_vmx *vmx)
537{
538 if (cpu_has_vmx_invvpid_single())
539 vpid_sync_vcpu_single(vmx);
540 else
541 vpid_sync_vcpu_global();
484} 542}
485 543
486static inline void ept_sync_global(void) 544static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 870 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
813 } 871 }
814#endif 872#endif
873 if (current_thread_info()->status & TS_USEDFPU)
874 clts();
875 load_gdt(&__get_cpu_var(host_gdt));
815} 876}
816 877
817static void vmx_load_host_state(struct vcpu_vmx *vmx) 878static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
828static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
829{ 890{
830 struct vcpu_vmx *vmx = to_vmx(vcpu); 891 struct vcpu_vmx *vmx = to_vmx(vcpu);
831 u64 phys_addr = __pa(vmx->vmcs);
832 u64 tsc_this, delta, new_offset; 892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
833 894
834 if (vcpu->cpu != cpu) { 895 if (!vmm_exclusive)
896 kvm_cpu_vmxon(phys_addr);
897 else if (vcpu->cpu != cpu)
835 vcpu_clear(vmx); 898 vcpu_clear(vmx);
836 kvm_migrate_timers(vcpu);
837 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
838 local_irq_disable();
839 list_add(&vmx->local_vcpus_link,
840 &per_cpu(vcpus_on_cpu, cpu));
841 local_irq_enable();
842 }
843 899
844 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 900 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
845 u8 error;
846
847 per_cpu(current_vmcs, cpu) = vmx->vmcs; 901 per_cpu(current_vmcs, cpu) = vmx->vmcs;
848 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 902 vmcs_load(vmx->vmcs);
849 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
850 : "cc");
851 if (error)
852 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
853 vmx->vmcs, phys_addr);
854 } 903 }
855 904
856 if (vcpu->cpu != cpu) { 905 if (vcpu->cpu != cpu) {
857 struct desc_ptr dt; 906 struct desc_ptr dt;
858 unsigned long sysenter_esp; 907 unsigned long sysenter_esp;
859 908
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable();
912 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable();
915
860 vcpu->cpu = cpu; 916 vcpu->cpu = cpu;
861 /* 917 /*
862 * Linux uses per-cpu TSS and GDT, so set these when switching 918 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 940static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
885{ 941{
886 __vmx_load_host_state(to_vmx(vcpu)); 942 __vmx_load_host_state(to_vmx(vcpu));
943 if (!vmm_exclusive) {
944 __vcpu_clear(to_vmx(vcpu));
945 kvm_cpu_vmxoff();
946 }
887} 947}
888 948
889static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 949static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1057 if (index >= 0 && vmx->rdtscp_enabled) 1117 if (index >= 0 && vmx->rdtscp_enabled)
1058 move_msr_up(vmx, index, save_nmsrs++); 1118 move_msr_up(vmx, index, save_nmsrs++);
1059 /* 1119 /*
1060 * MSR_K6_STAR is only needed on long mode guests, and only 1120 * MSR_STAR is only needed on long mode guests, and only
1061 * if efer.sce is enabled. 1121 * if efer.sce is enabled.
1062 */ 1122 */
1063 index = __find_msr_index(vmx, MSR_K6_STAR); 1123 index = __find_msr_index(vmx, MSR_STAR);
1064 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 1124 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1065 move_msr_up(vmx, index, save_nmsrs++); 1125 move_msr_up(vmx, index, save_nmsrs++);
1066 } 1126 }
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
1286 /* locked but not enabled */ 1346 /* locked but not enabled */
1287} 1347}
1288 1348
1349static void kvm_cpu_vmxon(u64 addr)
1350{
1351 asm volatile (ASM_VMX_VMXON_RAX
1352 : : "a"(&addr), "m"(addr)
1353 : "memory", "cc");
1354}
1355
1289static int hardware_enable(void *garbage) 1356static int hardware_enable(void *garbage)
1290{ 1357{
1291 int cpu = raw_smp_processor_id(); 1358 int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1375 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1309 } 1376 }
1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1377 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1311 asm volatile (ASM_VMX_VMXON_RAX
1312 : : "a"(&phys_addr), "m"(phys_addr)
1313 : "memory", "cc");
1314 1378
1315 ept_sync_global(); 1379 if (vmm_exclusive) {
1380 kvm_cpu_vmxon(phys_addr);
1381 ept_sync_global();
1382 }
1383
1384 store_gdt(&__get_cpu_var(host_gdt));
1316 1385
1317 return 0; 1386 return 0;
1318} 1387}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
1334static void kvm_cpu_vmxoff(void) 1403static void kvm_cpu_vmxoff(void)
1335{ 1404{
1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1405 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1337 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1338} 1406}
1339 1407
1340static void hardware_disable(void *garbage) 1408static void hardware_disable(void *garbage)
1341{ 1409{
1342 vmclear_local_vcpus(); 1410 if (vmm_exclusive) {
1343 kvm_cpu_vmxoff(); 1411 vmclear_local_vcpus();
1412 kvm_cpu_vmxoff();
1413 }
1414 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1344} 1415}
1345 1416
1346static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1417static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
1539 if (!cpu_has_vmx_vpid()) 1610 if (!cpu_has_vmx_vpid())
1540 enable_vpid = 0; 1611 enable_vpid = 0;
1541 1612
1542 if (!cpu_has_vmx_ept()) { 1613 if (!cpu_has_vmx_ept() ||
1614 !cpu_has_vmx_ept_4levels()) {
1543 enable_ept = 0; 1615 enable_ept = 0;
1544 enable_unrestricted_guest = 0; 1616 enable_unrestricted_guest = 0;
1545 } 1617 }
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1628 gfn_t base_gfn; 1700 gfn_t base_gfn;
1629 1701
1630 slots = kvm_memslots(kvm); 1702 slots = kvm_memslots(kvm);
1631 base_gfn = kvm->memslots->memslots[0].base_gfn + 1703 base_gfn = slots->memslots[0].base_gfn +
1632 kvm->memslots->memslots[0].npages - 3; 1704 kvm->memslots->memslots[0].npages - 3;
1633 return base_gfn << PAGE_SHIFT; 1705 return base_gfn << PAGE_SHIFT;
1634 } 1706 }
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1759 1831
1760static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1832static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1761{ 1833{
1762 vpid_sync_vcpu_all(to_vmx(vcpu)); 1834 vpid_sync_context(to_vmx(vcpu));
1763 if (enable_ept) 1835 if (enable_ept) {
1836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1837 return;
1764 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1838 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1839 }
1765} 1840}
1766 1841
1767static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1842static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2582 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2508 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2583 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2509 2584
2510 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2585 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2511 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2586 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2512 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2587 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2513 2588
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2599 2674
2600static int init_rmode(struct kvm *kvm) 2675static int init_rmode(struct kvm *kvm)
2601{ 2676{
2677 int idx, ret = 0;
2678
2679 idx = srcu_read_lock(&kvm->srcu);
2602 if (!init_rmode_tss(kvm)) 2680 if (!init_rmode_tss(kvm))
2603 return 0; 2681 goto exit;
2604 if (!init_rmode_identity_map(kvm)) 2682 if (!init_rmode_identity_map(kvm))
2605 return 0; 2683 goto exit;
2606 return 1; 2684
2685 ret = 1;
2686exit:
2687 srcu_read_unlock(&kvm->srcu, idx);
2688 return ret;
2607} 2689}
2608 2690
2609static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2691static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2610{ 2692{
2611 struct vcpu_vmx *vmx = to_vmx(vcpu); 2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2612 u64 msr; 2694 u64 msr;
2613 int ret, idx; 2695 int ret;
2614 2696
2615 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2697 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2616 idx = srcu_read_lock(&vcpu->kvm->srcu);
2617 if (!init_rmode(vmx->vcpu.kvm)) { 2698 if (!init_rmode(vmx->vcpu.kvm)) {
2618 ret = -ENOMEM; 2699 ret = -ENOMEM;
2619 goto out; 2700 goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2630 msr |= MSR_IA32_APICBASE_BSP; 2711 msr |= MSR_IA32_APICBASE_BSP;
2631 kvm_set_apic_base(&vmx->vcpu, msr); 2712 kvm_set_apic_base(&vmx->vcpu, msr);
2632 2713
2633 fx_init(&vmx->vcpu); 2714 ret = fx_init(&vmx->vcpu);
2715 if (ret != 0)
2716 goto out;
2634 2717
2635 seg_setup(VCPU_SREG_CS); 2718 seg_setup(VCPU_SREG_CS);
2636 /* 2719 /*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2713 vmx_fpu_activate(&vmx->vcpu); 2796 vmx_fpu_activate(&vmx->vcpu);
2714 update_exception_bitmap(&vmx->vcpu); 2797 update_exception_bitmap(&vmx->vcpu);
2715 2798
2716 vpid_sync_vcpu_all(vmx); 2799 vpid_sync_context(vmx);
2717 2800
2718 ret = 0; 2801 ret = 0;
2719 2802
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2721 vmx->emulation_required = 0; 2804 vmx->emulation_required = 0;
2722 2805
2723out: 2806out:
2724 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2725 return ret; 2807 return ret;
2726} 2808}
2727 2809
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2826{ 2908{
2827 if (!cpu_has_virtual_nmis()) 2909 if (!cpu_has_virtual_nmis())
2828 return to_vmx(vcpu)->soft_vnmi_blocked; 2910 return to_vmx(vcpu)->soft_vnmi_blocked;
2829 else 2911 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2830 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2831 GUEST_INTR_STATE_NMI);
2832} 2912}
2833 2913
2834static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 2914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3070 ++vcpu->stat.io_exits; 3150 ++vcpu->stat.io_exits;
3071 3151
3072 if (string || in) 3152 if (string || in)
3073 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 3153 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3074 3154
3075 port = exit_qualification >> 16; 3155 port = exit_qualification >> 16;
3076 size = (exit_qualification & 7) + 1; 3156 size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3090 hypercall[2] = 0xc1; 3170 hypercall[2] = 0xc1;
3091} 3171}
3092 3172
3173static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3174{
3175 if (err)
3176 kvm_inject_gp(vcpu, 0);
3177 else
3178 skip_emulated_instruction(vcpu);
3179}
3180
3093static int handle_cr(struct kvm_vcpu *vcpu) 3181static int handle_cr(struct kvm_vcpu *vcpu)
3094{ 3182{
3095 unsigned long exit_qualification, val; 3183 unsigned long exit_qualification, val;
3096 int cr; 3184 int cr;
3097 int reg; 3185 int reg;
3186 int err;
3098 3187
3099 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3188 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3100 cr = exit_qualification & 15; 3189 cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3105 trace_kvm_cr_write(cr, val); 3194 trace_kvm_cr_write(cr, val);
3106 switch (cr) { 3195 switch (cr) {
3107 case 0: 3196 case 0:
3108 kvm_set_cr0(vcpu, val); 3197 err = kvm_set_cr0(vcpu, val);
3109 skip_emulated_instruction(vcpu); 3198 complete_insn_gp(vcpu, err);
3110 return 1; 3199 return 1;
3111 case 3: 3200 case 3:
3112 kvm_set_cr3(vcpu, val); 3201 err = kvm_set_cr3(vcpu, val);
3113 skip_emulated_instruction(vcpu); 3202 complete_insn_gp(vcpu, err);
3114 return 1; 3203 return 1;
3115 case 4: 3204 case 4:
3116 kvm_set_cr4(vcpu, val); 3205 err = kvm_set_cr4(vcpu, val);
3117 skip_emulated_instruction(vcpu); 3206 complete_insn_gp(vcpu, err);
3118 return 1; 3207 return 1;
3119 case 8: { 3208 case 8: {
3120 u8 cr8_prev = kvm_get_cr8(vcpu); 3209 u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
3321static int handle_wbinvd(struct kvm_vcpu *vcpu) 3410static int handle_wbinvd(struct kvm_vcpu *vcpu)
3322{ 3411{
3323 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3324 /* TODO: Add support for VT-d/pass-through device */ 3413 kvm_emulate_wbinvd(vcpu);
3325 return 1; 3414 return 1;
3326} 3415}
3327 3416
3328static int handle_apic_access(struct kvm_vcpu *vcpu) 3417static int handle_xsetbv(struct kvm_vcpu *vcpu)
3329{ 3418{
3330 unsigned long exit_qualification; 3419 u64 new_bv = kvm_read_edx_eax(vcpu);
3331 enum emulation_result er; 3420 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3332 unsigned long offset;
3333 3421
3334 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3422 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3335 offset = exit_qualification & 0xffful; 3423 skip_emulated_instruction(vcpu);
3336
3337 er = emulate_instruction(vcpu, 0, 0, 0);
3338
3339 if (er != EMULATE_DONE) {
3340 printk(KERN_ERR
3341 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3342 offset);
3343 return -ENOEXEC;
3344 }
3345 return 1; 3424 return 1;
3346} 3425}
3347 3426
3427static int handle_apic_access(struct kvm_vcpu *vcpu)
3428{
3429 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3430}
3431
3348static int handle_task_switch(struct kvm_vcpu *vcpu) 3432static int handle_task_switch(struct kvm_vcpu *vcpu)
3349{ 3433{
3350 struct vcpu_vmx *vmx = to_vmx(vcpu); 3434 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3554 goto out; 3638 goto out;
3555 } 3639 }
3556 3640
3557 if (err != EMULATE_DONE) { 3641 if (err != EMULATE_DONE)
3558 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3642 return 0;
3559 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3560 vcpu->run->internal.ndata = 0;
3561 ret = 0;
3562 goto out;
3563 }
3564 3643
3565 if (signal_pending(current)) 3644 if (signal_pending(current))
3566 goto out; 3645 goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3623 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3702 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3624 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3703 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3625 [EXIT_REASON_WBINVD] = handle_wbinvd, 3704 [EXIT_REASON_WBINVD] = handle_wbinvd,
3705 [EXIT_REASON_XSETBV] = handle_xsetbv,
3626 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3627 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3628 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3708 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3656 if (enable_ept && is_paging(vcpu)) 3736 if (enable_ept && is_paging(vcpu))
3657 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3737 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3658 3738
3739 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3740 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3741 vcpu->run->fail_entry.hardware_entry_failure_reason
3742 = exit_reason;
3743 return 0;
3744 }
3745
3659 if (unlikely(vmx->fail)) { 3746 if (unlikely(vmx->fail)) {
3660 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3747 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3661 vcpu->run->fail_entry.hardware_entry_failure_reason 3748 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3861 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3948 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3862 vmx_set_interrupt_shadow(vcpu, 0); 3949 vmx_set_interrupt_shadow(vcpu, 0);
3863 3950
3864 /*
3865 * Loading guest fpu may have cleared host cr0.ts
3866 */
3867 vmcs_writel(HOST_CR0, read_cr0());
3868
3869 asm( 3951 asm(
3870 /* Store host registers */ 3952 /* Store host registers */
3871 "push %%"R"dx; push %%"R"bp;" 3953 "push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4001 kmem_cache_free(kvm_vcpu_cache, vmx); 4083 kmem_cache_free(kvm_vcpu_cache, vmx);
4002} 4084}
4003 4085
4086static inline void vmcs_init(struct vmcs *vmcs)
4087{
4088 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4089
4090 if (!vmm_exclusive)
4091 kvm_cpu_vmxon(phys_addr);
4092
4093 vmcs_clear(vmcs);
4094
4095 if (!vmm_exclusive)
4096 kvm_cpu_vmxoff();
4097}
4098
4004static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 4099static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4005{ 4100{
4006 int err; 4101 int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4026 if (!vmx->vmcs) 4121 if (!vmx->vmcs)
4027 goto free_msrs; 4122 goto free_msrs;
4028 4123
4029 vmcs_clear(vmx->vmcs); 4124 vmcs_init(vmx->vmcs);
4030 4125
4031 cpu = get_cpu(); 4126 cpu = get_cpu();
4032 vmx_vcpu_load(&vmx->vcpu, cpu); 4127 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4265 .rdtscp_supported = vmx_rdtscp_supported, 4360 .rdtscp_supported = vmx_rdtscp_supported,
4266 4361
4267 .set_supported_cpuid = vmx_set_supported_cpuid, 4362 .set_supported_cpuid = vmx_set_supported_cpuid,
4363
4364 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4268}; 4365};
4269 4366
4270static int __init vmx_init(void) 4367static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..25f19078b321 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 10 *
10 * Authors: 11 * Authors:
11 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
41#include <linux/srcu.h> 42#include <linux/srcu.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h>
44#include <trace/events/kvm.h> 46#include <trace/events/kvm.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include "trace.h" 49#include "trace.h"
48 50
49#include <asm/debugreg.h> 51#include <asm/debugreg.h>
50#include <asm/uaccess.h>
51#include <asm/msr.h> 52#include <asm/msr.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
53#include <asm/mtrr.h> 54#include <asm/mtrr.h>
54#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h>
57#include <asm/xcr.h>
55 58
56#define MAX_IO_MSRS 256 59#define MAX_IO_MSRS 256
57#define CR0_RESERVED_BITS \ 60#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
62 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
63 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
64 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
65 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
66 70
67#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147 { NULL } 151 { NULL }
148}; 152};
149 153
154u64 __read_mostly host_xcr0;
155
156static inline u32 bit(int bitno)
157{
158 return 1 << (bitno & 31);
159}
160
150static void kvm_on_user_return(struct user_return_notifier *urn) 161static void kvm_on_user_return(struct user_return_notifier *urn)
151{ 162{
152 unsigned slot; 163 unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
285 prev_nr = vcpu->arch.exception.nr; 296 prev_nr = vcpu->arch.exception.nr;
286 if (prev_nr == DF_VECTOR) { 297 if (prev_nr == DF_VECTOR) {
287 /* triple fault -> shutdown */ 298 /* triple fault -> shutdown */
288 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 299 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
289 return; 300 return;
290 } 301 }
291 class1 = exception_class(prev_nr); 302 class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
414 return changed; 425 return changed;
415} 426}
416 427
417void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 428int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
418{ 429{
430 unsigned long old_cr0 = kvm_read_cr0(vcpu);
431 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
432 X86_CR0_CD | X86_CR0_NW;
433
419 cr0 |= X86_CR0_ET; 434 cr0 |= X86_CR0_ET;
420 435
421#ifdef CONFIG_X86_64 436#ifdef CONFIG_X86_64
422 if (cr0 & 0xffffffff00000000UL) { 437 if (cr0 & 0xffffffff00000000UL)
423 kvm_inject_gp(vcpu, 0); 438 return 1;
424 return;
425 }
426#endif 439#endif
427 440
428 cr0 &= ~CR0_RESERVED_BITS; 441 cr0 &= ~CR0_RESERVED_BITS;
429 442
430 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
431 kvm_inject_gp(vcpu, 0); 444 return 1;
432 return;
433 }
434 445
435 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
436 kvm_inject_gp(vcpu, 0); 447 return 1;
437 return;
438 }
439 448
440 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 449 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
441#ifdef CONFIG_X86_64 450#ifdef CONFIG_X86_64
442 if ((vcpu->arch.efer & EFER_LME)) { 451 if ((vcpu->arch.efer & EFER_LME)) {
443 int cs_db, cs_l; 452 int cs_db, cs_l;
444 453
445 if (!is_pae(vcpu)) { 454 if (!is_pae(vcpu))
446 kvm_inject_gp(vcpu, 0); 455 return 1;
447 return;
448 }
449 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 456 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
450 if (cs_l) { 457 if (cs_l)
451 kvm_inject_gp(vcpu, 0); 458 return 1;
452 return;
453
454 }
455 } else 459 } else
456#endif 460#endif
457 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
458 kvm_inject_gp(vcpu, 0); 462 return 1;
459 return;
460 }
461
462 } 463 }
463 464
464 kvm_x86_ops->set_cr0(vcpu, cr0); 465 kvm_x86_ops->set_cr0(vcpu, cr0);
465 466
466 kvm_mmu_reset_context(vcpu); 467 if ((cr0 ^ old_cr0) & update_bits)
467 return; 468 kvm_mmu_reset_context(vcpu);
469 return 0;
468} 470}
469EXPORT_SYMBOL_GPL(kvm_set_cr0); 471EXPORT_SYMBOL_GPL(kvm_set_cr0);
470 472
471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 473void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
472{ 474{
473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 475 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
474} 476}
475EXPORT_SYMBOL_GPL(kvm_lmsw); 477EXPORT_SYMBOL_GPL(kvm_lmsw);
476 478
477void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 479int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
478{ 480{
479 unsigned long old_cr4 = kvm_read_cr4(vcpu); 481 u64 xcr0;
480 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
481 482
482 if (cr4 & CR4_RESERVED_BITS) { 483 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
484 if (index != XCR_XFEATURE_ENABLED_MASK)
485 return 1;
486 xcr0 = xcr;
487 if (kvm_x86_ops->get_cpl(vcpu) != 0)
488 return 1;
489 if (!(xcr0 & XSTATE_FP))
490 return 1;
491 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
492 return 1;
493 if (xcr0 & ~host_xcr0)
494 return 1;
495 vcpu->arch.xcr0 = xcr0;
496 vcpu->guest_xcr0_loaded = 0;
497 return 0;
498}
499
500int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
501{
502 if (__kvm_set_xcr(vcpu, index, xcr)) {
483 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
504 return 1;
505 }
506 return 0;
507}
508EXPORT_SYMBOL_GPL(kvm_set_xcr);
509
510static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
511{
512 struct kvm_cpuid_entry2 *best;
513
514 best = kvm_find_cpuid_entry(vcpu, 1, 0);
515 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
516}
517
518static void update_cpuid(struct kvm_vcpu *vcpu)
519{
520 struct kvm_cpuid_entry2 *best;
521
522 best = kvm_find_cpuid_entry(vcpu, 1, 0);
523 if (!best)
484 return; 524 return;
525
526 /* Update OSXSAVE bit */
527 if (cpu_has_xsave && best->function == 0x1) {
528 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
529 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
530 best->ecx |= bit(X86_FEATURE_OSXSAVE);
485 } 531 }
532}
533
534int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
535{
536 unsigned long old_cr4 = kvm_read_cr4(vcpu);
537 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
538
539 if (cr4 & CR4_RESERVED_BITS)
540 return 1;
541
542 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
543 return 1;
486 544
487 if (is_long_mode(vcpu)) { 545 if (is_long_mode(vcpu)) {
488 if (!(cr4 & X86_CR4_PAE)) { 546 if (!(cr4 & X86_CR4_PAE))
489 kvm_inject_gp(vcpu, 0); 547 return 1;
490 return;
491 }
492 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
493 && ((cr4 ^ old_cr4) & pdptr_bits) 549 && ((cr4 ^ old_cr4) & pdptr_bits)
494 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 550 && !load_pdptrs(vcpu, vcpu->arch.cr3))
495 kvm_inject_gp(vcpu, 0); 551 return 1;
496 return; 552
497 } 553 if (cr4 & X86_CR4_VMXE)
554 return 1;
498 555
499 if (cr4 & X86_CR4_VMXE) {
500 kvm_inject_gp(vcpu, 0);
501 return;
502 }
503 kvm_x86_ops->set_cr4(vcpu, cr4); 556 kvm_x86_ops->set_cr4(vcpu, cr4);
504 vcpu->arch.cr4 = cr4; 557
505 kvm_mmu_reset_context(vcpu); 558 if ((cr4 ^ old_cr4) & pdptr_bits)
559 kvm_mmu_reset_context(vcpu);
560
561 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
562 update_cpuid(vcpu);
563
564 return 0;
506} 565}
507EXPORT_SYMBOL_GPL(kvm_set_cr4); 566EXPORT_SYMBOL_GPL(kvm_set_cr4);
508 567
509void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
510{ 569{
511 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
512 kvm_mmu_sync_roots(vcpu); 571 kvm_mmu_sync_roots(vcpu);
513 kvm_mmu_flush_tlb(vcpu); 572 kvm_mmu_flush_tlb(vcpu);
514 return; 573 return 0;
515 } 574 }
516 575
517 if (is_long_mode(vcpu)) { 576 if (is_long_mode(vcpu)) {
518 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 577 if (cr3 & CR3_L_MODE_RESERVED_BITS)
519 kvm_inject_gp(vcpu, 0); 578 return 1;
520 return;
521 }
522 } else { 579 } else {
523 if (is_pae(vcpu)) { 580 if (is_pae(vcpu)) {
524 if (cr3 & CR3_PAE_RESERVED_BITS) { 581 if (cr3 & CR3_PAE_RESERVED_BITS)
525 kvm_inject_gp(vcpu, 0); 582 return 1;
526 return; 583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
527 } 584 return 1;
528 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
529 kvm_inject_gp(vcpu, 0);
530 return;
531 }
532 } 585 }
533 /* 586 /*
534 * We don't check reserved bits in nonpae mode, because 587 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
546 * to debug) behavior on the guest side. 599 * to debug) behavior on the guest side.
547 */ 600 */
548 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
549 kvm_inject_gp(vcpu, 0); 602 return 1;
550 else { 603 vcpu->arch.cr3 = cr3;
551 vcpu->arch.cr3 = cr3; 604 vcpu->arch.mmu.new_cr3(vcpu);
552 vcpu->arch.mmu.new_cr3(vcpu); 605 return 0;
553 }
554} 606}
555EXPORT_SYMBOL_GPL(kvm_set_cr3); 607EXPORT_SYMBOL_GPL(kvm_set_cr3);
556 608
557void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
558{ 610{
559 if (cr8 & CR8_RESERVED_BITS) { 611 if (cr8 & CR8_RESERVED_BITS)
560 kvm_inject_gp(vcpu, 0); 612 return 1;
561 return;
562 }
563 if (irqchip_in_kernel(vcpu->kvm)) 613 if (irqchip_in_kernel(vcpu->kvm))
564 kvm_lapic_set_tpr(vcpu, cr8); 614 kvm_lapic_set_tpr(vcpu, cr8);
565 else 615 else
566 vcpu->arch.cr8 = cr8; 616 vcpu->arch.cr8 = cr8;
617 return 0;
618}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
567} 624}
568EXPORT_SYMBOL_GPL(kvm_set_cr8); 625EXPORT_SYMBOL_GPL(kvm_set_cr8);
569 626
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
576} 633}
577EXPORT_SYMBOL_GPL(kvm_get_cr8); 634EXPORT_SYMBOL_GPL(kvm_get_cr8);
578 635
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 636static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{ 637{
581 switch (dr) { 638 switch (dr) {
582 case 0 ... 3: 639 case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
585 vcpu->arch.eff_db[dr] = val; 642 vcpu->arch.eff_db[dr] = val;
586 break; 643 break;
587 case 4: 644 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 645 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
589 kvm_queue_exception(vcpu, UD_VECTOR); 646 return 1; /* #UD */
590 return 1;
591 }
592 /* fall through */ 647 /* fall through */
593 case 6: 648 case 6:
594 if (val & 0xffffffff00000000ULL) { 649 if (val & 0xffffffff00000000ULL)
595 kvm_inject_gp(vcpu, 0); 650 return -1; /* #GP */
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 651 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break; 652 break;
600 case 5: 653 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 654 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
602 kvm_queue_exception(vcpu, UD_VECTOR); 655 return 1; /* #UD */
603 return 1;
604 }
605 /* fall through */ 656 /* fall through */
606 default: /* 7 */ 657 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) { 658 if (val & 0xffffffff00000000ULL)
608 kvm_inject_gp(vcpu, 0); 659 return -1; /* #GP */
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 660 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 661 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 662 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
618 667
619 return 0; 668 return 0;
620} 669}
670
671int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
672{
673 int res;
674
675 res = __kvm_set_dr(vcpu, dr, val);
676 if (res > 0)
677 kvm_queue_exception(vcpu, UD_VECTOR);
678 else if (res < 0)
679 kvm_inject_gp(vcpu, 0);
680
681 return res;
682}
621EXPORT_SYMBOL_GPL(kvm_set_dr); 683EXPORT_SYMBOL_GPL(kvm_set_dr);
622 684
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 685static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{ 686{
625 switch (dr) { 687 switch (dr) {
626 case 0 ... 3: 688 case 0 ... 3:
627 *val = vcpu->arch.db[dr]; 689 *val = vcpu->arch.db[dr];
628 break; 690 break;
629 case 4: 691 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 692 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1; 693 return 1;
633 }
634 /* fall through */ 694 /* fall through */
635 case 6: 695 case 6:
636 *val = vcpu->arch.dr6; 696 *val = vcpu->arch.dr6;
637 break; 697 break;
638 case 5: 698 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 699 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1; 700 return 1;
642 }
643 /* fall through */ 701 /* fall through */
644 default: /* 7 */ 702 default: /* 7 */
645 *val = vcpu->arch.dr7; 703 *val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
648 706
649 return 0; 707 return 0;
650} 708}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652 709
653static inline u32 bit(int bitno) 710int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
654{ 711{
655 return 1 << (bitno & 31); 712 if (_kvm_get_dr(vcpu, dr, val)) {
713 kvm_queue_exception(vcpu, UD_VECTOR);
714 return 1;
715 }
716 return 0;
656} 717}
718EXPORT_SYMBOL_GPL(kvm_get_dr);
657 719
658/* 720/*
659 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 721 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {
671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
672 HV_X64_MSR_APIC_ASSIST_PAGE, 734 HV_X64_MSR_APIC_ASSIST_PAGE,
673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
674 MSR_K6_STAR, 736 MSR_STAR,
675#ifdef CONFIG_X86_64 737#ifdef CONFIG_X86_64
676 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
677#endif 739#endif
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
682 744
683static u32 emulated_msrs[] = { 745static u32 emulated_msrs[] = {
684 MSR_IA32_MISC_ENABLE, 746 MSR_IA32_MISC_ENABLE,
747 MSR_IA32_MCG_STATUS,
748 MSR_IA32_MCG_CTL,
685}; 749};
686 750
687static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 751static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
688{ 752{
753 u64 old_efer = vcpu->arch.efer;
754
689 if (efer & efer_reserved_bits) 755 if (efer & efer_reserved_bits)
690 return 1; 756 return 1;
691 757
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
714 780
715 kvm_x86_ops->set_efer(vcpu, efer); 781 kvm_x86_ops->set_efer(vcpu, efer);
716 782
717 vcpu->arch.efer = efer;
718
719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
720 kvm_mmu_reset_context(vcpu); 784 kvm_mmu_reset_context(vcpu);
721 785
786 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX)
788 kvm_mmu_reset_context(vcpu);
789
722 return 0; 790 return 0;
723} 791}
724 792
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
882 950
883 if (!vcpu->time_page) 951 if (!vcpu->time_page)
884 return 0; 952 return 0;
885 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
886 return 1; 954 return 1;
887} 955}
888 956
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1524{ 1592{
1525 int i, idx; 1593 int i, idx;
1526 1594
1527 vcpu_load(vcpu);
1528
1529 idx = srcu_read_lock(&vcpu->kvm->srcu); 1595 idx = srcu_read_lock(&vcpu->kvm->srcu);
1530 for (i = 0; i < msrs->nmsrs; ++i) 1596 for (i = 0; i < msrs->nmsrs; ++i)
1531 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1597 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1532 break; 1598 break;
1533 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1599 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1534 1600
1535 vcpu_put(vcpu);
1536
1537 return i; 1601 return i;
1538} 1602}
1539 1603
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1618 case KVM_CAP_PCI_SEGMENT: 1682 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS: 1683 case KVM_CAP_DEBUGREGS:
1620 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1684 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE:
1621 r = 1; 1686 r = 1;
1622 break; 1687 break;
1623 case KVM_CAP_COALESCED_MMIO: 1688 case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1641 case KVM_CAP_MCE: 1706 case KVM_CAP_MCE:
1642 r = KVM_MAX_MCE_BANKS; 1707 r = KVM_MAX_MCE_BANKS;
1643 break; 1708 break;
1709 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave;
1711 break;
1644 default: 1712 default:
1645 r = 0; 1713 r = 0;
1646 break; 1714 break;
@@ -1717,8 +1785,28 @@ out:
1717 return r; 1785 return r;
1718} 1786}
1719 1787
1788static void wbinvd_ipi(void *garbage)
1789{
1790 wbinvd();
1791}
1792
1793static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
1794{
1795 return vcpu->kvm->arch.iommu_domain &&
1796 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
1797}
1798
1720void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1799void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1721{ 1800{
1801 /* Address WBINVD may be executed by guest */
1802 if (need_emulate_wbinvd(vcpu)) {
1803 if (kvm_x86_ops->has_wbinvd_exit())
1804 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
1805 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
1806 smp_call_function_single(vcpu->cpu,
1807 wbinvd_ipi, NULL, 1);
1808 }
1809
1722 kvm_x86_ops->vcpu_load(vcpu, cpu); 1810 kvm_x86_ops->vcpu_load(vcpu, cpu);
1723 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1724 unsigned long khz = cpufreq_quick_get(cpu); 1812 unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1731 1819
1732void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1733{ 1821{
1734 kvm_put_guest_fpu(vcpu);
1735 kvm_x86_ops->vcpu_put(vcpu); 1822 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu);
1736} 1824}
1737 1825
1738static int is_efer_nx(void) 1826static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1781 if (copy_from_user(cpuid_entries, entries, 1869 if (copy_from_user(cpuid_entries, entries,
1782 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1870 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1783 goto out_free; 1871 goto out_free;
1784 vcpu_load(vcpu);
1785 for (i = 0; i < cpuid->nent; i++) { 1872 for (i = 0; i < cpuid->nent; i++) {
1786 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1873 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1787 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1874 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1799 r = 0; 1886 r = 0;
1800 kvm_apic_set_version(vcpu); 1887 kvm_apic_set_version(vcpu);
1801 kvm_x86_ops->cpuid_update(vcpu); 1888 kvm_x86_ops->cpuid_update(vcpu);
1802 vcpu_put(vcpu); 1889 update_cpuid(vcpu);
1803 1890
1804out_free: 1891out_free:
1805 vfree(cpuid_entries); 1892 vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1907 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1908 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1822 goto out; 1909 goto out;
1823 vcpu_load(vcpu);
1824 vcpu->arch.cpuid_nent = cpuid->nent; 1910 vcpu->arch.cpuid_nent = cpuid->nent;
1825 kvm_apic_set_version(vcpu); 1911 kvm_apic_set_version(vcpu);
1826 kvm_x86_ops->cpuid_update(vcpu); 1912 kvm_x86_ops->cpuid_update(vcpu);
1827 vcpu_put(vcpu); 1913 update_cpuid(vcpu);
1828 return 0; 1914 return 0;
1829 1915
1830out: 1916out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1837{ 1923{
1838 int r; 1924 int r;
1839 1925
1840 vcpu_load(vcpu);
1841 r = -E2BIG; 1926 r = -E2BIG;
1842 if (cpuid->nent < vcpu->arch.cpuid_nent) 1927 if (cpuid->nent < vcpu->arch.cpuid_nent)
1843 goto out; 1928 goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1849 1934
1850out: 1935out:
1851 cpuid->nent = vcpu->arch.cpuid_nent; 1936 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1853 return r; 1937 return r;
1854} 1938}
1855 1939
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1901 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1985 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1902 /* cpuid 1.ecx */ 1986 /* cpuid 1.ecx */
1903 const u32 kvm_supported_word4_x86_features = 1987 const u32 kvm_supported_word4_x86_features =
1904 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1988 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
1905 0 /* DS-CPL, VMX, SMX, EST */ | 1989 0 /* DS-CPL, VMX, SMX, EST */ |
1906 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1990 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1907 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1908 0 /* Reserved, DCA */ | F(XMM4_1) | 1992 0 /* Reserved, DCA */ | F(XMM4_1) |
1909 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1910 0 /* Reserved, XSAVE, OSXSAVE */; 1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
1911 /* cpuid 0x80000001.ecx */ 1995 /* cpuid 0x80000001.ecx */
1912 const u32 kvm_supported_word6_x86_features = 1996 const u32 kvm_supported_word6_x86_features =
1913 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1922 2006
1923 switch (function) { 2007 switch (function) {
1924 case 0: 2008 case 0:
1925 entry->eax = min(entry->eax, (u32)0xb); 2009 entry->eax = min(entry->eax, (u32)0xd);
1926 break; 2010 break;
1927 case 1: 2011 case 1:
1928 entry->edx &= kvm_supported_word0_x86_features; 2012 entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1980 } 2064 }
1981 break; 2065 break;
1982 } 2066 }
2067 case 0xd: {
2068 int i;
2069
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2)
2073 break;
2074 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2077 ++*nent;
2078 }
2079 break;
2080 }
1983 case KVM_CPUID_SIGNATURE: { 2081 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0"; 2082 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature; 2083 u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
2081static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2179static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2082 struct kvm_lapic_state *s) 2180 struct kvm_lapic_state *s)
2083{ 2181{
2084 vcpu_load(vcpu);
2085 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2182 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2086 vcpu_put(vcpu);
2087 2183
2088 return 0; 2184 return 0;
2089} 2185}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2091static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2187static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2092 struct kvm_lapic_state *s) 2188 struct kvm_lapic_state *s)
2093{ 2189{
2094 vcpu_load(vcpu);
2095 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2190 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2096 kvm_apic_post_state_restore(vcpu); 2191 kvm_apic_post_state_restore(vcpu);
2097 update_cr8_intercept(vcpu); 2192 update_cr8_intercept(vcpu);
2098 vcpu_put(vcpu);
2099 2193
2100 return 0; 2194 return 0;
2101} 2195}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2107 return -EINVAL; 2201 return -EINVAL;
2108 if (irqchip_in_kernel(vcpu->kvm)) 2202 if (irqchip_in_kernel(vcpu->kvm))
2109 return -ENXIO; 2203 return -ENXIO;
2110 vcpu_load(vcpu);
2111 2204
2112 kvm_queue_interrupt(vcpu, irq->irq, false); 2205 kvm_queue_interrupt(vcpu, irq->irq, false);
2113 2206
2114 vcpu_put(vcpu);
2115
2116 return 0; 2207 return 0;
2117} 2208}
2118 2209
2119static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2210static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2120{ 2211{
2121 vcpu_load(vcpu);
2122 kvm_inject_nmi(vcpu); 2212 kvm_inject_nmi(vcpu);
2123 vcpu_put(vcpu);
2124 2213
2125 return 0; 2214 return 0;
2126} 2215}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2140 int r; 2229 int r;
2141 unsigned bank_num = mcg_cap & 0xff, bank; 2230 unsigned bank_num = mcg_cap & 0xff, bank;
2142 2231
2143 vcpu_load(vcpu);
2144 r = -EINVAL; 2232 r = -EINVAL;
2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2233 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2146 goto out; 2234 goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2155 for (bank = 0; bank < bank_num; bank++) 2243 for (bank = 0; bank < bank_num; bank++)
2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2244 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2157out: 2245out:
2158 vcpu_put(vcpu);
2159 return r; 2246 return r;
2160} 2247}
2161 2248
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2188 printk(KERN_DEBUG "kvm: set_mce: " 2275 printk(KERN_DEBUG "kvm: set_mce: "
2189 "injects mce exception while " 2276 "injects mce exception while "
2190 "previous one is in progress!\n"); 2277 "previous one is in progress!\n");
2191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2192 return 0; 2279 return 0;
2193 } 2280 }
2194 if (banks[1] & MCI_STATUS_VAL) 2281 if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2213static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2300static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2214 struct kvm_vcpu_events *events) 2301 struct kvm_vcpu_events *events)
2215{ 2302{
2216 vcpu_load(vcpu);
2217
2218 events->exception.injected = 2303 events->exception.injected =
2219 vcpu->arch.exception.pending && 2304 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2305 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW); 2326 | KVM_VCPUEVENT_VALID_SHADOW);
2242
2243 vcpu_put(vcpu);
2244} 2327}
2245 2328
2246static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2251 | KVM_VCPUEVENT_VALID_SHADOW)) 2334 | KVM_VCPUEVENT_VALID_SHADOW))
2252 return -EINVAL; 2335 return -EINVAL;
2253 2336
2254 vcpu_load(vcpu);
2255
2256 vcpu->arch.exception.pending = events->exception.injected; 2337 vcpu->arch.exception.pending = events->exception.injected;
2257 vcpu->arch.exception.nr = events->exception.nr; 2338 vcpu->arch.exception.nr = events->exception.nr;
2258 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2339 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2275 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2276 vcpu->arch.sipi_vector = events->sipi_vector; 2357 vcpu->arch.sipi_vector = events->sipi_vector;
2277 2358
2278 vcpu_put(vcpu);
2279
2280 return 0; 2359 return 0;
2281} 2360}
2282 2361
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2362static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs) 2363 struct kvm_debugregs *dbgregs)
2285{ 2364{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2365 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6; 2366 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7; 2367 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0; 2368 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294} 2369}
2295 2370
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2299 if (dbgregs->flags) 2374 if (dbgregs->flags)
2300 return -EINVAL; 2375 return -EINVAL;
2301 2376
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2377 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6; 2378 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7; 2379 vcpu->arch.dr7 = dbgregs->dr7;
2307 2380
2308 vcpu_put(vcpu); 2381 return 0;
2382}
2383
2384static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2385 struct kvm_xsave *guest_xsave)
2386{
2387 if (cpu_has_xsave)
2388 memcpy(guest_xsave->region,
2389 &vcpu->arch.guest_fpu.state->xsave,
2390 sizeof(struct xsave_struct));
2391 else {
2392 memcpy(guest_xsave->region,
2393 &vcpu->arch.guest_fpu.state->fxsave,
2394 sizeof(struct i387_fxsave_struct));
2395 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2396 XSTATE_FPSSE;
2397 }
2398}
2399
2400static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2401 struct kvm_xsave *guest_xsave)
2402{
2403 u64 xstate_bv =
2404 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2309 2405
2406 if (cpu_has_xsave)
2407 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2408 guest_xsave->region, sizeof(struct xsave_struct));
2409 else {
2410 if (xstate_bv & ~XSTATE_FPSSE)
2411 return -EINVAL;
2412 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2413 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2414 }
2310 return 0; 2415 return 0;
2311} 2416}
2312 2417
2418static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2419 struct kvm_xcrs *guest_xcrs)
2420{
2421 if (!cpu_has_xsave) {
2422 guest_xcrs->nr_xcrs = 0;
2423 return;
2424 }
2425
2426 guest_xcrs->nr_xcrs = 1;
2427 guest_xcrs->flags = 0;
2428 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2429 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2430}
2431
2432static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2433 struct kvm_xcrs *guest_xcrs)
2434{
2435 int i, r = 0;
2436
2437 if (!cpu_has_xsave)
2438 return -EINVAL;
2439
2440 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2441 return -EINVAL;
2442
2443 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2444 /* Only support XCR0 currently */
2445 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2446 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2447 guest_xcrs->xcrs[0].value);
2448 break;
2449 }
2450 if (r)
2451 r = -EINVAL;
2452 return r;
2453}
2454
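These four handlers back the new KVM_{GET,SET}_XSAVE and KVM_{GET,SET}_XCRS vCPU ioctls. A hedged userspace sketch of reading that state back out, assuming headers new enough to define the ioctls and a vcpu_fd obtained elsewhere via KVM_CREATE_VCPU:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd: a vCPU file descriptor from KVM_CREATE_VCPU (assumed given) */
static int dump_xstate(int vcpu_fd)
{
	struct kvm_xsave xsave;
	struct kvm_xcrs xcrs;
	unsigned long long xstate_bv;

	if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave) < 0 ||
	    ioctl(vcpu_fd, KVM_GET_XCRS, &xcrs) < 0)
		return -1;

	/* the XSTATE_BV bitmap sits at byte offset 512 of the XSAVE image */
	memcpy(&xstate_bv, &xsave.region[512 / sizeof(xsave.region[0])],
	       sizeof(xstate_bv));
	printf("guest XSTATE_BV %#llx\n", xstate_bv);

	if (xcrs.nr_xcrs > 0)
		printf("guest XCR%u = %#llx\n", xcrs.xcrs[0].xcr,
		       (unsigned long long)xcrs.xcrs[0].value);
	return 0;
}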
2313long kvm_arch_vcpu_ioctl(struct file *filp, 2455long kvm_arch_vcpu_ioctl(struct file *filp,
2314 unsigned int ioctl, unsigned long arg) 2456 unsigned int ioctl, unsigned long arg)
2315{ 2457{
2316 struct kvm_vcpu *vcpu = filp->private_data; 2458 struct kvm_vcpu *vcpu = filp->private_data;
2317 void __user *argp = (void __user *)arg; 2459 void __user *argp = (void __user *)arg;
2318 int r; 2460 int r;
2319 struct kvm_lapic_state *lapic = NULL; 2461 union {
2462 struct kvm_lapic_state *lapic;
2463 struct kvm_xsave *xsave;
2464 struct kvm_xcrs *xcrs;
2465 void *buffer;
2466 } u;
2320 2467
2468 u.buffer = NULL;
2321 switch (ioctl) { 2469 switch (ioctl) {
2322 case KVM_GET_LAPIC: { 2470 case KVM_GET_LAPIC: {
2323 r = -EINVAL; 2471 r = -EINVAL;
2324 if (!vcpu->arch.apic) 2472 if (!vcpu->arch.apic)
2325 goto out; 2473 goto out;
2326 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2474 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2327 2475
2328 r = -ENOMEM; 2476 r = -ENOMEM;
2329 if (!lapic) 2477 if (!u.lapic)
2330 goto out; 2478 goto out;
2331 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2479 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
2332 if (r) 2480 if (r)
2333 goto out; 2481 goto out;
2334 r = -EFAULT; 2482 r = -EFAULT;
2335 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2483 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
2336 goto out; 2484 goto out;
2337 r = 0; 2485 r = 0;
2338 break; 2486 break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2341 r = -EINVAL; 2489 r = -EINVAL;
2342 if (!vcpu->arch.apic) 2490 if (!vcpu->arch.apic)
2343 goto out; 2491 goto out;
2344 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2492 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2345 r = -ENOMEM; 2493 r = -ENOMEM;
2346 if (!lapic) 2494 if (!u.lapic)
2347 goto out; 2495 goto out;
2348 r = -EFAULT; 2496 r = -EFAULT;
2349 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2497 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
2350 goto out; 2498 goto out;
2351 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2499 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2352 if (r) 2500 if (r)
2353 goto out; 2501 goto out;
2354 r = 0; 2502 r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2464 r = -EFAULT; 2612 r = -EFAULT;
2465 if (copy_from_user(&mce, argp, sizeof mce)) 2613 if (copy_from_user(&mce, argp, sizeof mce))
2466 goto out; 2614 goto out;
2467 vcpu_load(vcpu);
2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2615 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2470 break; 2616 break;
2471 } 2617 }
2472 case KVM_GET_VCPU_EVENTS: { 2618 case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2659 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break; 2660 break;
2515 } 2661 }
2662 case KVM_GET_XSAVE: {
2663 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2664 r = -ENOMEM;
2665 if (!u.xsave)
2666 break;
2667
2668 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2669
2670 r = -EFAULT;
2671 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2672 break;
2673 r = 0;
2674 break;
2675 }
2676 case KVM_SET_XSAVE: {
2677 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2678 r = -ENOMEM;
2679 if (!u.xsave)
2680 break;
2681
2682 r = -EFAULT;
2683 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
2684 break;
2685
2686 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2687 break;
2688 }
2689 case KVM_GET_XCRS: {
2690 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2691 r = -ENOMEM;
2692 if (!u.xcrs)
2693 break;
2694
2695 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2696
2697 r = -EFAULT;
2698 if (copy_to_user(argp, u.xcrs,
2699 sizeof(struct kvm_xcrs)))
2700 break;
2701 r = 0;
2702 break;
2703 }
2704 case KVM_SET_XCRS: {
2705 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2706 r = -ENOMEM;
2707 if (!u.xcrs)
2708 break;
2709
2710 r = -EFAULT;
2711 if (copy_from_user(u.xcrs, argp,
2712 sizeof(struct kvm_xcrs)))
2713 break;
2714
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break;
2717 }
2516 default: 2718 default:
2517 r = -EINVAL; 2719 r = -EINVAL;
2518 } 2720 }
2519out: 2721out:
2520 kfree(lapic); 2722 kfree(u.buffer);
2521 return r; 2723 return r;
2522} 2724}
2523 2725
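With the payloads folded into a union of pointers, the ioctl's single exit path can release whichever buffer was allocated with one kfree(u.buffer). The same pattern in a stripped-down standalone form; plain calloc/free stand in for the kernel allocators and the struct names are invented for illustration:

#include <stdlib.h>

struct lapic_state { char regs[1024]; };		/* invented for the sketch */
struct xsave_state { unsigned int region[1024]; };	/* invented for the sketch */

int handle(int cmd)
{
	int r = -1;
	union {
		struct lapic_state *lapic;
		struct xsave_state *xsave;
		void *buffer;	/* aliases whichever pointer was set */
	} u;

	u.buffer = NULL;
	switch (cmd) {
	case 1:
		u.lapic = calloc(1, sizeof(*u.lapic));
		if (!u.lapic)
			goto out;
		/* ... fill u.lapic and copy it out ... */
		r = 0;
		break;
	case 2:
		u.xsave = calloc(1, sizeof(*u.xsave));
		if (!u.xsave)
			goto out;
		/* ... fill u.xsave and copy it out ... */
		r = 0;
		break;
	}
out:
	free(u.buffer);		/* free(NULL) is a no-op, like kfree(NULL) */
	return r;
}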
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2560 return kvm->arch.n_alloc_mmu_pages; 2762 return kvm->arch.n_alloc_mmu_pages;
2561} 2763}
2562 2764
2563gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2564{
2565 int i;
2566 struct kvm_mem_alias *alias;
2567 struct kvm_mem_aliases *aliases;
2568
2569 aliases = kvm_aliases(kvm);
2570
2571 for (i = 0; i < aliases->naliases; ++i) {
2572 alias = &aliases->aliases[i];
2573 if (alias->flags & KVM_ALIAS_INVALID)
2574 continue;
2575 if (gfn >= alias->base_gfn
2576 && gfn < alias->base_gfn + alias->npages)
2577 return alias->target_gfn + gfn - alias->base_gfn;
2578 }
2579 return gfn;
2580}
2581
2582gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2583{
2584 int i;
2585 struct kvm_mem_alias *alias;
2586 struct kvm_mem_aliases *aliases;
2587
2588 aliases = kvm_aliases(kvm);
2589
2590 for (i = 0; i < aliases->naliases; ++i) {
2591 alias = &aliases->aliases[i];
2592 if (gfn >= alias->base_gfn
2593 && gfn < alias->base_gfn + alias->npages)
2594 return alias->target_gfn + gfn - alias->base_gfn;
2595 }
2596 return gfn;
2597}
2598
2599/*
2600 * Set a new alias region. Aliases map a portion of physical memory into
2601 * another portion. This is useful for memory windows, for example the PC
2602 * VGA region.
2603 */
2604static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2605 struct kvm_memory_alias *alias)
2606{
2607 int r, n;
2608 struct kvm_mem_alias *p;
2609 struct kvm_mem_aliases *aliases, *old_aliases;
2610
2611 r = -EINVAL;
2612 /* General sanity checks */
2613 if (alias->memory_size & (PAGE_SIZE - 1))
2614 goto out;
2615 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2616 goto out;
2617 if (alias->slot >= KVM_ALIAS_SLOTS)
2618 goto out;
2619 if (alias->guest_phys_addr + alias->memory_size
2620 < alias->guest_phys_addr)
2621 goto out;
2622 if (alias->target_phys_addr + alias->memory_size
2623 < alias->target_phys_addr)
2624 goto out;
2625
2626 r = -ENOMEM;
2627 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2628 if (!aliases)
2629 goto out;
2630
2631 mutex_lock(&kvm->slots_lock);
2632
2633 /* invalidate any gfn reference in case of deletion/shrinking */
2634 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2635 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2636 old_aliases = kvm->arch.aliases;
2637 rcu_assign_pointer(kvm->arch.aliases, aliases);
2638 synchronize_srcu_expedited(&kvm->srcu);
2639 kvm_mmu_zap_all(kvm);
2640 kfree(old_aliases);
2641
2642 r = -ENOMEM;
2643 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2644 if (!aliases)
2645 goto out_unlock;
2646
2647 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2648
2649 p = &aliases->aliases[alias->slot];
2650 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2651 p->npages = alias->memory_size >> PAGE_SHIFT;
2652 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2653 p->flags &= ~(KVM_ALIAS_INVALID);
2654
2655 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2656 if (aliases->aliases[n - 1].npages)
2657 break;
2658 aliases->naliases = n;
2659
2660 old_aliases = kvm->arch.aliases;
2661 rcu_assign_pointer(kvm->arch.aliases, aliases);
2662 synchronize_srcu_expedited(&kvm->srcu);
2663 kfree(old_aliases);
2664 r = 0;
2665
2666out_unlock:
2667 mutex_unlock(&kvm->slots_lock);
2668out:
2669 return r;
2670}
2671
2672static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2673{ 2766{
2674 int r; 2767 int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2797 struct kvm_memory_slot *memslot; 2890 struct kvm_memory_slot *memslot;
2798 unsigned long n; 2891 unsigned long n;
2799 unsigned long is_dirty = 0; 2892 unsigned long is_dirty = 0;
2800 unsigned long *dirty_bitmap = NULL;
2801 2893
2802 mutex_lock(&kvm->slots_lock); 2894 mutex_lock(&kvm->slots_lock);
2803 2895
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2812 2904
2813 n = kvm_dirty_bitmap_bytes(memslot); 2905 n = kvm_dirty_bitmap_bytes(memslot);
2814 2906
2815 r = -ENOMEM;
2816 dirty_bitmap = vmalloc(n);
2817 if (!dirty_bitmap)
2818 goto out;
2819 memset(dirty_bitmap, 0, n);
2820
2821 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2907 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2822 is_dirty = memslot->dirty_bitmap[i]; 2908 is_dirty = memslot->dirty_bitmap[i];
2823 2909
2824 /* If nothing is dirty, don't bother messing with page tables. */ 2910 /* If nothing is dirty, don't bother messing with page tables. */
2825 if (is_dirty) { 2911 if (is_dirty) {
2826 struct kvm_memslots *slots, *old_slots; 2912 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap;
2827 2914
2828 spin_lock(&kvm->mmu_lock); 2915 spin_lock(&kvm->mmu_lock);
2829 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2916 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2830 spin_unlock(&kvm->mmu_lock); 2917 spin_unlock(&kvm->mmu_lock);
2831 2918
2832 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2919 r = -ENOMEM;
2833 if (!slots) 2920 dirty_bitmap = vmalloc(n);
2834 goto out_free; 2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n);
2835 2924
2925 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) {
2928 vfree(dirty_bitmap);
2929 goto out;
2930 }
2836 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2837 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2838 2933
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2841 synchronize_srcu_expedited(&kvm->srcu); 2936 synchronize_srcu_expedited(&kvm->srcu);
2842 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2843 kfree(old_slots); 2938 kfree(old_slots);
2939
2940 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
2942 vfree(dirty_bitmap);
2943 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else {
2947 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n))
2949 goto out;
2844 } 2950 }
2845 2951
2846 r = 0; 2952 r = 0;
2847 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2848 r = -EFAULT;
2849out_free:
2850 vfree(dirty_bitmap);
2851out: 2953out:
2852 mutex_unlock(&kvm->slots_lock); 2954 mutex_unlock(&kvm->slots_lock);
2853 return r; 2955 return r;
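The userspace contract is unchanged: the caller hands in a bitmap sized for the slot and KVM fills it, either from the freshly swapped-out bitmap or via clear_user() when nothing was dirty. A minimal sketch of the calling side, assuming vm_fd and the slot's page count are known to the caller:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd: VM file descriptor; npages: pages in the memory slot (assumed given) */
static unsigned long *get_dirty_bitmap(int vm_fd, int slot, size_t npages)
{
	struct kvm_dirty_log log;
	/* one bit per page, rounded up to whole 64-bit words */
	size_t bytes = ((npages + 63) / 64) * 8;
	unsigned long *bitmap = calloc(1, bytes);

	if (!bitmap)
		return NULL;

	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return NULL;
	}
	return bitmap;	/* bit N set => page N written since the last call */
}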
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2867 union { 2969 union {
2868 struct kvm_pit_state ps; 2970 struct kvm_pit_state ps;
2869 struct kvm_pit_state2 ps2; 2971 struct kvm_pit_state2 ps2;
2870 struct kvm_memory_alias alias;
2871 struct kvm_pit_config pit_config; 2972 struct kvm_pit_config pit_config;
2872 } u; 2973 } u;
2873 2974
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2888 goto out; 2989 goto out;
2889 break; 2990 break;
2890 } 2991 }
2891 case KVM_SET_MEMORY_REGION: {
2892 struct kvm_memory_region kvm_mem;
2893 struct kvm_userspace_memory_region kvm_userspace_mem;
2894
2895 r = -EFAULT;
2896 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2897 goto out;
2898 kvm_userspace_mem.slot = kvm_mem.slot;
2899 kvm_userspace_mem.flags = kvm_mem.flags;
2900 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2901 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2902 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2903 if (r)
2904 goto out;
2905 break;
2906 }
2907 case KVM_SET_NR_MMU_PAGES: 2992 case KVM_SET_NR_MMU_PAGES:
2908 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2993 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2909 if (r) 2994 if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2912 case KVM_GET_NR_MMU_PAGES: 2997 case KVM_GET_NR_MMU_PAGES:
2913 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2998 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2914 break; 2999 break;
2915 case KVM_SET_MEMORY_ALIAS:
2916 r = -EFAULT;
2917 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2918 goto out;
2919 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2920 if (r)
2921 goto out;
2922 break;
2923 case KVM_CREATE_IRQCHIP: { 3000 case KVM_CREATE_IRQCHIP: {
2924 struct kvm_pic *vpic; 3001 struct kvm_pic *vpic;
2925 3002
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3259 } 3336 }
3260 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3261 if (ret < 0) { 3338 if (ret < 0) {
3262 r = X86EMUL_UNHANDLEABLE; 3339 r = X86EMUL_IO_NEEDED;
3263 goto out; 3340 goto out;
3264 } 3341 }
3265 3342
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3315 } 3392 }
3316 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3317 if (ret < 0) { 3394 if (ret < 0) {
3318 r = X86EMUL_UNHANDLEABLE; 3395 r = X86EMUL_IO_NEEDED;
3319 goto out; 3396 goto out;
3320 } 3397 }
3321 3398
@@ -3330,10 +3407,10 @@ out:
3330static int emulator_read_emulated(unsigned long addr, 3407static int emulator_read_emulated(unsigned long addr,
3331 void *val, 3408 void *val,
3332 unsigned int bytes, 3409 unsigned int bytes,
3410 unsigned int *error_code,
3333 struct kvm_vcpu *vcpu) 3411 struct kvm_vcpu *vcpu)
3334{ 3412{
3335 gpa_t gpa; 3413 gpa_t gpa;
3336 u32 error_code;
3337 3414
3338 if (vcpu->mmio_read_completed) { 3415 if (vcpu->mmio_read_completed) {
3339 memcpy(val, vcpu->mmio_data, bytes); 3416 memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
3343 return X86EMUL_CONTINUE; 3420 return X86EMUL_CONTINUE;
3344 } 3421 }
3345 3422
3346 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); 3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
3347 3424
3348 if (gpa == UNMAPPED_GVA) { 3425 if (gpa == UNMAPPED_GVA)
3349 kvm_inject_page_fault(vcpu, addr, error_code);
3350 return X86EMUL_PROPAGATE_FAULT; 3426 return X86EMUL_PROPAGATE_FAULT;
3351 }
3352 3427
3353 /* For APIC access vmexit */ 3428 /* For APIC access vmexit */
3354 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
3370 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3371 3446
3372 vcpu->mmio_needed = 1; 3447 vcpu->mmio_needed = 1;
3373 vcpu->mmio_phys_addr = gpa; 3448 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3374 vcpu->mmio_size = bytes; 3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3375 vcpu->mmio_is_write = 0; 3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3376 3452
3377 return X86EMUL_UNHANDLEABLE; 3453 return X86EMUL_IO_NEEDED;
3378} 3454}
3379 3455
3380int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3392static int emulator_write_emulated_onepage(unsigned long addr, 3468static int emulator_write_emulated_onepage(unsigned long addr,
3393 const void *val, 3469 const void *val,
3394 unsigned int bytes, 3470 unsigned int bytes,
3471 unsigned int *error_code,
3395 struct kvm_vcpu *vcpu) 3472 struct kvm_vcpu *vcpu)
3396{ 3473{
3397 gpa_t gpa; 3474 gpa_t gpa;
3398 u32 error_code;
3399 3475
3400 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); 3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
3401 3477
3402 if (gpa == UNMAPPED_GVA) { 3478 if (gpa == UNMAPPED_GVA)
3403 kvm_inject_page_fault(vcpu, addr, error_code);
3404 return X86EMUL_PROPAGATE_FAULT; 3479 return X86EMUL_PROPAGATE_FAULT;
3405 }
3406 3480
3407 /* For APIC access vmexit */ 3481 /* For APIC access vmexit */
3408 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3482 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
3420 return X86EMUL_CONTINUE; 3494 return X86EMUL_CONTINUE;
3421 3495
3422 vcpu->mmio_needed = 1; 3496 vcpu->mmio_needed = 1;
3423 vcpu->mmio_phys_addr = gpa; 3497 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3424 vcpu->mmio_size = bytes; 3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3425 vcpu->mmio_is_write = 1; 3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3426 memcpy(vcpu->mmio_data, val, bytes); 3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes);
3427 3502
3428 return X86EMUL_CONTINUE; 3503 return X86EMUL_CONTINUE;
3429} 3504}
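Because the emulator now fills vcpu->run->mmio directly (including copying the write payload into run->mmio.data), the exit is fully described by the time KVM_RUN returns. A sketch of the matching run-loop branch in userspace; device_read() and device_write() are hypothetical callbacks standing in for whatever models the MMIO region:

#include <linux/kvm.h>

/* hypothetical device callbacks supplied by the VMM */
void device_read(unsigned long long addr, void *data, unsigned int len);
void device_write(unsigned long long addr, const void *data, unsigned int len);

static void handle_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_MMIO:
		if (run->mmio.is_write)
			device_write(run->mmio.phys_addr, run->mmio.data,
				     run->mmio.len);
		else
			/* the read result is picked up on the next KVM_RUN */
			device_read(run->mmio.phys_addr, run->mmio.data,
				    run->mmio.len);
		break;
	default:
		break;
	}
}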
@@ -3431,6 +3506,7 @@ mmio:
3431int emulator_write_emulated(unsigned long addr, 3506int emulator_write_emulated(unsigned long addr,
3432 const void *val, 3507 const void *val,
3433 unsigned int bytes, 3508 unsigned int bytes,
3509 unsigned int *error_code,
3434 struct kvm_vcpu *vcpu) 3510 struct kvm_vcpu *vcpu)
3435{ 3511{
3436 /* Crossing a page boundary? */ 3512 /* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
3438 int rc, now; 3514 int rc, now;
3439 3515
3440 now = -addr & ~PAGE_MASK; 3516 now = -addr & ~PAGE_MASK;
3441 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code,
3518 vcpu);
3442 if (rc != X86EMUL_CONTINUE) 3519 if (rc != X86EMUL_CONTINUE)
3443 return rc; 3520 return rc;
3444 addr += now; 3521 addr += now;
3445 val += now; 3522 val += now;
3446 bytes -= now; 3523 bytes -= now;
3447 } 3524 }
3448 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code,
3526 vcpu);
3449} 3527}
3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3451 3528
3452#define CMPXCHG_TYPE(t, ptr, old, new) \ 3529#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3530 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3463 const void *old, 3540 const void *old,
3464 const void *new, 3541 const void *new,
3465 unsigned int bytes, 3542 unsigned int bytes,
3543 unsigned int *error_code,
3466 struct kvm_vcpu *vcpu) 3544 struct kvm_vcpu *vcpu)
3467{ 3545{
3468 gpa_t gpa; 3546 gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3484 goto emul_write; 3562 goto emul_write;
3485 3563
3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3564 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3565 if (is_error_page(page)) {
3566 kvm_release_page_clean(page);
3567 goto emul_write;
3568 }
3487 3569
3488 kaddr = kmap_atomic(page, KM_USER0); 3570 kaddr = kmap_atomic(page, KM_USER0);
3489 kaddr += offset_in_page(gpa); 3571 kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3516emul_write: 3598emul_write:
3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3518 3600
3519 return emulator_write_emulated(addr, new, bytes, vcpu); 3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
3520} 3602}
3521 3603
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
3604 return X86EMUL_CONTINUE; 3686 return X86EMUL_CONTINUE;
3605} 3687}
3606 3688
3607int emulate_clts(struct kvm_vcpu *vcpu) 3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3608{ 3690{
3609 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 3691 if (!need_emulate_wbinvd(vcpu))
3610 kvm_x86_ops->fpu_activate(vcpu); 3692 return X86EMUL_CONTINUE;
3693
3694 if (kvm_x86_ops->has_wbinvd_exit()) {
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1);
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 }
3699 wbinvd();
3611 return X86EMUL_CONTINUE; 3700 return X86EMUL_CONTINUE;
3612} 3701}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3613 3703
3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3704int emulate_clts(struct kvm_vcpu *vcpu)
3615{ 3705{
3616 return kvm_get_dr(ctxt->vcpu, dr, dest); 3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3617} 3709}
3618 3710
3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
3620{ 3712{
3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3713 return _kvm_get_dr(vcpu, dr, dest);
3622
3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3624} 3714}
3625 3715
3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
3627{ 3717{
3628 u8 opcodes[4];
3629 unsigned long rip = kvm_rip_read(vcpu);
3630 unsigned long rip_linear;
3631
3632 if (!printk_ratelimit())
3633 return;
3634 3718
3635 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3719 return __kvm_set_dr(vcpu, dr, value);
3636
3637 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3638
3639 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3640 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3641} 3720}
3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3643 3721
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3722static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{ 3723{
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3674 return value; 3752 return value;
3675} 3753}
3676 3754
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{ 3756{
3757 int res = 0;
3758
3679 switch (cr) { 3759 switch (cr) {
3680 case 0: 3760 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3761 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break; 3762 break;
3683 case 2: 3763 case 2:
3684 vcpu->arch.cr2 = val; 3764 vcpu->arch.cr2 = val;
3685 break; 3765 break;
3686 case 3: 3766 case 3:
3687 kvm_set_cr3(vcpu, val); 3767 res = kvm_set_cr3(vcpu, val);
3688 break; 3768 break;
3689 case 4: 3769 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break; 3771 break;
3692 case 8: 3772 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL); 3773 res = __kvm_set_cr8(vcpu, val & 0xfUL);
3694 break; 3774 break;
3695 default: 3775 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3777 res = -1;
3697 } 3778 }
3779
3780 return res;
3698} 3781}
3699 3782
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu) 3783static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3707 kvm_x86_ops->get_gdt(vcpu, dt); 3790 kvm_x86_ops->get_gdt(vcpu, dt);
3708} 3791}
3709 3792
3793static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu)
3795{
3796 return get_segment_base(vcpu, seg);
3797}
3798
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu) 3800 struct kvm_vcpu *vcpu)
3712{ 3801{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3868 kvm_set_segment(vcpu, &kvm_seg, seg);
3780} 3869}
3781 3870
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = { 3871static struct x86_emulate_ops emulate_ops = {
3788 .read_std = kvm_read_guest_virt_system, 3872 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector, 3882 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base,
3800 .get_gdt = emulator_get_gdt, 3885 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr, 3886 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr, 3887 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl, 3888 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags, 3889 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr,
3892 .get_msr = kvm_get_msr,
3805}; 3893};
3806 3894
3807static void cache_all_regs(struct kvm_vcpu *vcpu) 3895static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
3812 vcpu->arch.regs_dirty = ~0; 3900 vcpu->arch.regs_dirty = ~0;
3813} 3901}
3814 3902
3903static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3904{
3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
3906 /*
3907 * an sti; sti; sequence only disable interrupts for the first
3908 * instruction. So, if the last instruction, be it emulated or
3909 * not, left the system with the INT_STI flag enabled, it
3910 * means that the last instruction is an sti. We should not
3911 * leave the flag on in this case. The same goes for mov ss
3912 */
3913 if (!(int_shadow & mask))
3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
3915}
3916
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
3922 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else
3925 kvm_queue_exception(vcpu, ctxt->exception);
3926}
3927
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{
3930 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3934 vcpu->run->internal.ndata = 0;
3935 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL;
3937}
3938
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3940{
3941 gpa_t gpa;
3942
3943 if (tdp_enabled)
3944 return false;
3945
3946 /*
3947	 * if emulation was due to an access to a shadowed page table
3948	 * and it failed, try to unshadow the page and re-enter the
3949	 * guest to let the CPU execute the instruction.
3950 */
3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
3952 return true;
3953
3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
3955
3956 if (gpa == UNMAPPED_GVA)
3957 return true; /* let cpu generate fault */
3958
3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
3960 return true;
3961
3962 return false;
3963}
3964
3815int emulate_instruction(struct kvm_vcpu *vcpu, 3965int emulate_instruction(struct kvm_vcpu *vcpu,
3816 unsigned long cr2, 3966 unsigned long cr2,
3817 u16 error_code, 3967 u16 error_code,
3818 int emulation_type) 3968 int emulation_type)
3819{ 3969{
3820 int r, shadow_mask; 3970 int r;
3821 struct decode_cache *c; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
3822 struct kvm_run *run = vcpu->run;
3823 3972
3824 kvm_clear_exception_queue(vcpu); 3973 kvm_clear_exception_queue(vcpu);
3825 vcpu->arch.mmio_fault_cr2 = cr2; 3974 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3831 */ 3980 */
3832 cache_all_regs(vcpu); 3981 cache_all_regs(vcpu);
3833 3982
3834 vcpu->mmio_is_write = 0;
3835
3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837 int cs_db, cs_l; 3984 int cs_db, cs_l;
3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3846 ? X86EMUL_MODE_VM86 : cs_l 3993 ? X86EMUL_MODE_VM86 : cs_l
3847 ? X86EMUL_MODE_PROT64 : cs_db 3994 ? X86EMUL_MODE_PROT64 : cs_db
3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1;
3849 4000
3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu); 4002 trace_kvm_emulate_insn_start(vcpu);
3852 4003
3853 /* Only allow emulation of specific instructions on #UD 4004 /* Only allow emulation of specific instructions on #UD
3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/
3855 c = &vcpu->arch.emulate_ctxt.decode;
3856 if (emulation_type & EMULTYPE_TRAP_UD) { 4006 if (emulation_type & EMULTYPE_TRAP_UD) {
3857 if (!c->twobyte) 4007 if (!c->twobyte)
3858 return EMULATE_FAIL; 4008 return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3880 4030
3881 ++vcpu->stat.insn_emulation; 4031 ++vcpu->stat.insn_emulation;
3882 if (r) { 4032 if (r) {
3883 ++vcpu->stat.insn_emulation_fail; 4033 if (reexecute_instruction(vcpu, cr2))
3884 trace_kvm_emulate_insn_failed(vcpu);
3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886 return EMULATE_DONE; 4034 return EMULATE_DONE;
3887 return EMULATE_FAIL; 4035 if (emulation_type & EMULTYPE_SKIP)
4036 return EMULATE_FAIL;
4037 return handle_emulation_failure(vcpu);
3888 } 4038 }
3889 } 4039 }
3890 4040
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3893 return EMULATE_DONE; 4043 return EMULATE_DONE;
3894 } 4044 }
3895 4045
4046	/* this is needed for the vmware backdoor interface to work since it
4047	   changes register values during the I/O operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049
3896restart: 4050restart:
3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899 4052
3900 if (r == 0) 4053 if (r) { /* emulation failed */
3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 4054 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE;
3902 4056
3903 if (vcpu->arch.pio.count) { 4057 return handle_emulation_failure(vcpu);
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3906 return EMULATE_DO_MMIO;
3907 } 4058 }
3908 4059
3909 if (r || vcpu->mmio_is_write) { 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
3910 run->exit_reason = KVM_EXIT_MMIO; 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3911 run->mmio.phys_addr = vcpu->mmio_phys_addr; 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
3912 memcpy(run->mmio.data, vcpu->mmio_data, 8); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
3913 run->mmio.len = vcpu->mmio_size; 4064
3914 run->mmio.is_write = vcpu->mmio_is_write; 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE;
3915 } 4068 }
3916 4069
3917 if (r) { 4070 if (vcpu->arch.pio.count) {
3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4071 if (!vcpu->arch.pio.in)
3919 goto done; 4072 vcpu->arch.pio.count = 0;
3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3923 kvm_report_emulation_failure(vcpu, "mmio");
3924 return EMULATE_FAIL;
3925 }
3926 return EMULATE_DO_MMIO; 4073 return EMULATE_DO_MMIO;
3927 } 4074 }
3928 4075
3929 if (vcpu->mmio_is_write) { 4076 if (vcpu->mmio_needed) {
3930 vcpu->mmio_needed = 0; 4077 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0;
3931 return EMULATE_DO_MMIO; 4079 return EMULATE_DO_MMIO;
3932 } 4080 }
3933 4081
3934done:
3935 if (vcpu->arch.exception.pending)
3936 vcpu->arch.emulate_ctxt.restart = false;
3937
3938 if (vcpu->arch.emulate_ctxt.restart) 4082 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart; 4083 goto restart;
3940 4084
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
4108 4252
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110 4254
4255 if (cpu_has_xsave)
4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4257
4111 return 0; 4258 return 0;
4112 4259
4113out: 4260out:
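kvm_arch_init() now snapshots the host XCR0 with xgetbv() so it can be restored after a guest ran with a different mask. The same register can be read from userspace whenever CPUID reports OSXSAVE; a small sketch using the raw opcode so it builds without -mxsave:

#include <cpuid.h>
#include <stdio.h>

static unsigned long long xgetbv0(void)
{
	unsigned int lo, hi;

	/* xgetbv is .byte 0x0f, 0x01, 0xd0; ecx selects the XCR (0 here) */
	__asm__ volatile(".byte 0x0f, 0x01, 0xd0"
			 : "=a"(lo), "=d"(hi) : "c"(0));
	return ((unsigned long long)hi << 32) | lo;
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid(1, eax, ebx, ecx, edx);
	if (!(ecx & (1u << 27))) {	/* CPUID.1:ECX bit 27 is OSXSAVE */
		puts("OSXSAVE not enabled; XCR0 is not readable");
		return 1;
	}
	printf("XCR0 = %#llx\n", xgetbv0());
	return 0;
}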
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4270 4417
4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272 4419
4273 return emulator_write_emulated(rip, instruction, 3, vcpu); 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
4274} 4421}
4275 4422
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4506 } 4653 }
4507} 4654}
4508 4655
4656static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
4657{
4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
4659 !vcpu->guest_xcr0_loaded) {
4660 /* kvm_set_xcr() also depends on this */
4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
4662 vcpu->guest_xcr0_loaded = 1;
4663 }
4664}
4665
4666static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4667{
4668 if (vcpu->guest_xcr0_loaded) {
4669 if (vcpu->arch.xcr0 != host_xcr0)
4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
4671 vcpu->guest_xcr0_loaded = 0;
4672 }
4673}
4674
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{ 4676{
4511 int r; 4677 int r;
4512 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513 vcpu->run->request_interrupt_window; 4679 vcpu->run->request_interrupt_window;
4514 4680
4515 if (vcpu->requests)
4516 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517 kvm_mmu_unload(vcpu);
4518
4519 r = kvm_mmu_reload(vcpu);
4520 if (unlikely(r))
4521 goto out;
4522
4523 if (vcpu->requests) { 4681 if (vcpu->requests) {
4524 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
4683 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4525 __kvm_migrate_timers(vcpu); 4685 __kvm_migrate_timers(vcpu);
4526 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
4527 kvm_write_guest_time(vcpu); 4687 kvm_write_guest_time(vcpu);
4528 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4529 kvm_mmu_sync_roots(vcpu); 4689 kvm_mmu_sync_roots(vcpu);
4530 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
4531 kvm_x86_ops->tlb_flush(vcpu); 4691 kvm_x86_ops->tlb_flush(vcpu);
4532 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
4533 &vcpu->requests)) {
4534 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535 r = 0; 4694 r = 0;
4536 goto out; 4695 goto out;
4537 } 4696 }
4538 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
4539 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540 r = 0; 4699 r = 0;
4541 goto out; 4700 goto out;
4542 } 4701 }
4543 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
4544 vcpu->fpu_active = 0; 4703 vcpu->fpu_active = 0;
4545 kvm_x86_ops->fpu_deactivate(vcpu); 4704 kvm_x86_ops->fpu_deactivate(vcpu);
4546 } 4705 }
4547 } 4706 }
4548 4707
4708 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r))
4710 goto out;
4711
4549 preempt_disable(); 4712 preempt_disable();
4550 4713
4551 kvm_x86_ops->prepare_guest_switch(vcpu); 4714 kvm_x86_ops->prepare_guest_switch(vcpu);
4552 if (vcpu->fpu_active) 4715 if (vcpu->fpu_active)
4553 kvm_load_guest_fpu(vcpu); 4716 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu);
4554 4718
4555 local_irq_disable(); 4719 atomic_set(&vcpu->guest_mode, 1);
4720 smp_wmb();
4556 4721
4557 clear_bit(KVM_REQ_KICK, &vcpu->requests); 4722 local_irq_disable();
4558 smp_mb__after_clear_bit();
4559 4723
4560 if (vcpu->requests || need_resched() || signal_pending(current)) { 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
4561 set_bit(KVM_REQ_KICK, &vcpu->requests); 4725 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0);
4727 smp_wmb();
4562 local_irq_enable(); 4728 local_irq_enable();
4563 preempt_enable(); 4729 preempt_enable();
4564 r = 1; 4730 r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4603 if (hw_breakpoint_active()) 4769 if (hw_breakpoint_active())
4604 hw_breakpoint_restore(); 4770 hw_breakpoint_restore();
4605 4771
4606 set_bit(KVM_REQ_KICK, &vcpu->requests); 4772 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb();
4607 local_irq_enable(); 4774 local_irq_enable();
4608 4775
4609 ++vcpu->stat.exits; 4776 ++vcpu->stat.exits;
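The KVM_REQ_KICK bit gives way to an explicit guest_mode flag: the vCPU thread sets it, issues a barrier, then re-checks vcpu->requests before entry, while a kicker sets its request, issues a barrier, and only sends the IPI if guest_mode is still set. The handshake in standalone C11-atomics form; the names are illustrative, not KVM's:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int guest_mode;
static atomic_ulong requests;

/* vCPU side: returns true if it is safe to enter the guest */
static bool try_enter_guest(void)
{
	atomic_store_explicit(&guest_mode, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with the kicker */

	if (atomic_load_explicit(&requests, memory_order_relaxed)) {
		atomic_store_explicit(&guest_mode, 0, memory_order_relaxed);
		return false;	/* service the request first */
	}
	return true;
}

/* kicker side: returns true if an IPI is needed to pull the vCPU out */
static bool make_request(unsigned long req)
{
	atomic_fetch_or_explicit(&requests, 1ul << req, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with the vCPU side */

	return atomic_load_explicit(&guest_mode, memory_order_relaxed) != 0;
}

With both fences in place at least one side is guaranteed to observe the other's write, so a request can never slip in unnoticed while the guest is running.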
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666 kvm_vcpu_block(vcpu); 4833 kvm_vcpu_block(vcpu);
4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
4669 { 4836 {
4670 switch(vcpu->arch.mp_state) { 4837 switch(vcpu->arch.mp_state) {
4671 case KVM_MP_STATE_HALTED: 4838 case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4717 int r; 4884 int r;
4718 sigset_t sigsaved; 4885 sigset_t sigsaved;
4719 4886
4720 vcpu_load(vcpu);
4721
4722 if (vcpu->sigset_active) 4887 if (vcpu->sigset_active)
4723 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724 4889
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746 if (r == EMULATE_DO_MMIO) { 4911 if (r != EMULATE_DONE) {
4747 r = 0; 4912 r = 0;
4748 goto out; 4913 goto out;
4749 } 4914 }
@@ -4759,14 +4924,11 @@ out:
4759 if (vcpu->sigset_active) 4924 if (vcpu->sigset_active)
4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761 4926
4762 vcpu_put(vcpu);
4763 return r; 4927 return r;
4764} 4928}
4765 4929
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{ 4931{
4768 vcpu_load(vcpu);
4769
4770 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4789 regs->rip = kvm_rip_read(vcpu); 4951 regs->rip = kvm_rip_read(vcpu);
4790 regs->rflags = kvm_get_rflags(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu);
4791 4953
4792 vcpu_put(vcpu);
4793
4794 return 0; 4954 return 0;
4795} 4955}
4796 4956
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{ 4958{
4799 vcpu_load(vcpu);
4800
4801 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4822 4980
4823 vcpu->arch.exception.pending = false; 4981 vcpu->arch.exception.pending = false;
4824 4982
4825 vcpu_put(vcpu);
4826
4827 return 0; 4983 return 0;
4828} 4984}
4829 4985
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4842{ 4998{
4843 struct desc_ptr dt; 4999 struct desc_ptr dt;
4844 5000
4845 vcpu_load(vcpu);
4846
4847 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4875 set_bit(vcpu->arch.interrupt.nr, 5029 set_bit(vcpu->arch.interrupt.nr,
4876 (unsigned long *)sregs->interrupt_bitmap); 5030 (unsigned long *)sregs->interrupt_bitmap);
4877 5031
4878 vcpu_put(vcpu);
4879
4880 return 0; 5032 return 0;
4881} 5033}
4882 5034
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5035int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884 struct kvm_mp_state *mp_state) 5036 struct kvm_mp_state *mp_state)
4885{ 5037{
4886 vcpu_load(vcpu);
4887 mp_state->mp_state = vcpu->arch.mp_state; 5038 mp_state->mp_state = vcpu->arch.mp_state;
4888 vcpu_put(vcpu);
4889 return 0; 5039 return 0;
4890} 5040}
4891 5041
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5042int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893 struct kvm_mp_state *mp_state) 5043 struct kvm_mp_state *mp_state)
4894{ 5044{
4895 vcpu_load(vcpu);
4896 vcpu->arch.mp_state = mp_state->mp_state; 5045 vcpu->arch.mp_state = mp_state->mp_state;
4897 vcpu_put(vcpu);
4898 return 0; 5046 return 0;
4899} 5047}
4900 5048
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5049int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902 bool has_error_code, u32 error_code) 5050 bool has_error_code, u32 error_code)
4903{ 5051{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4904 int cs_db, cs_l, ret; 5053 int cs_db, cs_l, ret;
4905 cache_all_regs(vcpu); 5054 cache_all_regs(vcpu);
4906 5055
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4915 ? X86EMUL_MODE_VM86 : cs_l 5064 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db 5065 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4918 5069
4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920 tss_selector, reason, has_error_code, 5071 tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4923 if (ret) 5074 if (ret)
4924 return EMULATE_FAIL; 5075 return EMULATE_FAIL;
4925 5076
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4927 return EMULATE_DONE; 5080 return EMULATE_DONE;
4928} 5081}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4935 int pending_vec, max_bits; 5088 int pending_vec, max_bits;
4936 struct desc_ptr dt; 5089 struct desc_ptr dt;
4937 5090
4938 vcpu_load(vcpu);
4939
4940 dt.size = sregs->idt.limit; 5091 dt.size = sregs->idt.limit;
4941 dt.address = sregs->idt.base; 5092 dt.address = sregs->idt.base;
4942 kvm_x86_ops->set_idt(vcpu, &dt); 5093 kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4996 !is_protmode(vcpu)) 5147 !is_protmode(vcpu))
4997 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4998 5149
4999 vcpu_put(vcpu);
5000
5001 return 0; 5150 return 0;
5002} 5151}
5003 5152
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5007 unsigned long rflags; 5156 unsigned long rflags;
5008 int i, r; 5157 int i, r;
5009 5158
5010 vcpu_load(vcpu);
5011
5012 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5159 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
5013 r = -EBUSY; 5160 r = -EBUSY;
5014 if (vcpu->arch.exception.pending) 5161 if (vcpu->arch.exception.pending)
5015 goto unlock_out; 5162 goto out;
5016 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5163 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5017 kvm_queue_exception(vcpu, DB_VECTOR); 5164 kvm_queue_exception(vcpu, DB_VECTOR);
5018 else 5165 else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5054 5201
5055 r = 0; 5202 r = 0;
5056 5203
5057unlock_out: 5204out:
5058 vcpu_put(vcpu);
5059 5205
5060 return r; 5206 return r;
5061} 5207}
5062 5208
5063/* 5209/*
5064 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
5065 * we have asm/x86/processor.h
5066 */
5067struct fxsave {
5068 u16 cwd;
5069 u16 swd;
5070 u16 twd;
5071 u16 fop;
5072 u64 rip;
5073 u64 rdp;
5074 u32 mxcsr;
5075 u32 mxcsr_mask;
5076 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
5077#ifdef CONFIG_X86_64
5078 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
5079#else
5080 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
5081#endif
5082};
5083
5084/*
5085 * Translate a guest virtual address to a guest physical address. 5210 * Translate a guest virtual address to a guest physical address.
5086 */ 5211 */
5087int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5212int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5091 gpa_t gpa; 5216 gpa_t gpa;
5092 int idx; 5217 int idx;
5093 5218
5094 vcpu_load(vcpu);
5095 idx = srcu_read_lock(&vcpu->kvm->srcu); 5219 idx = srcu_read_lock(&vcpu->kvm->srcu);
5096 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5220 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
5097 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5221 srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5099 tr->valid = gpa != UNMAPPED_GVA; 5223 tr->valid = gpa != UNMAPPED_GVA;
5100 tr->writeable = 1; 5224 tr->writeable = 1;
5101 tr->usermode = 0; 5225 tr->usermode = 0;
5102 vcpu_put(vcpu);
5103 5226
5104 return 0; 5227 return 0;
5105} 5228}
5106 5229
5107int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5230int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5108{ 5231{
5109 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5232 struct i387_fxsave_struct *fxsave =
5110 5233 &vcpu->arch.guest_fpu.state->fxsave;
5111 vcpu_load(vcpu);
5112 5234
5113 memcpy(fpu->fpr, fxsave->st_space, 128); 5235 memcpy(fpu->fpr, fxsave->st_space, 128);
5114 fpu->fcw = fxsave->cwd; 5236 fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5119 fpu->last_dp = fxsave->rdp; 5241 fpu->last_dp = fxsave->rdp;
5120 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5242 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
5121 5243
5122 vcpu_put(vcpu);
5123
5124 return 0; 5244 return 0;
5125} 5245}
5126 5246
5127int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5247int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5128{ 5248{
5129 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5249 struct i387_fxsave_struct *fxsave =
5130 5250 &vcpu->arch.guest_fpu.state->fxsave;
5131 vcpu_load(vcpu);
5132 5251
5133 memcpy(fxsave->st_space, fpu->fpr, 128); 5252 memcpy(fxsave->st_space, fpu->fpr, 128);
5134 fxsave->cwd = fpu->fcw; 5253 fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5139 fxsave->rdp = fpu->last_dp; 5258 fxsave->rdp = fpu->last_dp;
5140 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5259 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
5141 5260
5142 vcpu_put(vcpu);
5143
5144 return 0; 5261 return 0;
5145} 5262}
5146 5263
5147void fx_init(struct kvm_vcpu *vcpu) 5264int fx_init(struct kvm_vcpu *vcpu)
5148{ 5265{
5149 unsigned after_mxcsr_mask; 5266 int err;
5267
5268 err = fpu_alloc(&vcpu->arch.guest_fpu);
5269 if (err)
5270 return err;
5271
5272 fpu_finit(&vcpu->arch.guest_fpu);
5150 5273
5151 /* 5274 /*
5152 * Touch the fpu the first time in non atomic context as if 5275 * Ensure guest xcr0 is valid for loading
5153 * this is the first fpu instruction the exception handler
5154 * will fire before the instruction returns and it'll have to
5155 * allocate ram with GFP_KERNEL.
5156 */ 5276 */
5157 if (!used_math()) 5277 vcpu->arch.xcr0 = XSTATE_FP;
5158 kvm_fx_save(&vcpu->arch.host_fx_image);
5159
5160 /* Initialize guest FPU by resetting ours and saving into guest's */
5161 preempt_disable();
5162 kvm_fx_save(&vcpu->arch.host_fx_image);
5163 kvm_fx_finit();
5164 kvm_fx_save(&vcpu->arch.guest_fx_image);
5165 kvm_fx_restore(&vcpu->arch.host_fx_image);
5166 preempt_enable();
5167 5278
5168 vcpu->arch.cr0 |= X86_CR0_ET; 5279 vcpu->arch.cr0 |= X86_CR0_ET;
5169 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 5280
5170 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 5281 return 0;
5171 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
5172 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
5173} 5282}
5174EXPORT_SYMBOL_GPL(fx_init); 5283EXPORT_SYMBOL_GPL(fx_init);
5175 5284
5285static void fx_free(struct kvm_vcpu *vcpu)
5286{
5287 fpu_free(&vcpu->arch.guest_fpu);
5288}
5289
5176void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5290void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
5177{ 5291{
5178 if (vcpu->guest_fpu_loaded) 5292 if (vcpu->guest_fpu_loaded)
5179 return; 5293 return;
5180 5294
5295 /*
5296 * Restore all possible states in the guest,
5297 * and assume host would use all available bits.
5298 * Guest xcr0 would be loaded later.
5299 */
5300 kvm_put_guest_xcr0(vcpu);
5181 vcpu->guest_fpu_loaded = 1; 5301 vcpu->guest_fpu_loaded = 1;
5182 kvm_fx_save(&vcpu->arch.host_fx_image); 5302 unlazy_fpu(current);
5183 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5303 fpu_restore_checking(&vcpu->arch.guest_fpu);
5184 trace_kvm_fpu(1); 5304 trace_kvm_fpu(1);
5185} 5305}
5186 5306
5187void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5307void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5188{ 5308{
5309 kvm_put_guest_xcr0(vcpu);
5310
5189 if (!vcpu->guest_fpu_loaded) 5311 if (!vcpu->guest_fpu_loaded)
5190 return; 5312 return;
5191 5313
5192 vcpu->guest_fpu_loaded = 0; 5314 vcpu->guest_fpu_loaded = 0;
5193 kvm_fx_save(&vcpu->arch.guest_fx_image); 5315 fpu_save_init(&vcpu->arch.guest_fpu);
5194 kvm_fx_restore(&vcpu->arch.host_fx_image);
5195 ++vcpu->stat.fpu_reload; 5316 ++vcpu->stat.fpu_reload;
5196 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); 5317 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
5197 trace_kvm_fpu(0); 5318 trace_kvm_fpu(0);
5198} 5319}
5199 5320
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5204 vcpu->arch.time_page = NULL; 5325 vcpu->arch.time_page = NULL;
5205 } 5326 }
5206 5327
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu);
5207 kvm_x86_ops->vcpu_free(vcpu); 5330 kvm_x86_ops->vcpu_free(vcpu);
5208} 5331}
5209 5332
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
5217{ 5340{
5218 int r; 5341 int r;
5219 5342
5220 /* We do fxsave: this must be aligned. */
5221 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
5222
5223 vcpu->arch.mtrr_state.have_fixed = 1; 5343 vcpu->arch.mtrr_state.have_fixed = 1;
5224 vcpu_load(vcpu); 5344 vcpu_load(vcpu);
5225 r = kvm_arch_vcpu_reset(vcpu); 5345 r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5241 kvm_mmu_unload(vcpu); 5361 kvm_mmu_unload(vcpu);
5242 vcpu_put(vcpu); 5362 vcpu_put(vcpu);
5243 5363
5364 fx_free(vcpu);
5244 kvm_x86_ops->vcpu_free(vcpu); 5365 kvm_x86_ops->vcpu_free(vcpu);
5245} 5366}
5246 5367
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5334 } 5455 }
5335 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5456 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5336 5457
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks;
5460
5337 return 0; 5461 return 0;
5462fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks);
5338fail_free_lapic: 5464fail_free_lapic:
5339 kvm_free_lapic(vcpu); 5465 kvm_free_lapic(vcpu);
5340fail_mmu_destroy: 5466fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
5364 if (!kvm) 5490 if (!kvm)
5365 return ERR_PTR(-ENOMEM); 5491 return ERR_PTR(-ENOMEM);
5366 5492
5367 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5368 if (!kvm->arch.aliases) {
5369 kfree(kvm);
5370 return ERR_PTR(-ENOMEM);
5371 }
5372
5373 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5374 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5375 5495
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
5412void kvm_arch_sync_events(struct kvm *kvm) 5532void kvm_arch_sync_events(struct kvm *kvm)
5413{ 5533{
5414 kvm_free_all_assigned_devices(kvm); 5534 kvm_free_all_assigned_devices(kvm);
5535 kvm_free_pit(kvm);
5415} 5536}
5416 5537
5417void kvm_arch_destroy_vm(struct kvm *kvm) 5538void kvm_arch_destroy_vm(struct kvm *kvm)
5418{ 5539{
5419 kvm_iommu_unmap_guest(kvm); 5540 kvm_iommu_unmap_guest(kvm);
5420 kvm_free_pit(kvm);
5421 kfree(kvm->arch.vpic); 5541 kfree(kvm->arch.vpic);
5422 kfree(kvm->arch.vioapic); 5542 kfree(kvm->arch.vioapic);
5423 kvm_free_vcpus(kvm); 5543 kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5427 if (kvm->arch.ept_identity_pagetable) 5547 if (kvm->arch.ept_identity_pagetable)
5428 put_page(kvm->arch.ept_identity_pagetable); 5548 put_page(kvm->arch.ept_identity_pagetable);
5429 cleanup_srcu_struct(&kvm->srcu); 5549 cleanup_srcu_struct(&kvm->srcu);
5430 kfree(kvm->arch.aliases);
5431 kfree(kvm); 5550 kfree(kvm);
5432} 5551}
5433 5552
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5438 int user_alloc) 5557 int user_alloc)
5439{ 5558{
5440 int npages = memslot->npages; 5559 int npages = memslot->npages;
5560 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
5561
5562 /* Prevent internal slot pages from being moved by fork()/COW. */
5563 if (memslot->id >= KVM_MEMORY_SLOTS)
5564 map_flags = MAP_SHARED | MAP_ANONYMOUS;
5441 5565
5442 /*To keep backward compatibility with older userspace, 5566 /*To keep backward compatibility with older userspace,
 5443 *x86 needs to handle !user_alloc case. 5567
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5450 userspace_addr = do_mmap(NULL, 0, 5574 userspace_addr = do_mmap(NULL, 0,
5451 npages * PAGE_SIZE, 5575 npages * PAGE_SIZE,
5452 PROT_READ | PROT_WRITE, 5576 PROT_READ | PROT_WRITE,
5453 MAP_PRIVATE | MAP_ANONYMOUS, 5577 map_flags,
5454 0); 5578 0);
5455 up_write(&current->mm->mmap_sem); 5579 up_write(&current->mm->mmap_sem);
5456 5580
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5523 5647
5524 me = get_cpu(); 5648 me = get_cpu();
5525 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5526 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5650 if (atomic_xchg(&vcpu->guest_mode, 0))
5527 smp_send_reschedule(cpu); 5651 smp_send_reschedule(cpu);
5528 put_cpu(); 5652 put_cpu();
5529} 5653}
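
The x86.c hunks above move KVM off its private fxsave image and kvm_fx_* helpers onto the core kernel FPU API: fx_init() now allocates state with fpu_alloc()/fpu_finit(), and kvm_load_guest_fpu()/kvm_put_guest_fpu() restore and save it lazily around guest entry, guarded by guest_fpu_loaded. A toy userspace analogue of that lazy load/put pattern (names and the 512-byte buffer are illustrative, not KVM's; the real helpers touch hardware FPU state):

/* Toy model: allocate guest FPU state lazily and only save/restore it
 * when the "loaded" flag actually changes, as the patched helpers do. */
#include <stdio.h>
#include <stdlib.h>

struct fpu_state { unsigned char buf[512]; };    /* stand-in for an fxsave area */

struct vcpu {
	struct fpu_state *guest_fpu;     /* allocated in fx_init(), like fpu_alloc() */
	int guest_fpu_loaded;            /* mirrors vcpu->guest_fpu_loaded           */
};

static int fx_init_toy(struct vcpu *v)
{
	v->guest_fpu = calloc(1, sizeof(*v->guest_fpu));
	return v->guest_fpu ? 0 : -1;    /* fpu_alloc() can fail the same way */
}

static void load_guest_fpu_toy(struct vcpu *v)
{
	if (v->guest_fpu_loaded)
		return;                          /* already resident, nothing to do */
	v->guest_fpu_loaded = 1;
	printf("restore guest FPU state\n");     /* fpu_restore_checking() in the patch */
}

static void put_guest_fpu_toy(struct vcpu *v)
{
	if (!v->guest_fpu_loaded)
		return;
	v->guest_fpu_loaded = 0;
	printf("save guest FPU state\n");        /* fpu_save_init() in the patch */
}

int main(void)
{
	struct vcpu v = { 0 };

	if (fx_init_toy(&v))
		return 1;
	load_guest_fpu_toy(&v);
	load_guest_fpu_toy(&v);          /* second call is a no-op */
	put_guest_fpu_toy(&v);
	free(v.guest_fpu);
	return 0;
}
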
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
77 70
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b6965..e10cf070ede0 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
30 lib-y += checksum_32.o 30 lib-y += checksum_32.o
31 lib-y += strstr_32.o 31 lib-y += strstr_32.o
32 lib-y += semaphore_32.o string_32.o 32 lib-y += semaphore_32.o string_32.o
33 lib-y += cmpxchg.o
33ifneq ($(CONFIG_X86_CMPXCHG64),y) 34ifneq ($(CONFIG_X86_CMPXCHG64),y)
34 lib-y += cmpxchg8b_emu.o atomic64_386_32.o 35 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
35endif 36endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 4a5979aa6883..2cda60a06e65 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -25,150 +25,172 @@
25 CFI_ADJUST_CFA_OFFSET -4 25 CFI_ADJUST_CFA_OFFSET -4
26.endm 26.endm
27 27
28.macro BEGIN func reg 28#define BEGIN(op) \
29$v = \reg 29.macro endp; \
30 30 CFI_ENDPROC; \
31ENTRY(atomic64_\func\()_386) 31ENDPROC(atomic64_##op##_386); \
32 CFI_STARTPROC 32.purgem endp; \
33 LOCK $v 33.endm; \
34 34ENTRY(atomic64_##op##_386); \
35.macro RETURN 35 CFI_STARTPROC; \
36 UNLOCK $v 36 LOCK v;
37
38#define ENDP endp
39
40#define RET \
41 UNLOCK v; \
37 ret 42 ret
38.endm
39
40.macro END_
41 CFI_ENDPROC
42ENDPROC(atomic64_\func\()_386)
43.purgem RETURN
44.purgem END_
45.purgem END
46.endm
47
48.macro END
49RETURN
50END_
51.endm
52.endm
53 43
54BEGIN read %ecx 44#define RET_ENDP \
55 movl ($v), %eax 45 RET; \
56 movl 4($v), %edx 46 ENDP
57END 47
58 48#define v %ecx
59BEGIN set %esi 49BEGIN(read)
60 movl %ebx, ($v) 50 movl (v), %eax
61 movl %ecx, 4($v) 51 movl 4(v), %edx
62END 52RET_ENDP
63 53#undef v
64BEGIN xchg %esi 54
65 movl ($v), %eax 55#define v %esi
66 movl 4($v), %edx 56BEGIN(set)
67 movl %ebx, ($v) 57 movl %ebx, (v)
68 movl %ecx, 4($v) 58 movl %ecx, 4(v)
69END 59RET_ENDP
70 60#undef v
71BEGIN add %ecx 61
72 addl %eax, ($v) 62#define v %esi
73 adcl %edx, 4($v) 63BEGIN(xchg)
74END 64 movl (v), %eax
75 65 movl 4(v), %edx
76BEGIN add_return %ecx 66 movl %ebx, (v)
77 addl ($v), %eax 67 movl %ecx, 4(v)
78 adcl 4($v), %edx 68RET_ENDP
79 movl %eax, ($v) 69#undef v
80 movl %edx, 4($v) 70
81END 71#define v %ecx
82 72BEGIN(add)
83BEGIN sub %ecx 73 addl %eax, (v)
84 subl %eax, ($v) 74 adcl %edx, 4(v)
85 sbbl %edx, 4($v) 75RET_ENDP
86END 76#undef v
87 77
88BEGIN sub_return %ecx 78#define v %ecx
79BEGIN(add_return)
80 addl (v), %eax
81 adcl 4(v), %edx
82 movl %eax, (v)
83 movl %edx, 4(v)
84RET_ENDP
85#undef v
86
87#define v %ecx
88BEGIN(sub)
89 subl %eax, (v)
90 sbbl %edx, 4(v)
91RET_ENDP
92#undef v
93
94#define v %ecx
95BEGIN(sub_return)
89 negl %edx 96 negl %edx
90 negl %eax 97 negl %eax
91 sbbl $0, %edx 98 sbbl $0, %edx
92 addl ($v), %eax 99 addl (v), %eax
93 adcl 4($v), %edx 100 adcl 4(v), %edx
94 movl %eax, ($v) 101 movl %eax, (v)
95 movl %edx, 4($v) 102 movl %edx, 4(v)
96END 103RET_ENDP
97 104#undef v
98BEGIN inc %esi 105
99 addl $1, ($v) 106#define v %esi
100 adcl $0, 4($v) 107BEGIN(inc)
101END 108 addl $1, (v)
102 109 adcl $0, 4(v)
103BEGIN inc_return %esi 110RET_ENDP
104 movl ($v), %eax 111#undef v
105 movl 4($v), %edx 112
113#define v %esi
114BEGIN(inc_return)
115 movl (v), %eax
116 movl 4(v), %edx
106 addl $1, %eax 117 addl $1, %eax
107 adcl $0, %edx 118 adcl $0, %edx
108 movl %eax, ($v) 119 movl %eax, (v)
109 movl %edx, 4($v) 120 movl %edx, 4(v)
110END 121RET_ENDP
111 122#undef v
112BEGIN dec %esi 123
113 subl $1, ($v) 124#define v %esi
114 sbbl $0, 4($v) 125BEGIN(dec)
115END 126 subl $1, (v)
116 127 sbbl $0, 4(v)
117BEGIN dec_return %esi 128RET_ENDP
118 movl ($v), %eax 129#undef v
119 movl 4($v), %edx 130
131#define v %esi
132BEGIN(dec_return)
133 movl (v), %eax
134 movl 4(v), %edx
120 subl $1, %eax 135 subl $1, %eax
121 sbbl $0, %edx 136 sbbl $0, %edx
122 movl %eax, ($v) 137 movl %eax, (v)
123 movl %edx, 4($v) 138 movl %edx, 4(v)
124END 139RET_ENDP
140#undef v
125 141
126BEGIN add_unless %ecx 142#define v %ecx
143BEGIN(add_unless)
127 addl %eax, %esi 144 addl %eax, %esi
128 adcl %edx, %edi 145 adcl %edx, %edi
129 addl ($v), %eax 146 addl (v), %eax
130 adcl 4($v), %edx 147 adcl 4(v), %edx
131 cmpl %eax, %esi 148 cmpl %eax, %esi
132 je 3f 149 je 3f
1331: 1501:
134 movl %eax, ($v) 151 movl %eax, (v)
135 movl %edx, 4($v) 152 movl %edx, 4(v)
136 movl $1, %eax 153 movl $1, %eax
1372: 1542:
138RETURN 155 RET
1393: 1563:
140 cmpl %edx, %edi 157 cmpl %edx, %edi
141 jne 1b 158 jne 1b
142 xorl %eax, %eax 159 xorl %eax, %eax
143 jmp 2b 160 jmp 2b
144END_ 161ENDP
162#undef v
145 163
146BEGIN inc_not_zero %esi 164#define v %esi
147 movl ($v), %eax 165BEGIN(inc_not_zero)
148 movl 4($v), %edx 166 movl (v), %eax
167 movl 4(v), %edx
149 testl %eax, %eax 168 testl %eax, %eax
150 je 3f 169 je 3f
1511: 1701:
152 addl $1, %eax 171 addl $1, %eax
153 adcl $0, %edx 172 adcl $0, %edx
154 movl %eax, ($v) 173 movl %eax, (v)
155 movl %edx, 4($v) 174 movl %edx, 4(v)
156 movl $1, %eax 175 movl $1, %eax
1572: 1762:
158RETURN 177 RET
1593: 1783:
160 testl %edx, %edx 179 testl %edx, %edx
161 jne 1b 180 jne 1b
162 jmp 2b 181 jmp 2b
163END_ 182ENDP
183#undef v
164 184
165BEGIN dec_if_positive %esi 185#define v %esi
166 movl ($v), %eax 186BEGIN(dec_if_positive)
167 movl 4($v), %edx 187 movl (v), %eax
188 movl 4(v), %edx
168 subl $1, %eax 189 subl $1, %eax
169 sbbl $0, %edx 190 sbbl $0, %edx
170 js 1f 191 js 1f
171 movl %eax, ($v) 192 movl %eax, (v)
172 movl %edx, 4($v) 193 movl %edx, 4(v)
1731: 1941:
174END 195RET_ENDP
196#undef v
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a9..aa4326bfb24a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -52,7 +52,7 @@ ENDPROC(clear_page)
52 .align 8 52 .align 8
53 .quad clear_page 53 .quad clear_page
54 .quad 1b 54 .quad 1b
55 .byte X86_FEATURE_REP_GOOD 55 .word X86_FEATURE_REP_GOOD
56 .byte .Lclear_page_end - clear_page 56 .byte .Lclear_page_end - clear_page
57 .byte 2b - 1b 57 .byte 2b - 1b
58 .previous 58 .previous
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c
index 2056ccf572cc..5d619f6df3ee 100644
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ b/arch/x86/lib/cmpxchg.c
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
52} 52}
53EXPORT_SYMBOL(cmpxchg_386_u32); 53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif 54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
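
The helper deleted above emulated a 64-bit compare-and-exchange for CPUs without cmpxchg8b by doing the read-compare-write under local_irq_save(), and is presumably superseded by the cmpxchg8b_emu.o object the Makefile hunk already builds for the !CONFIG_X86_CMPXCHG64 case. The same contract can be sketched in userspace with a mutex standing in for the irq-off window (illustrative only, not kernel code):

/* Emulated 64-bit cmpxchg: returns the previous value and stores 'new'
 * only if the previous value matched 'old', all under one lock. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static uint64_t cmpxchg64_emulated(volatile uint64_t *ptr, uint64_t old, uint64_t new)
{
	uint64_t prev;

	pthread_mutex_lock(&lock);
	prev = *ptr;
	if (prev == old)
		*ptr = new;
	pthread_mutex_unlock(&lock);
	return prev;
}

int main(void)
{
	volatile uint64_t v = 1;
	uint64_t prev = cmpxchg64_emulated(&v, 1, 5);

	printf("prev=%llu now=%llu\n",
	       (unsigned long long)prev, (unsigned long long)v);   /* prev=1 now=5 */
	return 0;
}
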
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2fc..6fec2d1cebe1 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -113,7 +113,7 @@ ENDPROC(copy_page)
113 .align 8 113 .align 8
114 .quad copy_page 114 .quad copy_page
115 .quad 1b 115 .quad 1b
116 .byte X86_FEATURE_REP_GOOD 116 .word X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page 117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b 118 .byte 2b - 1b
119 .previous 119 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 71100c98e337..a460158b5ac5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -29,7 +29,7 @@
29 .align 8 29 .align 8
30 .quad 0b 30 .quad 0b
31 .quad 2b 31 .quad 2b
32 .byte \feature /* when feature is set */ 32 .word \feature /* when feature is set */
33 .byte 5 33 .byte 5
34 .byte 5 34 .byte 5
35 .previous 35 .previous
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928af..bcbcd1e0f7d5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -131,7 +131,7 @@ ENDPROC(__memcpy)
131 .align 8 131 .align 8
132 .quad memcpy 132 .quad memcpy
133 .quad .Lmemcpy_c 133 .quad .Lmemcpy_c
134 .byte X86_FEATURE_REP_GOOD 134 .word X86_FEATURE_REP_GOOD
135 135
136 /* 136 /*
137 * Replace only beginning, memcpy is used to apply alternatives, 137 * Replace only beginning, memcpy is used to apply alternatives,
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e88d3b81644a..09d344269652 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -121,7 +121,7 @@ ENDPROC(__memset)
121 .align 8 121 .align 8
122 .quad memset 122 .quad memset
123 .quad .Lmemset_c 123 .quad .Lmemset_c
124 .byte X86_FEATURE_REP_GOOD 124 .word X86_FEATURE_REP_GOOD
125 .byte .Lfinal - memset 125 .byte .Lfinal - memset
126 .byte .Lmemset_e - .Lmemset_c 126 .byte .Lmemset_e - .Lmemset_c
127 .previous 127 .previous
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a725b7f760ae..0002a3a33081 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -37,6 +37,28 @@ struct addr_marker {
37 const char *name; 37 const char *name;
38}; 38};
39 39
40/* indices for address_markers; keep sync'd w/ address_markers below */
41enum address_markers_idx {
42 USER_SPACE_NR = 0,
43#ifdef CONFIG_X86_64
44 KERNEL_SPACE_NR,
45 LOW_KERNEL_NR,
46 VMALLOC_START_NR,
47 VMEMMAP_START_NR,
48 HIGH_KERNEL_NR,
49 MODULES_VADDR_NR,
50 MODULES_END_NR,
51#else
52 KERNEL_SPACE_NR,
53 VMALLOC_START_NR,
54 VMALLOC_END_NR,
55# ifdef CONFIG_HIGHMEM
56 PKMAP_BASE_NR,
57# endif
58 FIXADDR_START_NR,
59#endif
60};
61
40/* Address space markers hints */ 62/* Address space markers hints */
41static struct addr_marker address_markers[] = { 63static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 64 { 0, "User Space" },
@@ -331,14 +353,12 @@ static int pt_dump_init(void)
331 353
332#ifdef CONFIG_X86_32 354#ifdef CONFIG_X86_32
333 /* Not a compile-time constant on x86-32 */ 355 /* Not a compile-time constant on x86-32 */
334 address_markers[2].start_address = VMALLOC_START; 356 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
335 address_markers[3].start_address = VMALLOC_END; 357 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
336# ifdef CONFIG_HIGHMEM 358# ifdef CONFIG_HIGHMEM
337 address_markers[4].start_address = PKMAP_BASE; 359 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
338 address_markers[5].start_address = FIXADDR_START;
339# else
340 address_markers[4].start_address = FIXADDR_START;
341# endif 360# endif
361 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
342#endif 362#endif
343 363
344 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, 364 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
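
The dump_pagetables.c change replaces hard-coded indices into address_markers[] with an enum whose comment asks that the two be kept in sync. A standalone sketch of that enum-indexed-table pattern, with a compile-time size check added for illustration (the kernel hunk relies on the comment alone; the names and addresses below are made up):

/* Enum names index the table; _Static_assert catches a table that falls
 * out of step with the enum at compile time. */
#include <stdio.h>

enum marker_idx { USER_SPACE_NR, KERNEL_SPACE_NR, VMALLOC_START_NR, NR_MARKERS };

static struct { unsigned long start; const char *name; } markers[] = {
	[USER_SPACE_NR]    = { 0,           "User Space"   },
	[KERNEL_SPACE_NR]  = { 0xc0000000,  "Kernel Space" },
	[VMALLOC_START_NR] = { 0,           "vmalloc area" },  /* patched at runtime */
};

_Static_assert(sizeof(markers) / sizeof(markers[0]) == NR_MARKERS,
	       "marker table out of sync with enum");

int main(void)
{
	/* Like pt_dump_init(): fill in the value that is not a compile-time constant. */
	markers[VMALLOC_START_NR].start = 0xf8000000UL;

	for (int i = 0; i < NR_MARKERS; i++)
		printf("%#lx %s\n", markers[i].start, markers[i].name);
	return 0;
}
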
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f62777940dfb..4c4508e8a204 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
802 up_read(&mm->mmap_sem); 802 up_read(&mm->mmap_sem);
803 803
804 /* Kernel mode? Handle exceptions or die: */ 804 /* Kernel mode? Handle exceptions or die: */
805 if (!(error_code & PF_USER)) 805 if (!(error_code & PF_USER)) {
806 no_context(regs, error_code, address); 806 no_context(regs, error_code, address);
807 return;
808 }
807 809
808 /* User-space => ok to do another page fault: */ 810 /* User-space => ok to do another page fault: */
809 if (is_prefetch(regs, error_code, address)) 811 if (is_prefetch(regs, error_code, address))
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 63a6ba66cbe0..5e8fa12ef861 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
53 return kmap_atomic_prot(page, type, kmap_prot); 53 return kmap_atomic_prot(page, type, kmap_prot);
54} 54}
55 55
56void kunmap_atomic(void *kvaddr, enum km_type type) 56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
57{ 57{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr)
102EXPORT_SYMBOL(kmap); 102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot); 106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 107EXPORT_SYMBOL(kmap_atomic_to_page);
108 108
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d1..9a6674689a20 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
2 * linux/arch/x86_64/mm/init.c 2 * linux/arch/x86_64/mm/init.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> 6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */ 7 */
8 8
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c110..3ba6e0608c55 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
62static void __iomem *__ioremap_caller(resource_size_t phys_addr, 62static void __iomem *__ioremap_caller(resource_size_t phys_addr,
63 unsigned long size, unsigned long prot_val, void *caller) 63 unsigned long size, unsigned long prot_val, void *caller)
64{ 64{
65 unsigned long pfn, offset, vaddr; 65 unsigned long offset, vaddr;
66 resource_size_t last_addr; 66 resource_size_t pfn, last_pfn, last_addr;
67 const resource_size_t unaligned_phys_addr = phys_addr; 67 const resource_size_t unaligned_phys_addr = phys_addr;
68 const unsigned long unaligned_size = size; 68 const unsigned long unaligned_size = size;
69 struct vm_struct *area; 69 struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
100 /* 100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 101 * Don't allow anybody to remap normal RAM that we're using..
102 */ 102 */
103 for (pfn = phys_addr >> PAGE_SHIFT; 103 last_pfn = last_addr >> PAGE_SHIFT;
104 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); 104 for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
105 pfn++) {
106
107 int is_ram = page_is_ram(pfn); 105 int is_ram = page_is_ram(pfn);
108 106
109 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) 107 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
115 * Mappings have to be page-aligned 113 * Mappings have to be page-aligned
116 */ 114 */
117 offset = phys_addr & ~PAGE_MASK; 115 offset = phys_addr & ~PAGE_MASK;
118 phys_addr &= PAGE_MASK; 116 phys_addr &= PHYSICAL_PAGE_MASK;
119 size = PAGE_ALIGN(last_addr+1) - phys_addr; 117 size = PAGE_ALIGN(last_addr+1) - phys_addr;
120 118
121 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 119 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
@@ -613,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
613 return; 611 return;
614 } 612 }
615 offset = virt_addr & ~PAGE_MASK; 613 offset = virt_addr & ~PAGE_MASK;
616 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 614 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
617 615
618 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; 616 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
619 while (nrpages > 0) { 617 while (nrpages > 0) {
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 5d0e67fff1a6..e5d5e2ce9f77 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -45,6 +45,8 @@ struct kmmio_fault_page {
45 * Protected by kmmio_lock, when linked into kmmio_page_table. 45 * Protected by kmmio_lock, when linked into kmmio_page_table.
46 */ 46 */
47 int count; 47 int count;
48
49 bool scheduled_for_release;
48}; 50};
49 51
50struct kmmio_delayed_release { 52struct kmmio_delayed_release {
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page,
398 BUG_ON(f->count < 0); 400 BUG_ON(f->count < 0);
399 if (!f->count) { 401 if (!f->count) {
400 disarm_kmmio_fault_page(f); 402 disarm_kmmio_fault_page(f);
401 f->release_next = *release_list; 403 if (!f->scheduled_for_release) {
402 *release_list = f; 404 f->release_next = *release_list;
405 *release_list = f;
406 f->scheduled_for_release = true;
407 }
403 } 408 }
404} 409}
405 410
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
471 prevp = &f->release_next; 476 prevp = &f->release_next;
472 } else { 477 } else {
473 *prevp = f->release_next; 478 *prevp = f->release_next;
479 f->release_next = NULL;
480 f->scheduled_for_release = false;
474 } 481 }
475 f = f->release_next; 482 f = *prevp;
476 } 483 }
477 spin_unlock_irqrestore(&kmmio_lock, flags); 484 spin_unlock_irqrestore(&kmmio_lock, flags);
478 485
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
510 kmmio_count--; 517 kmmio_count--;
511 spin_unlock_irqrestore(&kmmio_lock, flags); 518 spin_unlock_irqrestore(&kmmio_lock, flags);
512 519
520 if (!release_list)
521 return;
522
513 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 523 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
514 if (!drelease) { 524 if (!drelease) {
515 pr_crit("leaking kmmio_fault_page objects.\n"); 525 pr_crit("leaking kmmio_fault_page objects.\n");
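
The kmmio.c hunks above fix a double-free in the deferred release path by tagging a fault page the first time it is queued, so a second release_kmmio_fault_page() call cannot link it onto the list again. A minimal single-threaded sketch of just that guard (simplified names; the real code also handles locking, the RCU callback, and the re-arming seen in remove_kmmio_fault_pages()):

/* Queue a node on a singly linked release list at most once, tracked by
 * a scheduled_for_release flag, mirroring the fix above. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fault_page {
	int count;                        /* still armed while non-zero */
	bool scheduled_for_release;
	struct fault_page *release_next;
};

static void schedule_release(struct fault_page *f, struct fault_page **release_list)
{
	if (f->count)
		return;                   /* still in use somewhere */
	if (f->scheduled_for_release)
		return;                   /* already queued: this is the fix */
	f->release_next = *release_list;
	*release_list = f;
	f->scheduled_for_release = true;
}

int main(void)
{
	struct fault_page page = { .count = 0 };
	struct fault_page *list = NULL;

	schedule_release(&page, &list);
	schedule_release(&page, &list);   /* must not re-link the same page */

	printf("queued once: %s\n",
	       (list == &page && page.release_next == NULL) ? "yes" : "no");
	return 0;
}
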
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 64121a18b8cb..f6ff57b7efa5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
158 return req_type; 158 return req_type;
159} 159}
160 160
161static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 161static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
162{ 162{
163 int ram_page = 0, not_rampage = 0; 163 int ram_page = 0, not_rampage = 0;
164 unsigned long page_nr; 164 unsigned long page_nr;
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 308e32570d84..38e6d174c497 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = {
40static unsigned int reg_rop[] = { 40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42}; 42};
43static unsigned int reg_wop[] = { 0x88, 0x89 }; 43static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 }; 44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/ 45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; 46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
47static unsigned int rw32[] = { 47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
49}; 49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; 50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F }; 51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; 52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
53static unsigned int mw64[] = {}; 53static unsigned int mw64[] = {};
54#else /* not __i386__ */ 54#else /* not __i386__ */
55static unsigned char prefix_codes[] = { 55static unsigned char prefix_codes[] = {
@@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = {
63static unsigned int reg_rop[] = { 63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65}; 65};
66static unsigned int reg_wop[] = { 0x88, 0x89 }; 66static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 }; 67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; 68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
69static unsigned int rw32[] = { 69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
71}; 71};
72/* 8 bit only */ 72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; 73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
74/* 16 bit only */ 74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F }; 75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */ 76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 }; 77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */ 78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82struct prefix_bits { 82struct prefix_bits {
@@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
410unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) 410unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
411{ 411{
412 unsigned int opcode; 412 unsigned int opcode;
413 unsigned char mod_rm;
414 int reg; 413 int reg;
415 unsigned char *p; 414 unsigned char *p;
416 struct prefix_bits prf; 415 struct prefix_bits prf;
@@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
437 goto err; 436 goto err;
438 437
439do_work: 438do_work:
440 mod_rm = *p; 439 /* for STOS, source register is fixed */
441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); 440 if (opcode == 0xAA || opcode == 0xAB) {
441 reg = arg_AX;
442 } else {
443 unsigned char mod_rm = *p;
444 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
445 }
442 switch (get_ins_reg_width(ins_addr)) { 446 switch (get_ins_reg_width(ins_addr)) {
443 case 1: 447 case 1:
444 return *get_reg_w8(reg, prf.rex, regs); 448 return *get_reg_w8(reg, prf.rex, regs);
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 8565d944f7cf..38868adf07ea 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -90,6 +90,27 @@ static void do_test(unsigned long size)
90 iounmap(p); 90 iounmap(p);
91} 91}
92 92
93/*
 94 * Tests how mmiotrace behaves in the face of multiple ioremap / iounmaps in
 95 * a short time. We had a bug in the deferred freeing procedure which tried

96 * to free this region multiple times (ioremap can reuse the same address
97 * for many mappings).
98 */
99static void do_test_bulk_ioremapping(void)
100{
101 void __iomem *p;
102 int i;
103
104 for (i = 0; i < 10; ++i) {
105 p = ioremap_nocache(mmio_address, PAGE_SIZE);
106 if (p)
107 iounmap(p);
108 }
109
110 /* Force freeing. If it will crash we will know why. */
111 synchronize_rcu();
112}
113
93static int __init init(void) 114static int __init init(void)
94{ 115{
95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 116 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
@@ -104,6 +125,7 @@ static int __init init(void)
104 "and writing 16 kB of rubbish in there.\n", 125 "and writing 16 kB of rubbish in there.\n",
105 size >> 10, mmio_address); 126 size >> 10, mmio_address);
106 do_test(size); 127 do_test(size);
128 do_test_bulk_ioremapping();
107 pr_info("All done.\n"); 129 pr_info("All done.\n");
108 return 0; 130 return 0;
109} 131}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d3..c03f14ab6667 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
278 278
279static void do_flush_tlb_all(void *info) 279static void do_flush_tlb_all(void *info)
280{ 280{
281 unsigned long cpu = smp_processor_id();
282
283 __flush_tlb_all(); 281 __flush_tlb_all();
284 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 282 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
285 leave_mm(cpu); 283 leave_mm(smp_processor_id());
286} 284}
287 285
288void flush_tlb_all(void) 286void flush_tlb_all(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b28d2f1253bb..f6b48f6c5951 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type)
634 if (force_arch_perfmon && cpu_has_arch_perfmon) 634 if (force_arch_perfmon && cpu_has_arch_perfmon)
635 return 0; 635 return 0;
636 636
637 /*
638 * Documentation on identifying Intel processors by CPU family
639 * and model can be found in the Intel Software Developer's
640 * Manuals (SDM):
641 *
642 * http://www.intel.com/products/processor/manuals/
643 *
644 * As of May 2010 the documentation for this was in the:
645 * "Intel 64 and IA-32 Architectures Software Developer's
646 * Manual Volume 3B: System Programming Guide", "Table B-1
647 * CPUID Signature Values of DisplayFamily_DisplayModel".
648 */
637 switch (cpu_model) { 649 switch (cpu_model) {
638 case 0 ... 2: 650 case 0 ... 2:
639 *cpu_type = "i386/ppro"; 651 *cpu_type = "i386/ppro";
@@ -655,12 +667,13 @@ static int __init ppro_init(char **cpu_type)
655 case 15: case 23: 667 case 15: case 23:
656 *cpu_type = "i386/core_2"; 668 *cpu_type = "i386/core_2";
657 break; 669 break;
670 case 0x1a:
671 case 0x1e:
658 case 0x2e: 672 case 0x2e:
659 case 26:
660 spec = &op_arch_perfmon_spec; 673 spec = &op_arch_perfmon_spec;
661 *cpu_type = "i386/core_i7"; 674 *cpu_type = "i386/core_i7";
662 break; 675 break;
663 case 28: 676 case 0x1c:
664 *cpu_type = "i386/atom"; 677 *cpu_type = "i386/atom";
665 break; 678 break;
666 default: 679 default:
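
The comment added above points at the SDM table of DisplayFamily_DisplayModel signatures that the switch matches (the Core i7 and Atom model numbers now written in hex). Those values can be recovered from CPUID leaf 1 with the usual extended-family/extended-model folding; a minimal userspace sketch using the compiler's <cpuid.h> (gcc or clang):

/* Print the DisplayFamily and DisplayModel of the CPU this runs on. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int family, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;
	model  = (eax >> 4) & 0xf;

	if (family == 0xf)
		family += (eax >> 20) & 0xff;            /* extended family */
	if (family == 0x6 || family == 0xf)
		model |= ((eax >> 16) & 0xf) << 4;       /* extended model  */

	printf("family 0x%x model 0x%x\n", family, model);
	return 0;
}
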
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2ec04c424a62..15466c096ba5 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), 34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
35 }, 35 },
36 }, 36 },
37 /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
38 /* 2006 AMD HT/VIA system with two host bridges */
39 {
40 .callback = set_use_crs,
41 .ident = "ASRock ALiveSATA2-GLAN",
42 .matches = {
43 DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
44 },
45 },
37 {} 46 {}
38}; 47};
39 48
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 215a27ae050d..a0772af64efb 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void)
125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) 125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
126{ 126{
127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; 127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
128 struct resource *bar_r;
129 int bar;
130
131 if (pci_probe & PCI_NOASSIGN_BARS) {
132 /*
133 * If the BIOS did not assign the BAR, zero out the
 134 * resource so the kernel doesn't attempt to assign
135 * it later on in pci_assign_unassigned_resources
136 */
137 for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
138 bar_r = &dev->resource[bar];
139 if (bar_r->start == 0 && bar_r->end != 0) {
140 bar_r->flags = 0;
141 bar_r->end = 0;
142 }
143 }
144 }
128 145
129 if (pci_probe & PCI_NOASSIGN_ROMS) { 146 if (pci_probe & PCI_NOASSIGN_ROMS) {
130 if (rom_r->parent) 147 if (rom_r->parent)
@@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str)
509 } else if (!strcmp(str, "norom")) { 526 } else if (!strcmp(str, "norom")) {
510 pci_probe |= PCI_NOASSIGN_ROMS; 527 pci_probe |= PCI_NOASSIGN_ROMS;
511 return NULL; 528 return NULL;
529 } else if (!strcmp(str, "nobar")) {
530 pci_probe |= PCI_NOASSIGN_BARS;
531 return NULL;
512 } else if (!strcmp(str, "assign-busses")) { 532 } else if (!strcmp(str, "assign-busses")) {
513 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 533 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
514 return NULL; 534 return NULL;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9810a0f76c91..f547ee05f715 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); 989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
990 990
991 /* Update IRQ for all devices with the same pirq value */ 991 /* Update IRQ for all devices with the same pirq value */
992 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 992 for_each_pci_dev(dev2) {
993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); 993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
994 if (!pin) 994 if (!pin)
995 continue; 995 continue;
@@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void)
1028 u8 pin; 1028 u8 pin;
1029 1029
1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1031 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1031 for_each_pci_dev(dev) {
1032 /* 1032 /*
1033 * If the BIOS has set an out of range IRQ number, just 1033 * If the BIOS has set an out of range IRQ number, just
1034 * ignore it. Also keep track of which IRQ's are 1034 * ignore it. Also keep track of which IRQ's are
@@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void)
1052 return; 1052 return;
1053 1053
1054 dev = NULL; 1054 dev = NULL;
1055 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1055 for_each_pci_dev(dev) {
1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1057 if (!pin) 1057 if (!pin)
1058 continue; 1058 continue;
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 8d460eaf524f..c89266be6048 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void)
36 return 0; 36 return 0;
37} 37}
38 38
39void pcibios_scan_specific_bus(int busn) 39void __devinit pcibios_scan_specific_bus(int busn)
40{ 40{
41 int devfn; 41 int devfn;
42 long node; 42 long node;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b350..e7e8c5f54956 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e5..460f314d13e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 6b4ffedb93c9..4a2afa1bac51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
120quiet_cmd_vdso = VDSO $@ 120quiet_cmd_vdso = VDSO $@
121 cmd_vdso = $(CC) -nostdlib -o $@ \ 121 cmd_vdso = $(CC) -nostdlib -o $@ \
122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ 122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
124 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
124 125
125VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) 126VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n 127GCOV_PROFILE := n
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh
new file mode 100755
index 000000000000..7ee90a9b549d
--- /dev/null
+++ b/arch/x86/vdso/checkundef.sh
@@ -0,0 +1,10 @@
1#!/bin/sh
2nm="$1"
3file="$2"
4$nm "$file" | grep '^ *U' > /dev/null 2>&1
5if [ $? -eq 1 ]; then
6 exit 0
7else
8 echo "$file: undefined symbols found" >&2
9 exit 1
10fi
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e92007..36df991985b2 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
374 374
375#ifdef CONFIG_X86_64 375#ifdef CONFIG_X86_64
376 376
377__initcall(sysenter_setup); 377subsys_initcall(sysenter_setup);
378 378
379#ifdef CONFIG_SYSCTL 379#ifdef CONFIG_SYSCTL
380/* Register vsyscall32 into the ABI table */ 380/* Register vsyscall32 into the ABI table */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ac74869b8140..4b5d26f108bb 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void)
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; 67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h" 68#include "vextern.h"
69#undef VEXTERN 69#undef VEXTERN
70 vunmap(vbase);
70 return 0; 71 return 0;
71 72
72 oom: 73 oom:
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void)
74 vdso_enabled = 0; 75 vdso_enabled = 0;
75 return -ENOMEM; 76 return -ENOMEM;
76} 77}
77__initcall(init_vdso_vars); 78subsys_initcall(init_vdso_vars);
78 79
79struct linux_binprm; 80struct linux_binprm;
80 81
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb0..68128a1b401a 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,6 +13,11 @@ config XEN
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15 15
16config XEN_PVHVM
17 def_bool y
18 depends on XEN
19 depends on X86_LOCAL_APIC
20
16config XEN_MAX_DOMAIN_MEMORY 21config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 22 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32 23 default 8 if X86_32
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc21f4f2..779385158915 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o 15 grant-table.o suspend.o platform-pci-unplug.o
16 16
17obj-$(CONFIG_SMP) += smp.o 17obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
19obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 19obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
20 20
21obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a8..7d46c8441418 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */ 12 */
13 13
14#include <linux/cpu.h>
14#include <linux/kernel.h> 15#include <linux/kernel.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/smp.h> 17#include <linux/smp.h>
@@ -35,8 +36,10 @@
35#include <xen/interface/version.h> 36#include <xen/interface/version.h>
36#include <xen/interface/physdev.h> 37#include <xen/interface/physdev.h>
37#include <xen/interface/vcpu.h> 38#include <xen/interface/vcpu.h>
39#include <xen/interface/memory.h>
38#include <xen/features.h> 40#include <xen/features.h>
39#include <xen/page.h> 41#include <xen/page.h>
42#include <xen/hvm.h>
40#include <xen/hvc-console.h> 43#include <xen/hvc-console.h>
41 44
42#include <asm/paravirt.h> 45#include <asm/paravirt.h>
@@ -55,7 +58,9 @@
55#include <asm/pgtable.h> 58#include <asm/pgtable.h>
56#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
57#include <asm/reboot.h> 60#include <asm/reboot.h>
61#include <asm/setup.h>
58#include <asm/stackprotector.h> 62#include <asm/stackprotector.h>
63#include <asm/hypervisor.h>
59 64
60#include "xen-ops.h" 65#include "xen-ops.h"
61#include "mmu.h" 66#include "mmu.h"
@@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info;
76 81
77void *xen_initial_gdt; 82void *xen_initial_gdt;
78 83
84RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
85__read_mostly int xen_have_vector_callback;
86EXPORT_SYMBOL_GPL(xen_have_vector_callback);
87
79/* 88/*
80 * Point at some empty memory to start with. We map the real shared_info 89 * Point at some empty memory to start with. We map the real shared_info
81 * page as soon as fixmap is up and running. 90 * page as soon as fixmap is up and running.
@@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
97 */ 106 */
98static int have_vcpu_info_placement = 1; 107static int have_vcpu_info_placement = 1;
99 108
109static void clamp_max_cpus(void)
110{
111#ifdef CONFIG_SMP
112 if (setup_max_cpus > MAX_VIRT_CPUS)
113 setup_max_cpus = MAX_VIRT_CPUS;
114#endif
115}
116
100static void xen_vcpu_setup(int cpu) 117static void xen_vcpu_setup(int cpu)
101{ 118{
102 struct vcpu_register_vcpu_info info; 119 struct vcpu_register_vcpu_info info;
@@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu)
104 struct vcpu_info *vcpup; 121 struct vcpu_info *vcpup;
105 122
106 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 123 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
107 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
108 124
109 if (!have_vcpu_info_placement) 125 if (cpu < MAX_VIRT_CPUS)
110 return; /* already tested, not available */ 126 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
111 127
112 vcpup = &per_cpu(xen_vcpu_info, cpu); 128 if (!have_vcpu_info_placement) {
129 if (cpu >= MAX_VIRT_CPUS)
130 clamp_max_cpus();
131 return;
132 }
113 133
134 vcpup = &per_cpu(xen_vcpu_info, cpu);
114 info.mfn = arbitrary_virt_to_mfn(vcpup); 135 info.mfn = arbitrary_virt_to_mfn(vcpup);
115 info.offset = offset_in_page(vcpup); 136 info.offset = offset_in_page(vcpup);
116 137
@@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu)
125 if (err) { 146 if (err) {
126 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); 147 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
127 have_vcpu_info_placement = 0; 148 have_vcpu_info_placement = 0;
149 clamp_max_cpus();
128 } else { 150 } else {
129 /* This cpu is using the registered vcpu info, even if 151 /* This cpu is using the registered vcpu info, even if
130 later ones fail to. */ 152 later ones fail to. */
@@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void)
731 753
732#endif 754#endif
733 755
734
735static void xen_clts(void) 756static void xen_clts(void)
736{ 757{
737 struct multicall_space mcs; 758 struct multicall_space mcs;
@@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
926 .patch = xen_patch, 947 .patch = xen_patch,
927}; 948};
928 949
929static const struct pv_time_ops xen_time_ops __initdata = {
930 .sched_clock = xen_sched_clock,
931};
932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = { 950static const struct pv_cpu_ops xen_cpu_ops __initdata = {
934 .cpuid = xen_cpuid, 951 .cpuid = xen_cpuid,
935 952
@@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs)
1028 xen_reboot(SHUTDOWN_crash); 1045 xen_reboot(SHUTDOWN_crash);
1029} 1046}
1030 1047
1048static int
1049xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1050{
1051 xen_reboot(SHUTDOWN_crash);
1052 return NOTIFY_DONE;
1053}
1054
1055static struct notifier_block xen_panic_block = {
1056 .notifier_call= xen_panic_event,
1057};
1058
1059int xen_panic_handler_init(void)
1060{
1061 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
1062 return 0;
1063}
1064
1031static const struct machine_ops __initdata xen_machine_ops = { 1065static const struct machine_ops __initdata xen_machine_ops = {
1032 .restart = xen_restart, 1066 .restart = xen_restart,
1033 .halt = xen_machine_halt, 1067 .halt = xen_machine_halt,
@@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void)
1067 /* Install Xen paravirt ops */ 1101 /* Install Xen paravirt ops */
1068 pv_info = xen_info; 1102 pv_info = xen_info;
1069 pv_init_ops = xen_init_ops; 1103 pv_init_ops = xen_init_ops;
1070 pv_time_ops = xen_time_ops;
1071 pv_cpu_ops = xen_cpu_ops; 1104 pv_cpu_ops = xen_cpu_ops;
1072 pv_apic_ops = xen_apic_ops; 1105 pv_apic_ops = xen_apic_ops;
1073 1106
@@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void)
1075 x86_init.oem.arch_setup = xen_arch_setup; 1108 x86_init.oem.arch_setup = xen_arch_setup;
1076 x86_init.oem.banner = xen_banner; 1109 x86_init.oem.banner = xen_banner;
1077 1110
1078 x86_init.timers.timer_init = xen_time_init; 1111 xen_init_time_ops();
1079 x86_init.timers.setup_percpu_clockev = x86_init_noop;
1080 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
1081
1082 x86_platform.calibrate_tsc = xen_tsc_khz;
1083 x86_platform.get_wallclock = xen_get_wallclock;
1084 x86_platform.set_wallclock = xen_set_wallclock;
1085 1112
1086 /* 1113 /*
1087 * Set up some pagetable state before starting to set any ptes. 1114 * Set up some pagetable state before starting to set any ptes.
@@ -1145,6 +1172,10 @@ asmlinkage void __init xen_start_kernel(void)
1145 1172
1146 pgd = (pgd_t *)xen_start_info->pt_base; 1173 pgd = (pgd_t *)xen_start_info->pt_base;
1147 1174
1175 if (!xen_initial_domain())
1176 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1177
1178 __supported_pte_mask |= _PAGE_IOMAP;
1148 /* Don't do the full vcpu_info placement stuff until we have a 1179 /* Don't do the full vcpu_info placement stuff until we have a
1149 possible map and a non-dummy shared_info. */ 1180 possible map and a non-dummy shared_info. */
1150 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1181 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1206,3 +1237,139 @@ asmlinkage void __init xen_start_kernel(void)
1206 x86_64_start_reservations((char *)__pa_symbol(&boot_params)); 1237 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1207#endif 1238#endif
1208} 1239}
1240
1241static uint32_t xen_cpuid_base(void)
1242{
1243 uint32_t base, eax, ebx, ecx, edx;
1244 char signature[13];
1245
1246 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1247 cpuid(base, &eax, &ebx, &ecx, &edx);
1248 *(uint32_t *)(signature + 0) = ebx;
1249 *(uint32_t *)(signature + 4) = ecx;
1250 *(uint32_t *)(signature + 8) = edx;
1251 signature[12] = 0;
1252
1253 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1254 return base;
1255 }
1256
1257 return 0;
1258}
1259
1260static int init_hvm_pv_info(int *major, int *minor)
1261{
1262 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1263 u64 pfn;
1264
1265 base = xen_cpuid_base();
1266 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1267
1268 *major = eax >> 16;
1269 *minor = eax & 0xffff;
1270 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
1271
1272 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1273
1274 pfn = __pa(hypercall_page);
1275 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1276
1277 xen_setup_features();
1278
1279 pv_info = xen_info;
1280 pv_info.kernel_rpl = 0;
1281
1282 xen_domain_type = XEN_HVM_DOMAIN;
1283
1284 return 0;
1285}
1286
1287void xen_hvm_init_shared_info(void)
1288{
1289 int cpu;
1290 struct xen_add_to_physmap xatp;
1291 static struct shared_info *shared_info_page = 0;
1292
1293 if (!shared_info_page)
1294 shared_info_page = (struct shared_info *)
1295 extend_brk(PAGE_SIZE, PAGE_SIZE);
1296 xatp.domid = DOMID_SELF;
1297 xatp.idx = 0;
1298 xatp.space = XENMAPSPACE_shared_info;
1299 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1300 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1301 BUG();
1302
1303 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1304
1305 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1306 * page, we use it in the event channel upcall and in some pvclock
1307 * related functions. We don't need the vcpu_info placement
1308 * optimizations because we don't use any pv_mmu or pv_irq op on
1309 * HVM.
1310 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1311 * online but xen_hvm_init_shared_info is run at resume time too and
1312 * in that case multiple vcpus might be online. */
1313 for_each_online_cpu(cpu) {
1314 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1315 }
1316}
1317
1318#ifdef CONFIG_XEN_PVHVM
1319static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1320 unsigned long action, void *hcpu)
1321{
1322 int cpu = (long)hcpu;
1323 switch (action) {
1324 case CPU_UP_PREPARE:
1325 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1326 break;
1327 default:
1328 break;
1329 }
1330 return NOTIFY_OK;
1331}
1332
1333static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
1334 .notifier_call = xen_hvm_cpu_notify,
1335};
1336
1337static void __init xen_hvm_guest_init(void)
1338{
1339 int r;
1340 int major, minor;
1341
1342 r = init_hvm_pv_info(&major, &minor);
1343 if (r < 0)
1344 return;
1345
1346 xen_hvm_init_shared_info();
1347
1348 if (xen_feature(XENFEAT_hvm_callback_vector))
1349 xen_have_vector_callback = 1;
1350 register_cpu_notifier(&xen_hvm_cpu_notifier);
1351 xen_unplug_emulated_devices();
1352 have_vcpu_info_placement = 0;
1353 x86_init.irqs.intr_init = xen_init_IRQ;
1354 xen_hvm_init_time_ops();
1355 xen_hvm_init_mmu_ops();
1356}
1357
1358static bool __init xen_hvm_platform(void)
1359{
1360 if (xen_pv_domain())
1361 return false;
1362
1363 if (!xen_cpuid_base())
1364 return false;
1365
1366 return true;
1367}
1368
1369const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
1370 .name = "Xen HVM",
1371 .detect = xen_hvm_platform,
1372 .init_platform = xen_hvm_guest_init,
1373};
1374EXPORT_SYMBOL(x86_hyper_xen_hvm);
1375#endif
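
xen_cpuid_base() above probes the hypervisor CPUID range 0x40000000-0x4000ffff in steps of 0x100 for the "XenVMMXenVMM" signature and requires at least two further leaves. The same probe works from userspace as a quick check (minimal sketch using the compiler's <cpuid.h>; it simply reports nothing on bare metal or under a different hypervisor):

/* Scan the hypervisor CPUID leaves for the Xen signature. */
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	for (uint32_t base = 0x40000000; base < 0x40010000; base += 0x100) {
		uint32_t eax, ebx, ecx, edx;
		char sig[13];

		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';

		if (!strcmp(sig, "XenVMMXenVMM") && eax - base >= 2) {
			printf("Xen CPUID leaves at 0x%x\n", base);
			return 0;
		}
	}
	printf("no Xen signature found\n");
	return 0;
}
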
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 914f04695ce5..42086ac406af 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/bug.h> 44#include <linux/bug.h>
45#include <linux/vmalloc.h>
45#include <linux/module.h> 46#include <linux/module.h>
46#include <linux/gfp.h> 47#include <linux/gfp.h>
47 48
@@ -51,14 +52,19 @@
51#include <asm/mmu_context.h> 52#include <asm/mmu_context.h>
52#include <asm/setup.h> 53#include <asm/setup.h>
53#include <asm/paravirt.h> 54#include <asm/paravirt.h>
55#include <asm/e820.h>
54#include <asm/linkage.h> 56#include <asm/linkage.h>
57#include <asm/page.h>
55 58
56#include <asm/xen/hypercall.h> 59#include <asm/xen/hypercall.h>
57#include <asm/xen/hypervisor.h> 60#include <asm/xen/hypervisor.h>
58 61
62#include <xen/xen.h>
59#include <xen/page.h> 63#include <xen/page.h>
60#include <xen/interface/xen.h> 64#include <xen/interface/xen.h>
65#include <xen/interface/hvm/hvm_op.h>
61#include <xen/interface/version.h> 66#include <xen/interface/version.h>
67#include <xen/interface/memory.h>
62#include <xen/hvc-console.h> 68#include <xen/hvc-console.h>
63 69
64#include "multicalls.h" 70#include "multicalls.h"
@@ -67,6 +73,13 @@
67 73
68#define MMU_UPDATE_HISTO 30 74#define MMU_UPDATE_HISTO 30
69 75
76/*
77 * Protects atomic reservation decrease/increase against concurrent increases.
78 * Also protects non-atomic updates of current_pages and driver_pages, and
79 * balloon lists.
80 */
81DEFINE_SPINLOCK(xen_reservation_lock);
82
70#ifdef CONFIG_XEN_DEBUG_FS 83#ifdef CONFIG_XEN_DEBUG_FS
71 84
72static struct { 85static struct {
@@ -377,6 +390,28 @@ static bool xen_page_pinned(void *ptr)
377 return PagePinned(page); 390 return PagePinned(page);
378} 391}
379 392
393static bool xen_iomap_pte(pte_t pte)
394{
395 return pte_flags(pte) & _PAGE_IOMAP;
396}
397
398static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
399{
400 struct multicall_space mcs;
401 struct mmu_update *u;
402
403 mcs = xen_mc_entry(sizeof(*u));
404 u = mcs.args;
405
406 /* ptep might be kmapped when using 32-bit HIGHPTE */
407 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
408 u->val = pte_val_ma(pteval);
409
410 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
411
412 xen_mc_issue(PARAVIRT_LAZY_MMU);
413}
414
380static void xen_extend_mmu_update(const struct mmu_update *update) 415static void xen_extend_mmu_update(const struct mmu_update *update)
381{ 416{
382 struct multicall_space mcs; 417 struct multicall_space mcs;
@@ -453,6 +488,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
453void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 488void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
454 pte_t *ptep, pte_t pteval) 489 pte_t *ptep, pte_t pteval)
455{ 490{
491 if (xen_iomap_pte(pteval)) {
492 xen_set_iomap_pte(ptep, pteval);
493 goto out;
494 }
495
456 ADD_STATS(set_pte_at, 1); 496 ADD_STATS(set_pte_at, 1);
457// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 497// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
458 ADD_STATS(set_pte_at_current, mm == current->mm); 498 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -523,8 +563,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
523 return val; 563 return val;
524} 564}
525 565
566static pteval_t iomap_pte(pteval_t val)
567{
568 if (val & _PAGE_PRESENT) {
569 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
570 pteval_t flags = val & PTE_FLAGS_MASK;
571
 572 /* We assume the pte frame number is an MFN, so
 573 just use it as-is. */
574 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
575 }
576
577 return val;
578}
579
526pteval_t xen_pte_val(pte_t pte) 580pteval_t xen_pte_val(pte_t pte)
527{ 581{
582 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
583 return pte.pte;
584
528 return pte_mfn_to_pfn(pte.pte); 585 return pte_mfn_to_pfn(pte.pte);
529} 586}
530PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 587PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -537,7 +594,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
537 594
538pte_t xen_make_pte(pteval_t pte) 595pte_t xen_make_pte(pteval_t pte)
539{ 596{
540 pte = pte_pfn_to_mfn(pte); 597 phys_addr_t addr = (pte & PTE_PFN_MASK);
598
599 /*
 600 * Unprivileged domains are allowed to do IOMAP mappings for
 601 * PCI passthrough, but not to map ISA space. The ISA
602 * mappings are just dummy local mappings to keep other
603 * parts of the kernel happy.
604 */
605 if (unlikely(pte & _PAGE_IOMAP) &&
606 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
607 pte = iomap_pte(pte);
608 } else {
609 pte &= ~_PAGE_IOMAP;
610 pte = pte_pfn_to_mfn(pte);
611 }
612
541 return native_make_pte(pte); 613 return native_make_pte(pte);
542} 614}
543PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 615PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
@@ -593,6 +665,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
593 665
594void xen_set_pte(pte_t *ptep, pte_t pte) 666void xen_set_pte(pte_t *ptep, pte_t pte)
595{ 667{
668 if (xen_iomap_pte(pte)) {
669 xen_set_iomap_pte(ptep, pte);
670 return;
671 }
672
596 ADD_STATS(pte_update, 1); 673 ADD_STATS(pte_update, 1);
597// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); 674// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
598 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); 675 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
@@ -609,6 +686,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
609#ifdef CONFIG_X86_PAE 686#ifdef CONFIG_X86_PAE
610void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 687void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
611{ 688{
689 if (xen_iomap_pte(pte)) {
690 xen_set_iomap_pte(ptep, pte);
691 return;
692 }
693
612 set_64bit((u64 *)ptep, native_pte_val(pte)); 694 set_64bit((u64 *)ptep, native_pte_val(pte));
613} 695}
614 696
@@ -935,8 +1017,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
935 read-only, and can be pinned. */ 1017 read-only, and can be pinned. */
936static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 1018static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
937{ 1019{
938 vm_unmap_aliases();
939
940 xen_mc_batch(); 1020 xen_mc_batch();
941 1021
942 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 1022 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1500,7 +1580,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
1500 if (PagePinned(virt_to_page(mm->pgd))) { 1580 if (PagePinned(virt_to_page(mm->pgd))) {
1501 SetPagePinned(page); 1581 SetPagePinned(page);
1502 1582
1503 vm_unmap_aliases();
1504 if (!PageHighMem(page)) { 1583 if (!PageHighMem(page)) {
1505 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); 1584 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1506 if (level == PT_PTE && USE_SPLIT_PTLOCKS) 1585 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1811,9 +1890,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1811 pte = pfn_pte(phys, prot); 1890 pte = pfn_pte(phys, prot);
1812 break; 1891 break;
1813 1892
1814 default: 1893 case FIX_PARAVIRT_BOOTMAP:
1894 /* This is an MFN, but it isn't an IO mapping from the
1895 IO domain */
1815 pte = mfn_pte(phys, prot); 1896 pte = mfn_pte(phys, prot);
1816 break; 1897 break;
1898
1899 default:
1900 /* By default, set_fixmap is used for hardware mappings */
1901 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1902 break;
1817 } 1903 }
1818 1904
1819 __native_set_fixmap(idx, pte); 1905 __native_set_fixmap(idx, pte);
@@ -1939,8 +2025,240 @@ void __init xen_init_mmu_ops(void)
1939 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2025 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
1940 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2026 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
1941 pv_mmu_ops = xen_mmu_ops; 2027 pv_mmu_ops = xen_mmu_ops;
2028
2029 vmap_lazy_unmap = false;
2030}
2031
2032/* Protected by xen_reservation_lock. */
2033#define MAX_CONTIG_ORDER 9 /* 2MB */
2034static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2035
2036#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2037static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2038 unsigned long *in_frames,
2039 unsigned long *out_frames)
2040{
2041 int i;
2042 struct multicall_space mcs;
2043
2044 xen_mc_batch();
2045 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2046 mcs = __xen_mc_entry(0);
2047
2048 if (in_frames)
2049 in_frames[i] = virt_to_mfn(vaddr);
2050
2051 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2052 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2053
2054 if (out_frames)
2055 out_frames[i] = virt_to_pfn(vaddr);
2056 }
2057 xen_mc_issue(0);
2058}
2059
2060/*
2061 * Update the pfn-to-mfn mappings for a virtual address range, either to
2062 * point to an array of mfns, or contiguously from a single starting
2063 * mfn.
2064 */
2065static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2066 unsigned long *mfns,
2067 unsigned long first_mfn)
2068{
2069 unsigned i, limit;
2070 unsigned long mfn;
2071
2072 xen_mc_batch();
2073
2074 limit = 1u << order;
2075 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2076 struct multicall_space mcs;
2077 unsigned flags;
2078
2079 mcs = __xen_mc_entry(0);
2080 if (mfns)
2081 mfn = mfns[i];
2082 else
2083 mfn = first_mfn + i;
2084
2085 if (i < (limit - 1))
2086 flags = 0;
2087 else {
2088 if (order == 0)
2089 flags = UVMF_INVLPG | UVMF_ALL;
2090 else
2091 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2092 }
2093
2094 MULTI_update_va_mapping(mcs.mc, vaddr,
2095 mfn_pte(mfn, PAGE_KERNEL), flags);
2096
2097 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2098 }
2099
2100 xen_mc_issue(0);
2101}
2102
2103/*
2104 * Perform the hypercall to exchange a region of our pfns to point to
2105 * memory with the required contiguous alignment. Takes the pfns as
2106 * input, and populates mfns as output.
2107 *
2108 * Returns a success code indicating whether the hypervisor was able to
2109 * satisfy the request or not.
2110 */
2111static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2112 unsigned long *pfns_in,
2113 unsigned long extents_out,
2114 unsigned int order_out,
2115 unsigned long *mfns_out,
2116 unsigned int address_bits)
2117{
2118 long rc;
2119 int success;
2120
2121 struct xen_memory_exchange exchange = {
2122 .in = {
2123 .nr_extents = extents_in,
2124 .extent_order = order_in,
2125 .extent_start = pfns_in,
2126 .domid = DOMID_SELF
2127 },
2128 .out = {
2129 .nr_extents = extents_out,
2130 .extent_order = order_out,
2131 .extent_start = mfns_out,
2132 .address_bits = address_bits,
2133 .domid = DOMID_SELF
2134 }
2135 };
2136
2137 BUG_ON(extents_in << order_in != extents_out << order_out);
2138
2139 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2140 success = (exchange.nr_exchanged == extents_in);
2141
2142 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2143 BUG_ON(success && (rc != 0));
2144
2145 return success;
1942} 2146}
1943 2147
2148int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2149 unsigned int address_bits)
2150{
2151 unsigned long *in_frames = discontig_frames, out_frame;
2152 unsigned long flags;
2153 int success;
2154
2155 /*
2156 * Currently an auto-translated guest will not perform I/O, nor will
2157 * it require PAE page directories below 4GB. Therefore any calls to
2158 * this function are redundant and can be ignored.
2159 */
2160
2161 if (xen_feature(XENFEAT_auto_translated_physmap))
2162 return 0;
2163
2164 if (unlikely(order > MAX_CONTIG_ORDER))
2165 return -ENOMEM;
2166
2167 memset((void *) vstart, 0, PAGE_SIZE << order);
2168
2169 spin_lock_irqsave(&xen_reservation_lock, flags);
2170
2171 /* 1. Zap current PTEs, remembering MFNs. */
2172 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2173
2174 /* 2. Get a new contiguous memory extent. */
2175 out_frame = virt_to_pfn(vstart);
2176 success = xen_exchange_memory(1UL << order, 0, in_frames,
2177 1, order, &out_frame,
2178 address_bits);
2179
2180 /* 3. Map the new extent in place of old pages. */
2181 if (success)
2182 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2183 else
2184 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2185
2186 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2187
2188 return success ? 0 : -ENOMEM;
2189}
2190EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2191
2192void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2193{
2194 unsigned long *out_frames = discontig_frames, in_frame;
2195 unsigned long flags;
2196 int success;
2197
2198 if (xen_feature(XENFEAT_auto_translated_physmap))
2199 return;
2200
2201 if (unlikely(order > MAX_CONTIG_ORDER))
2202 return;
2203
2204 memset((void *) vstart, 0, PAGE_SIZE << order);
2205
2206 spin_lock_irqsave(&xen_reservation_lock, flags);
2207
2208 /* 1. Find start MFN of contiguous extent. */
2209 in_frame = virt_to_mfn(vstart);
2210
2211 /* 2. Zap current PTEs. */
2212 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2213
2214 /* 3. Do the exchange for non-contiguous MFNs. */
2215 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2216 0, out_frames, 0);
2217
2218 /* 4. Map new pages in place of old pages. */
2219 if (success)
2220 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2221 else
2222 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2223
2224 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2225}
2226EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2227
2228#ifdef CONFIG_XEN_PVHVM
2229static void xen_hvm_exit_mmap(struct mm_struct *mm)
2230{
2231 struct xen_hvm_pagetable_dying a;
2232 int rc;
2233
2234 a.domid = DOMID_SELF;
2235 a.gpa = __pa(mm->pgd);
2236 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2237 WARN_ON_ONCE(rc < 0);
2238}
2239
2240static int is_pagetable_dying_supported(void)
2241{
2242 struct xen_hvm_pagetable_dying a;
2243 int rc = 0;
2244
2245 a.domid = DOMID_SELF;
2246 a.gpa = 0x00;
2247 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2248 if (rc < 0) {
2249 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2250 return 0;
2251 }
2252 return 1;
2253}
2254
2255void __init xen_hvm_init_mmu_ops(void)
2256{
2257 if (is_pagetable_dying_supported())
2258 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2259}
2260#endif
2261
1944#ifdef CONFIG_XEN_DEBUG_FS 2262#ifdef CONFIG_XEN_DEBUG_FS
1945 2263
1946static struct dentry *d_mmu_debug; 2264static struct dentry *d_mmu_debug;
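xen_create_contiguous_region() and xen_destroy_contiguous_region() above both follow a zap / exchange / remap sequence, and xen_exchange_memory() insists that both sides of the trade cover the same number of pages (the BUG_ON on extents_in << order_in). A stand-alone toy sketch of just that extent accounting, with no hypercalls:

/* Toy sketch of the extent arithmetic: 2^order single-page extents are traded
 * for one order-sized extent, and back again on destroy. */
#include <assert.h>
#include <stdio.h>

#define MAX_CONTIG_ORDER 9	/* 2MB, as in the patch */

static void check_exchange(unsigned long extents_in, unsigned int order_in,
			   unsigned long extents_out, unsigned int order_out)
{
	/* same invariant the patch asserts with BUG_ON() */
	assert(extents_in << order_in == extents_out << order_out);
	printf("%lu extent(s) of order %u <-> %lu extent(s) of order %u (%lu pages)\n",
	       extents_in, order_in, extents_out, order_out,
	       extents_in << order_in);
}

int main(void)
{
	unsigned int order;

	for (order = 0; order <= MAX_CONTIG_ORDER; order++) {
		/* create: many order-0 extents in, one order-N extent out */
		check_exchange(1UL << order, 0, 1, order);
		/* destroy: one order-N extent in, many order-0 extents out */
		check_exchange(1, order, 1UL << order, 0);
	}
	return 0;
}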
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7f5ecf..fa938c4aa2f7 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
60unsigned long xen_read_cr2_direct(void); 60unsigned long xen_read_cr2_direct(void);
61 61
62extern void xen_init_mmu_ops(void); 62extern void xen_init_mmu_ops(void);
63extern void xen_hvm_init_mmu_ops(void);
63#endif /* _XEN_MMU_H */ 64#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 000000000000..a013ec9d0c54
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,58 @@
1/* Glue code to lib/swiotlb-xen.c */
2
3#include <linux/dma-mapping.h>
4#include <xen/swiotlb-xen.h>
5
6#include <asm/xen/hypervisor.h>
7#include <xen/xen.h>
8
9int xen_swiotlb __read_mostly;
10
11static struct dma_map_ops xen_swiotlb_dma_ops = {
12 .mapping_error = xen_swiotlb_dma_mapping_error,
13 .alloc_coherent = xen_swiotlb_alloc_coherent,
14 .free_coherent = xen_swiotlb_free_coherent,
15 .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
16 .sync_single_for_device = xen_swiotlb_sync_single_for_device,
17 .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
18 .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
19 .map_sg = xen_swiotlb_map_sg_attrs,
20 .unmap_sg = xen_swiotlb_unmap_sg_attrs,
21 .map_page = xen_swiotlb_map_page,
22 .unmap_page = xen_swiotlb_unmap_page,
23 .dma_supported = xen_swiotlb_dma_supported,
24};
25
26/*
27 * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
28 *
29 * This returns non-zero if we are forced to use xen_swiotlb (by the boot
30 * option).
31 */
32int __init pci_xen_swiotlb_detect(void)
33{
34
 35 /* If running as a PV guest, either iommu=soft or swiotlb=force will
 36 * activate this IOMMU. If running as a PV privileged domain, activate
 37 * it regardless.
 38 */
39 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
40 (xen_pv_domain()))
41 xen_swiotlb = 1;
42
43 /* If we are running under Xen, we MUST disable the native SWIOTLB.
 44 * Don't worry about the swiotlb_force flag activating the native one,
 45 * as the 'swiotlb' flag is the only one that turns it on. */
46 if (xen_pv_domain())
47 swiotlb = 0;
48
49 return xen_swiotlb;
50}
51
52void __init pci_xen_swiotlb_init(void)
53{
54 if (xen_swiotlb) {
55 xen_swiotlb_init(1);
56 dma_ops = &xen_swiotlb_dma_ops;
57 }
58}
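pci_xen_swiotlb_detect() above enables the Xen swiotlb only for PV guests, and unconditionally for dom0. A small stand-alone sketch of that decision, using plain ints as hypothetical stand-ins for the kernel flags:

/* Stand-alone sketch of the pci_xen_swiotlb_detect() decision (not kernel code). */
#include <stdio.h>

struct guest {
	int pv_domain;		/* running as a Xen PV guest? */
	int initial_domain;	/* dom0? */
	int swiotlb;		/* iommu=soft given? */
	int swiotlb_force;	/* swiotlb=force given? */
};

static int xen_swiotlb_wanted(const struct guest *g)
{
	/* PV guests use the Xen swiotlb when forced or when privileged;
	 * HVM guests never do in this patch. */
	return g->pv_domain &&
	       (g->initial_domain || g->swiotlb || g->swiotlb_force);
}

int main(void)
{
	struct guest dom0 = { .pv_domain = 1, .initial_domain = 1 };
	struct guest domu = { .pv_domain = 1 };
	struct guest hvm  = { .pv_domain = 0, .swiotlb_force = 1 };

	printf("dom0: %d, plain PV domU: %d, HVM with swiotlb=force: %d\n",
	       xen_swiotlb_wanted(&dom0),
	       xen_swiotlb_wanted(&domu),
	       xen_swiotlb_wanted(&hvm));
	return 0;
}

Running it prints 1 for dom0, 0 for a plain PV domU and 0 for an HVM guest even with swiotlb=force, matching the checks in the patch.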
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 000000000000..554c002a1e1a
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,137 @@
1/******************************************************************************
2 * platform-pci-unplug.c
3 *
4 * Xen platform PCI device driver
5 * Copyright (c) 2010, Citrix
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 *
20 */
21
22#include <linux/init.h>
23#include <linux/io.h>
24#include <linux/module.h>
25
26#include <xen/platform_pci.h>
27
28#define XEN_PLATFORM_ERR_MAGIC -1
29#define XEN_PLATFORM_ERR_PROTOCOL -2
30#define XEN_PLATFORM_ERR_BLACKLIST -3
31
32/* store the value of xen_emul_unplug after the unplug is done */
33int xen_platform_pci_unplug;
34EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
35#ifdef CONFIG_XEN_PVHVM
36static int xen_emul_unplug;
37
38static int __init check_platform_magic(void)
39{
40 short magic;
41 char protocol;
42
43 magic = inw(XEN_IOPORT_MAGIC);
44 if (magic != XEN_IOPORT_MAGIC_VAL) {
45 printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
46 return XEN_PLATFORM_ERR_MAGIC;
47 }
48
49 protocol = inb(XEN_IOPORT_PROTOVER);
50
51 printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
52 protocol);
53
54 switch (protocol) {
55 case 1:
56 outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
57 outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
58 if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
59 printk(KERN_ERR "Xen Platform: blacklisted by host\n");
60 return XEN_PLATFORM_ERR_BLACKLIST;
61 }
62 break;
63 default:
 64 printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
65 return XEN_PLATFORM_ERR_PROTOCOL;
66 }
67
68 return 0;
69}
70
71void __init xen_unplug_emulated_devices(void)
72{
73 int r;
74
75 /* check the version of the xen platform PCI device */
76 r = check_platform_magic();
 77 /* If the version matches, enable the Xen platform PCI driver.
78 * Also enable the Xen platform PCI driver if the version is really old
79 * and the user told us to ignore it. */
80 if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
81 (xen_emul_unplug & XEN_UNPLUG_IGNORE)))
82 return;
83 /* Set the default value of xen_emul_unplug depending on whether or
84 * not the Xen PV frontends and the Xen platform PCI driver have
85 * been compiled for this kernel (modules or built-in are both OK). */
86 if (!xen_emul_unplug) {
87 if (xen_must_unplug_nics()) {
88 printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
89 "been compiled for this kernel: unplug emulated NICs.\n");
90 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
91 }
92 if (xen_must_unplug_disks()) {
93 printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
94 "been compiled for this kernel: unplug emulated disks.\n"
95 "You might have to change the root device\n"
96 "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
97 "in your root= kernel command line option\n");
98 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
99 }
100 }
101 /* Now unplug the emulated devices */
102 if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE))
103 outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
104 xen_platform_pci_unplug = xen_emul_unplug;
105}
106
107static int __init parse_xen_emul_unplug(char *arg)
108{
109 char *p, *q;
110 int l;
111
112 for (p = arg; p; p = q) {
113 q = strchr(p, ',');
114 if (q) {
115 l = q - p;
116 q++;
117 } else {
118 l = strlen(p);
119 }
120 if (!strncmp(p, "all", l))
121 xen_emul_unplug |= XEN_UNPLUG_ALL;
122 else if (!strncmp(p, "ide-disks", l))
123 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
124 else if (!strncmp(p, "aux-ide-disks", l))
125 xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
126 else if (!strncmp(p, "nics", l))
127 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
128 else if (!strncmp(p, "ignore", l))
129 xen_emul_unplug |= XEN_UNPLUG_IGNORE;
130 else
131 printk(KERN_WARNING "unrecognised option '%s' "
132 "in parameter 'xen_emul_unplug'\n", p);
133 }
134 return 0;
135}
136early_param("xen_emul_unplug", parse_xen_emul_unplug);
137#endif
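parse_xen_emul_unplug() above turns a comma-separated "xen_emul_unplug=" argument into a bitmask. A user-space sketch of the same parsing, with placeholder bit values (the real constants live in xen/platform_pci.h and are not reproduced here):

/* User-space sketch of the xen_emul_unplug option parser (placeholder flag values). */
#include <stdio.h>
#include <string.h>

#define UNPLUG_ALL_IDE_DISKS	(1 << 0)	/* placeholder, not the kernel value */
#define UNPLUG_ALL_NICS		(1 << 1)
#define UNPLUG_AUX_IDE_DISKS	(1 << 2)
#define UNPLUG_ALL		(UNPLUG_ALL_IDE_DISKS | UNPLUG_ALL_NICS | UNPLUG_AUX_IDE_DISKS)
#define UNPLUG_IGNORE		(1 << 3)

static int parse_unplug(const char *arg)
{
	int mask = 0;
	const char *p, *q;
	size_t l;

	for (p = arg; p; p = q) {
		q = strchr(p, ',');
		if (q) {
			l = (size_t)(q - p);
			q++;
		} else {
			l = strlen(p);
		}
		/* same prefix comparison as the patch */
		if (!strncmp(p, "all", l))
			mask |= UNPLUG_ALL;
		else if (!strncmp(p, "ide-disks", l))
			mask |= UNPLUG_ALL_IDE_DISKS;
		else if (!strncmp(p, "aux-ide-disks", l))
			mask |= UNPLUG_AUX_IDE_DISKS;
		else if (!strncmp(p, "nics", l))
			mask |= UNPLUG_ALL_NICS;
		else if (!strncmp(p, "ignore", l))
			mask |= UNPLUG_IGNORE;
		else
			fprintf(stderr, "unrecognised option '%.*s'\n", (int)l, p);
	}
	return mask;
}

int main(void)
{
	printf("ide-disks,nics -> 0x%x\n", parse_unplug("ide-disks,nics"));
	printf("all            -> 0x%x\n", parse_unplug("all"));
	return 0;
}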
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f47cd4..328b00305426 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
20#include <xen/page.h> 20#include <xen/page.h>
21#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
22#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h>
23#include <xen/features.h> 24#include <xen/features.h>
24 25
25#include "xen-ops.h" 26#include "xen-ops.h"
@@ -32,6 +33,73 @@ extern void xen_sysenter_target(void);
32extern void xen_syscall_target(void); 33extern void xen_syscall_target(void);
33extern void xen_syscall32_target(void); 34extern void xen_syscall32_target(void);
34 35
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr)
38{
39 struct xen_memory_reservation reservation = {
40 .address_bits = 0,
41 .extent_order = 0,
42 .domid = DOMID_SELF
43 };
44 unsigned long start, end;
45 unsigned long len = 0;
46 unsigned long pfn;
47 int ret;
48
49 start = PFN_UP(start_addr);
50 end = PFN_DOWN(end_addr);
51
52 if (end <= start)
53 return 0;
54
55 printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
56 start, end);
 57 for (pfn = start; pfn < end; pfn++) {
58 unsigned long mfn = pfn_to_mfn(pfn);
59
60 /* Make sure pfn exists to start with */
61 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
62 continue;
63
64 set_xen_guest_handle(reservation.extent_start, &mfn);
65 reservation.nr_extents = 1;
66
67 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
68 &reservation);
69 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
70 start, end, ret);
71 if (ret == 1) {
72 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
73 len++;
74 }
75 }
76 printk(KERN_CONT "%ld pages freed\n", len);
77
78 return len;
79}
80
81static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820)
83{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0;
86 unsigned long released = 0;
87 int i;
88
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end);
92
93 released += xen_release_chunk(last_end, end);
94 last_end = e820->map[i].addr + e820->map[i].size;
95 }
96
97 if (last_end < max_addr)
98 released += xen_release_chunk(last_end, max_addr);
99
100 printk(KERN_INFO "released %ld pages of unused memory\n", released);
101 return released;
102}
35 103
36/** 104/**
37 * machine_specific_memory_setup - Hook for machine specific memory setup. 105 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -67,6 +135,8 @@ char * __init xen_memory_setup(void)
67 135
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 137
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820);
139
70 return "Xen"; 140 return "Xen";
71} 141}
72 142
@@ -156,6 +226,8 @@ void __init xen_arch_setup(void)
156 struct physdev_set_iopl set_iopl; 226 struct physdev_set_iopl set_iopl;
157 int rc; 227 int rc;
158 228
229 xen_panic_handler_init();
230
159 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 231 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 232 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
161 233
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a29693fd3138..25f232b18a82 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -394,6 +394,8 @@ static void stop_self(void *v)
394 load_cr3(swapper_pg_dir); 394 load_cr3(swapper_pg_dir);
395 /* should set up a minimal gdt */ 395 /* should set up a minimal gdt */
396 396
397 set_cpu_online(cpu, false);
398
397 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); 399 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
398 BUG(); 400 BUG();
399} 401}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c661108034..1d789d56877c 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled)
30{
31 int cpu;
32 xen_hvm_init_shared_info();
33 xen_callback_vector();
34 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
35 for_each_online_cpu(cpu) {
36 xen_setup_runstate_info(cpu);
37 }
38 }
39}
40
29void xen_post_suspend(int suspend_cancelled) 41void xen_post_suspend(int suspend_cancelled)
30{ 42{
31 xen_build_mfn_list_list(); 43 xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3c6c59ed302..1a5353a753fc 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
20#include <asm/xen/hypercall.h> 20#include <asm/xen/hypercall.h>
21 21
22#include <xen/events.h> 22#include <xen/events.h>
23#include <xen/features.h>
23#include <xen/interface/xen.h> 24#include <xen/interface/xen.h>
24#include <xen/interface/vcpu.h> 25#include <xen/interface/vcpu.h>
25 26
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void)
155 account_idle_ticks(ticks); 156 account_idle_ticks(ticks);
156} 157}
157 158
158/*
159 * Xen sched_clock implementation. Returns the number of unstolen
160 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
161 * states.
162 */
163unsigned long long xen_sched_clock(void)
164{
165 struct vcpu_runstate_info state;
166 cycle_t now;
167 u64 ret;
168 s64 offset;
169
170 /*
171 * Ideally sched_clock should be called on a per-cpu basis
172 * anyway, so preempt should already be disabled, but that's
173 * not current practice at the moment.
174 */
175 preempt_disable();
176
177 now = xen_clocksource_read();
178
179 get_runstate_snapshot(&state);
180
181 WARN_ON(state.state != RUNSTATE_running);
182
183 offset = now - state.state_entry_time;
184 if (offset < 0)
185 offset = 0;
186
187 ret = state.time[RUNSTATE_blocked] +
188 state.time[RUNSTATE_running] +
189 offset;
190
191 preempt_enable();
192
193 return ret;
194}
195
196
197/* Get the TSC speed from Xen */ 159/* Get the TSC speed from Xen */
198unsigned long xen_tsc_khz(void) 160static unsigned long xen_tsc_khz(void)
199{ 161{
200 struct pvclock_vcpu_time_info *info = 162 struct pvclock_vcpu_time_info *info =
201 &HYPERVISOR_shared_info->vcpu_info[0].time; 163 &HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts)
230 put_cpu_var(xen_vcpu); 192 put_cpu_var(xen_vcpu);
231} 193}
232 194
233unsigned long xen_get_wallclock(void) 195static unsigned long xen_get_wallclock(void)
234{ 196{
235 struct timespec ts; 197 struct timespec ts;
236 198
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void)
238 return ts.tv_sec; 200 return ts.tv_sec;
239} 201}
240 202
241int xen_set_wallclock(unsigned long now) 203static int xen_set_wallclock(unsigned long now)
242{ 204{
243 /* do nothing for domU */ 205 /* do nothing for domU */
244 return -1; 206 return -1;
@@ -473,7 +435,11 @@ void xen_timer_resume(void)
473 } 435 }
474} 436}
475 437
476__init void xen_time_init(void) 438static const struct pv_time_ops xen_time_ops __initdata = {
439 .sched_clock = xen_clocksource_read,
440};
441
442static __init void xen_time_init(void)
477{ 443{
478 int cpu = smp_processor_id(); 444 int cpu = smp_processor_id();
479 struct timespec tp; 445 struct timespec tp;
@@ -497,3 +463,47 @@ __init void xen_time_init(void)
497 xen_setup_timer(cpu); 463 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 464 xen_setup_cpu_clockevents();
499} 465}
466
467__init void xen_init_time_ops(void)
468{
469 pv_time_ops = xen_time_ops;
470
471 x86_init.timers.timer_init = xen_time_init;
472 x86_init.timers.setup_percpu_clockev = x86_init_noop;
473 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
474
475 x86_platform.calibrate_tsc = xen_tsc_khz;
476 x86_platform.get_wallclock = xen_get_wallclock;
477 x86_platform.set_wallclock = xen_set_wallclock;
478}
479
480#ifdef CONFIG_XEN_PVHVM
481static void xen_hvm_setup_cpu_clockevents(void)
482{
483 int cpu = smp_processor_id();
484 xen_setup_runstate_info(cpu);
485 xen_setup_timer(cpu);
486 xen_setup_cpu_clockevents();
487}
488
489__init void xen_hvm_init_time_ops(void)
490{
 491 /* a vector callback is needed, otherwise we cannot receive interrupts
492 * on cpu > 0 */
493 if (!xen_have_vector_callback && num_present_cpus() > 1)
494 return;
495 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 496 printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
 497 "disabling the pv timer\n");
498 return;
499 }
500
501 pv_time_ops = xen_time_ops;
502 x86_init.timers.setup_percpu_clockev = xen_time_init;
503 x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
504
505 x86_platform.calibrate_tsc = xen_tsc_khz;
506 x86_platform.get_wallclock = xen_get_wallclock;
507 x86_platform.set_wallclock = xen_set_wallclock;
508}
509#endif
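With xen_sched_clock() gone, the patch points pv_time_ops.sched_clock at xen_clocksource_read(), which reads the pvclock record Xen publishes per vcpu. A rough user-space sketch of the pvclock arithmetic; the struct and sample values below are stand-ins, and the kernel performs the multiply with more precision than plain 64-bit math:

/* Rough sketch of pvclock-style time scaling (not the kernel implementation). */
#include <stdint.h>
#include <stdio.h>

struct pv_time_sample {
	uint64_t tsc_timestamp;		/* TSC when the hypervisor updated the record */
	uint64_t system_time;		/* nanoseconds at tsc_timestamp */
	uint32_t tsc_to_system_mul;	/* 0.32 fixed-point multiplier */
	int8_t   tsc_shift;		/* pre-scale shift */
};

static uint64_t pvclock_ns(const struct pv_time_sample *t, uint64_t tsc_now)
{
	uint64_t delta = tsc_now - t->tsc_timestamp;

	if (t->tsc_shift >= 0)
		delta <<= t->tsc_shift;
	else
		delta >>= -t->tsc_shift;

	/* (delta * mul) >> 32; fine in 64 bits for small deltas */
	return t->system_time + ((delta * t->tsc_to_system_mul) >> 32);
}

int main(void)
{
	/* made-up sample: roughly one nanosecond per three TSC ticks */
	struct pv_time_sample t = {
		.tsc_timestamp = 1000000,
		.system_time = 500000000,
		.tsc_to_system_mul = 0x55555555,
		.tsc_shift = 0,
	};

	printf("%llu ns\n", (unsigned long long)pvclock_ns(&t, 1003000));
	return 0;
}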
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bce..7c8ab86163e9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -38,6 +38,10 @@ void xen_enable_sysenter(void);
38void xen_enable_syscall(void); 38void xen_enable_syscall(void);
39void xen_vcpu_restore(void); 39void xen_vcpu_restore(void);
40 40
41void xen_callback_vector(void);
42void xen_hvm_init_shared_info(void);
43void __init xen_unplug_emulated_devices(void);
44
41void __init xen_build_dynamic_phys_to_machine(void); 45void __init xen_build_dynamic_phys_to_machine(void);
42 46
43void xen_init_irq_ops(void); 47void xen_init_irq_ops(void);
@@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu);
46void xen_teardown_timer(int cpu); 50void xen_teardown_timer(int cpu);
47cycle_t xen_clocksource_read(void); 51cycle_t xen_clocksource_read(void);
48void xen_setup_cpu_clockevents(void); 52void xen_setup_cpu_clockevents(void);
49unsigned long xen_tsc_khz(void); 53void __init xen_init_time_ops(void);
50void __init xen_time_init(void); 54void __init xen_hvm_init_time_ops(void);
51unsigned long xen_get_wallclock(void);
52int xen_set_wallclock(unsigned long time);
53unsigned long long xen_sched_clock(void);
54 55
55irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 56irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
56 57
@@ -101,4 +102,6 @@ void xen_sysret32(void);
101void xen_sysret64(void); 102void xen_sysret64(void);
102void xen_adjust_exception_frame(void); 103void xen_adjust_exception_frame(void);
103 104
105extern int xen_panic_handler_init(void);
106
104#endif /* XEN_OPS_H */ 107#endif /* XEN_OPS_H */