Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                              |  31
-rw-r--r--  arch/x86/Kconfig.cpu                          |   4
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S             |   6
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c            |  24
-rw-r--r--  arch/x86/ia32/ia32_aout.c                     |   1
-rw-r--r--  arch/x86/ia32/ia32entry.S                     |   1
-rw-r--r--  arch/x86/include/asm/acpi.h                   |   9
-rw-r--r--  arch/x86/include/asm/amd_nb.h                 |  12
-rw-r--r--  arch/x86/include/asm/bitops.h                 |   4
-rw-r--r--  arch/x86/include/asm/cacheflush.h             |   2
-rw-r--r--  arch/x86/include/asm/dma.h                    |   6
-rw-r--r--  arch/x86/include/asm/kdebug.h                 |   2
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h            |   5
-rw-r--r--  arch/x86/include/asm/kvm_host.h               |  12
-rw-r--r--  arch/x86/include/asm/mmu.h                    |   6
-rw-r--r--  arch/x86/include/asm/msr-index.h              |   1
-rw-r--r--  arch/x86/include/asm/nmi.h                    |   4
-rw-r--r--  arch/x86/include/asm/nops.h                   |   2
-rw-r--r--  arch/x86/include/asm/olpc.h                   |   2
-rw-r--r--  arch/x86/include/asm/percpu.h                 |  48
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h          |   4
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h         |  11
-rw-r--r--  arch/x86/include/asm/processor-flags.h        |   2
-rw-r--r--  arch/x86/include/asm/ptrace-abi.h             |   2
-rw-r--r--  arch/x86/include/asm/ptrace.h                 |   4
-rw-r--r--  arch/x86/include/asm/reboot.h                 |   5
-rw-r--r--  arch/x86/include/asm/segment.h                |  12
-rw-r--r--  arch/x86/include/asm/stacktrace.h             |   6
-rw-r--r--  arch/x86/include/asm/thread_info.h            |  10
-rw-r--r--  arch/x86/include/asm/trampoline.h             |  33
-rw-r--r--  arch/x86/include/asm/tsc.h                    |   2
-rw-r--r--  arch/x86/include/asm/types.h                  |  16
-rw-r--r--  arch/x86/include/asm/unistd_32.h              |   3
-rw-r--r--  arch/x86/include/asm/unistd_64.h              |   2
-rw-r--r--  arch/x86/include/asm/xen/interface.h          |   2
-rw-r--r--  arch/x86/kernel/Makefile                      |   7
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S        |  21
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h        |   5
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S    |  28
-rw-r--r--  arch/x86/kernel/acpi/sleep.c                  |  77
-rw-r--r--  arch/x86/kernel/acpi/sleep.h                  |   5
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S              |  12
-rw-r--r--  arch/x86/kernel/alternative.c                 |   9
-rw-r--r--  arch/x86/kernel/amd_nb.c                      |  20
-rw-r--r--  arch/x86/kernel/aperture_64.c                 |   2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c                |   6
-rw-r--r--  arch/x86/kernel/apm_32.c                      |  19
-rw-r--r--  arch/x86/kernel/cpu/amd.c                     |   4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c        |   4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c     |   2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c     |   5
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c   |   4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c         |  42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c       |   2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c              |   2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c            |   2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c              |   4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c           |   8
-rw-r--r--  arch/x86/kernel/cpu/vmware.c                  |   2
-rw-r--r--  arch/x86/kernel/crash_dump_32.c               |   3
-rw-r--r--  arch/x86/kernel/crash_dump_64.c               |   3
-rw-r--r--  arch/x86/kernel/devicetree.c                  |   6
-rw-r--r--  arch/x86/kernel/dumpstack.c                   |  26
-rw-r--r--  arch/x86/kernel/dumpstack_32.c                |  15
-rw-r--r--  arch/x86/kernel/dumpstack_64.c                |  14
-rw-r--r--  arch/x86/kernel/e820.c                        |   1
-rw-r--r--  arch/x86/kernel/early-quirks.c                |   7
-rw-r--r--  arch/x86/kernel/entry_32.S                    |   2
-rw-r--r--  arch/x86/kernel/entry_64.S                    |   4
-rw-r--r--  arch/x86/kernel/head32.c                      |   9
-rw-r--r--  arch/x86/kernel/head64.c                      |   3
-rw-r--r--  arch/x86/kernel/head_64.S                     |   3
-rw-r--r--  arch/x86/kernel/i387.c                        |   2
-rw-r--r--  arch/x86/kernel/irq_32.c                      |   2
-rw-r--r--  arch/x86/kernel/kgdb.c                        |   2
-rw-r--r--  arch/x86/kernel/kvm.c                         |   2
-rw-r--r--  arch/x86/kernel/mca_32.c                      |   2
-rw-r--r--  arch/x86/kernel/mpparse.c                     |  12
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c              |   4
-rw-r--r--  arch/x86/kernel/process.c                     |   2
-rw-r--r--  arch/x86/kernel/process_64.c                  |   8
-rw-r--r--  arch/x86/kernel/reboot.c                      | 120
-rw-r--r--  arch/x86/kernel/reboot_32.S                   | 135
-rw-r--r--  arch/x86/kernel/setup.c                       |  56
-rw-r--r--  arch/x86/kernel/smpboot.c                     |  10
-rw-r--r--  arch/x86/kernel/stacktrace.c                  |   6
-rw-r--r--  arch/x86/kernel/step.c                        |   2
-rw-r--r--  arch/x86/kernel/syscall_table_32.S            |   1
-rw-r--r--  arch/x86/kernel/topology.c                    |   2
-rw-r--r--  arch/x86/kernel/trampoline.c                  |  42
-rw-r--r--  arch/x86/kernel/trampoline_32.S               |  15
-rw-r--r--  arch/x86/kernel/trampoline_64.S               |  28
-rw-r--r--  arch/x86/kernel/tsc.c                         |   4
-rw-r--r--  arch/x86/kernel/verify_cpu.S                  |   2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S                 |  17
-rw-r--r--  arch/x86/kernel/xsave.c                       |   2
-rw-r--r--  arch/x86/kvm/emulate.c                        |  52
-rw-r--r--  arch/x86/kvm/i8259.c                          |  25
-rw-r--r--  arch/x86/kvm/lapic.c                          |  13
-rw-r--r--  arch/x86/kvm/lapic.h                          |   1
-rw-r--r--  arch/x86/kvm/mmu.c                            | 150
-rw-r--r--  arch/x86/kvm/paging_tmpl.h                    |  19
-rw-r--r--  arch/x86/kvm/svm.c                            |  27
-rw-r--r--  arch/x86/kvm/timer.c                          |   2
-rw-r--r--  arch/x86/kvm/vmx.c                            | 128
-rw-r--r--  arch/x86/kvm/x86.c                            | 155
-rw-r--r--  arch/x86/lguest/boot.c                        |   2
-rw-r--r--  arch/x86/lib/Makefile                         |   1
-rw-r--r--  arch/x86/lib/cmpxchg16b_emu.S                 |  59
-rw-r--r--  arch/x86/lib/copy_user_64.S                   |   2
-rw-r--r--  arch/x86/lib/csum-copy_64.S                   | 242
-rw-r--r--  arch/x86/lib/csum-partial_64.c                |   2
-rw-r--r--  arch/x86/mm/hugetlbpage.c                     |   2
-rw-r--r--  arch/x86/mm/init_32.c                         |   2
-rw-r--r--  arch/x86/mm/init_64.c                         |  41
-rw-r--r--  arch/x86/mm/numa_64.c                         |   2
-rw-r--r--  arch/x86/mm/pageattr.c                        |   2
-rw-r--r--  arch/x86/mm/pgtable.c                         |   3
-rw-r--r--  arch/x86/oprofile/backtrace.c                 |   2
-rw-r--r--  arch/x86/oprofile/op_model_p4.c               |   2
-rw-r--r--  arch/x86/pci/i386.c                           |   4
-rw-r--r--  arch/x86/pci/irq.c                            |  15
-rw-r--r--  arch/x86/pci/xen.c                            |  41
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c             |  42
-rw-r--r--  arch/x86/vdso/vdso32-setup.c                  |  15
-rw-r--r--  arch/x86/xen/Kconfig                          |   2
-rw-r--r--  arch/x86/xen/mmu.c                            |  26
127 files changed, 1213 insertions, 1030 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f8958b01b975..140e254fe546 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -123,7 +123,7 @@ config NEED_SG_DMA_LENGTH
 	def_bool y
 
 config GENERIC_ISA_DMA
-	def_bool y
+	def_bool ISA_DMA_API
 
 config GENERIC_IOMAP
 	def_bool y
@@ -143,7 +143,7 @@ config GENERIC_GPIO
 	bool
 
 config ARCH_MAY_HAVE_PC_FDC
-	def_bool y
+	def_bool ISA_DMA_API
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool !X86_XADD
@@ -221,10 +221,6 @@ config X86_HT
 	def_bool y
 	depends on SMP
 
-config X86_TRAMPOLINE
-	def_bool y
-	depends on SMP || (64BIT && ACPI_SLEEP)
-
 config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
@@ -2006,9 +2002,13 @@ source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
 
-# x86_64 have no ISA slots, but do have ISA-style DMA.
+# x86_64 have no ISA slots, but can have ISA-style DMA.
 config ISA_DMA_API
-	def_bool y
+	bool "ISA-style DMA support" if (X86_64 && EXPERT)
+	default y
+	help
+	  Enables ISA-style DMA support for devices requiring such controllers.
+	  If unsure, say Y.
 
 if X86_32
 
@@ -2096,6 +2096,16 @@ source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
 
+config RAPIDIO
+	bool "RapidIO support"
+	depends on PCI
+	default n
+	help
+	  If you say Y here, the kernel will include drivers and
+	  infrastructure code to support RapidIO interconnect devices.
+
+source "drivers/rapidio/Kconfig"
+
 endmenu
 
 
@@ -2130,6 +2140,11 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
+config KEYS_COMPAT
+	bool
+	depends on COMPAT && KEYS
+	default y
+
 endmenu
 
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index ed47e6e1747f..d161e939df62 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -326,7 +326,7 @@ config X86_PPRO_FENCE
326 Old PentiumPro multiprocessor systems had errata that could cause 326 Old PentiumPro multiprocessor systems had errata that could cause
327 memory operations to violate the x86 ordering standard in rare cases. 327 memory operations to violate the x86 ordering standard in rare cases.
328 Enabling this option will attempt to work around some (but not all) 328 Enabling this option will attempt to work around some (but not all)
329 occurances of this problem, at the cost of much heavier spinlock and 329 occurrences of this problem, at the cost of much heavier spinlock and
330 memory barrier operations. 330 memory barrier operations.
331 331
332 If unsure, say n here. Even distro kernels should think twice before 332 If unsure, say n here. Even distro kernels should think twice before
@@ -366,7 +366,7 @@ config X86_INTEL_USERCOPY
366 366
367config X86_USE_PPRO_CHECKSUM 367config X86_USE_PPRO_CHECKSUM
368 def_bool y 368 def_bool y
369 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM 369 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
370 370
371config X86_USE_3DNOW 371config X86_USE_3DNOW
372 def_bool y 372 def_bool y
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8fe2a4966b7a..adcf794b22e2 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt:
 	and $15, %r13			# %r13 = arg4 (mod 16)
 	je _multiple_of_16_bytes_decrypt
 
-	# Handle the last <16 byte block seperately
+	# Handle the last <16 byte block separately
 
 	paddd ONE(%rip), %xmm0		# increment CNT to get Yn
 	movdqa SHUF_MASK(%rip), %xmm10
@@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt:
 	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
 	sub $16, %r11
 	add %r13, %r11
-	movdqu (%arg3,%r11,1), %xmm1	# recieve the last <16 byte block
+	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
 	lea SHIFT_MASK+16(%rip), %r12
 	sub %r13, %r12
 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
@@ -1607,7 +1607,7 @@ _zero_cipher_left_encrypt:
 	and $15, %r13			# %r13 = arg4 (mod 16)
 	je _multiple_of_16_bytes_encrypt
 
-	# Handle the last <16 Byte block seperately
+	# Handle the last <16 Byte block separately
 	paddd ONE(%rip), %xmm0		# INCR CNT to get Yn
 	movdqa SHUF_MASK(%rip), %xmm10
 	PSHUFB_XMM %xmm10, %xmm0
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e1e60c7d5813..e0e6340c8dad 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -873,22 +873,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0); 873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874 874
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); 875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) { 876 if (ret)
877 crypto_free_ablkcipher(ctr_tfm); 877 goto out_free_ablkcipher;
878 return ret;
879 }
880 878
879 ret = -ENOMEM;
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); 880 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) { 881 if (!req)
883 crypto_free_ablkcipher(ctr_tfm); 882 goto out_free_ablkcipher;
884 return -EINVAL;
885 }
886 883
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); 884 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) { 885 if (!req_data)
889 crypto_free_ablkcipher(ctr_tfm); 886 goto out_free_request;
890 return -ENOMEM; 887
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv)); 888 memset(req_data->iv, 0, sizeof(req_data->iv));
893 889
894 /* Clear the data in the hash sub key container to zero.*/ 890 /* Clear the data in the hash sub key container to zero.*/
@@ -913,8 +909,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
913 if (!ret) 909 if (!ret)
914 ret = req_data->result.err; 910 ret = req_data->result.err;
915 } 911 }
916 ablkcipher_request_free(req);
917 kfree(req_data); 912 kfree(req_data);
913out_free_request:
914 ablkcipher_request_free(req);
915out_free_ablkcipher:
918 crypto_free_ablkcipher(ctr_tfm); 916 crypto_free_ablkcipher(ctr_tfm);
919 return ret; 917 return ret;
920} 918}
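Note: the glue-code hunk above is a pure error-path conversion. Three duplicated crypto_free_ablkcipher() calls collapse into a single goto-based unwind ladder, and a failed ablkcipher_request_alloc() now reports -ENOMEM rather than the previous, misleading -EINVAL. A minimal standalone sketch of the same idiom, with hypothetical names and plain malloc() standing in for the crypto allocations:

	#include <stdlib.h>

	/* Each failure jumps to a label that frees only what was already
	 * allocated, so each piece of cleanup code exists exactly once. */
	static int setup_chain(void)
	{
		int ret = -1;		/* assume failure until proven otherwise */
		char *a, *b;

		a = malloc(16);
		if (!a)
			goto out;

		b = malloc(16);
		if (!b)
			goto out_free_a;

		/* ... use a and b ... */
		ret = 0;

		free(b);
	out_free_a:
		free(a);
	out:
		return ret;
	}

	int main(void)
	{
		return setup_chain() ? EXIT_FAILURE : EXIT_SUCCESS;
	}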
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
298 /* OK, This is the point of no return */ 298 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 299 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 300 set_thread_flag(TIF_IA32);
301 current->mm->context.ia32_compat = 1;
301 302
302 setup_new_exec(bprm); 303 setup_new_exec(bprm);
303 304
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 430312ba6e3f..849a9d23c71d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -847,4 +847,5 @@ ia32_sys_call_table:
847 .quad sys_name_to_handle_at 847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at 848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime 849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
850ia32_syscall_end: 851ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index b964ec457546..12e0e7dd869c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
29#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h> 30#include <asm/mmu.h>
31#include <asm/mpspec.h> 31#include <asm/mpspec.h>
32#include <asm/trampoline.h>
32 33
33#define COMPILER_DEPENDENT_INT64 long long 34#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long 35#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -113,11 +114,11 @@ static inline void acpi_disable_pci(void)
113 acpi_noirq_set(); 114 acpi_noirq_set();
114} 115}
115 116
116/* routines for saving/restoring kernel state */ 117/* Low-level suspend routine. */
117extern int acpi_save_state_mem(void); 118extern int acpi_suspend_lowlevel(void);
118extern void acpi_restore_state_mem(void);
119 119
120extern unsigned long acpi_wakeup_address; 120extern const unsigned char acpi_wakeup_code[];
121#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
121 122
122/* early initialization routine */ 123/* early initialization routine */
123extern void acpi_reserve_wakeup_memory(void); 124extern void acpi_reserve_wakeup_memory(void);
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index e264ae5a1443..331682231bb4 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -13,7 +13,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode; 14struct bootnode;
15 15
16extern int early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
18extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
19extern int amd_numa_init(void); 19extern int amd_numa_init(void);
@@ -32,18 +32,18 @@ struct amd_northbridge_info {
32}; 32};
33extern struct amd_northbridge_info amd_northbridges; 33extern struct amd_northbridge_info amd_northbridges;
34 34
35#define AMD_NB_GART 0x1 35#define AMD_NB_GART BIT(0)
36#define AMD_NB_L3_INDEX_DISABLE 0x2 36#define AMD_NB_L3_INDEX_DISABLE BIT(1)
37#define AMD_NB_L3_PARTITIONING 0x4 37#define AMD_NB_L3_PARTITIONING BIT(2)
38 38
39#ifdef CONFIG_AMD_NB 39#ifdef CONFIG_AMD_NB
40 40
41static inline int amd_nb_num(void) 41static inline u16 amd_nb_num(void)
42{ 42{
43 return amd_northbridges.num; 43 return amd_northbridges.num;
44} 44}
45 45
46static inline int amd_nb_has_feature(int feature) 46static inline bool amd_nb_has_feature(unsigned feature)
47{ 47{
48 return ((amd_northbridges.flags & feature) == feature); 48 return ((amd_northbridges.flags & feature) == feature);
49} 49}
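Note: the BIT() conversion above is cosmetic, but the `(flags & feature) == feature` test in amd_nb_has_feature() is subtly stronger than a plain non-zero check once a multi-bit mask is passed in: every requested bit must be set. A small userspace sketch (flag constants copied from the hunk, everything else hypothetical):

	#include <stdbool.h>
	#include <stdio.h>

	#define BIT(n)				(1U << (n))
	#define AMD_NB_GART			BIT(0)
	#define AMD_NB_L3_INDEX_DISABLE		BIT(1)
	#define AMD_NB_L3_PARTITIONING		BIT(2)

	static unsigned nb_flags = AMD_NB_GART | AMD_NB_L3_INDEX_DISABLE;

	/* Same shape as amd_nb_has_feature(): true only if every requested
	 * flag bit is set, which also behaves sensibly for combined masks. */
	static bool nb_has_feature(unsigned feature)
	{
		return (nb_flags & feature) == feature;
	}

	int main(void)
	{
		printf("%d\n", nb_has_feature(AMD_NB_GART));	/* 1 */
		printf("%d\n", nb_has_feature(AMD_NB_GART |
					      AMD_NB_L3_PARTITIONING));	/* 0 */
		return 0;
	}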
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 903683b07e42..69d58131bc8e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -456,14 +456,12 @@ static inline int fls(int x)
456 456
457#ifdef __KERNEL__ 457#ifdef __KERNEL__
458 458
459#include <asm-generic/bitops/ext2-non-atomic.h> 459#include <asm-generic/bitops/le.h>
460 460
461#define ext2_set_bit_atomic(lock, nr, addr) \ 461#define ext2_set_bit_atomic(lock, nr, addr) \
462 test_and_set_bit((nr), (unsigned long *)(addr)) 462 test_and_set_bit((nr), (unsigned long *)(addr))
463#define ext2_clear_bit_atomic(lock, nr, addr) \ 463#define ext2_clear_bit_atomic(lock, nr, addr) \
464 test_and_clear_bit((nr), (unsigned long *)(addr)) 464 test_and_clear_bit((nr), (unsigned long *)(addr))
465 465
466#include <asm-generic/bitops/minix.h>
467
468#endif /* __KERNEL__ */ 466#endif /* __KERNEL__ */
469#endif /* _ASM_X86_BITOPS_H */ 467#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 62f084478f7e..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -71,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
71 * Read/Write : ReadOnly, ReadWrite 71 * Read/Write : ReadOnly, ReadWrite
72 * Presence : NotPresent 72 * Presence : NotPresent
73 * 73 *
74 * Within a catagory, the attributes are mutually exclusive. 74 * Within a category, the attributes are mutually exclusive.
75 * 75 *
76 * The implementation of this API will take care of various aspects that 76 * The implementation of this API will take care of various aspects that
77 * are associated with changing such attributes, such as: 77 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..97b6d8114a43 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -151,6 +151,7 @@
151#define DMA_AUTOINIT 0x10 151#define DMA_AUTOINIT 0x10
152 152
153 153
154#ifdef CONFIG_ISA_DMA_API
154extern spinlock_t dma_spin_lock; 155extern spinlock_t dma_spin_lock;
155 156
156static inline unsigned long claim_dma_lock(void) 157static inline unsigned long claim_dma_lock(void)
@@ -164,6 +165,7 @@ static inline void release_dma_lock(unsigned long flags)
164{ 165{
165 spin_unlock_irqrestore(&dma_spin_lock, flags); 166 spin_unlock_irqrestore(&dma_spin_lock, flags);
166} 167}
168#endif /* CONFIG_ISA_DMA_API */
167 169
168/* enable/disable a specific DMA channel */ 170/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr) 171static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +305,11 @@ static inline int get_dma_residue(unsigned int dmanr)
303} 305}
304 306
305 307
306/* These are in kernel/dma.c: */ 308/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
309#ifdef CONFIG_ISA_DMA_API
307extern int request_dma(unsigned int dmanr, const char *device_id); 310extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr); 311extern void free_dma(unsigned int dmanr);
312#endif
309 313
310/* From PCI */ 314/* From PCI */
311 315
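Note: with ISA_DMA_API now deselectable on x86_64, the guarded declarations above mean callers only build when the API exists. A hypothetical driver fragment (not from this patch) showing the intended consumption of request_dma() and the dma_spin_lock helpers under the same guard:

	#ifdef CONFIG_ISA_DMA_API
	static int example_claim_channel(unsigned int chan)
	{
		unsigned long flags;
		int err;

		err = request_dma(chan, "example");	/* reserve the channel */
		if (err)
			return err;

		flags = claim_dma_lock();	/* serialize 8237 register access */
		clear_dma_ff(chan);		/* reset the flip-flop before programming */
		release_dma_lock(flags);
		return 0;
	}
	#else
	/* Compiled out on CONFIG_ISA_DMA_API=n kernels. */
	static int example_claim_channel(unsigned int chan) { return -ENODEV; }
	#endif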
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 518bbbb9ee59..fe2cc6e105fa 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
26extern int __must_check __die(const char *, struct pt_regs *, long); 26extern int __must_check __die(const char *, struct pt_regs *, long);
27extern void show_registers(struct pt_regs *regs); 27extern void show_registers(struct pt_regs *regs);
28extern void show_trace(struct task_struct *t, struct pt_regs *regs, 28extern void show_trace(struct task_struct *t, struct pt_regs *regs,
29 unsigned long *sp); 29 unsigned long *sp, unsigned long bp);
30extern void __show_regs(struct pt_regs *regs, int all); 30extern void __show_regs(struct pt_regs *regs, int all);
31extern void show_regs(struct pt_regs *regs); 31extern void show_regs(struct pt_regs *regs);
32extern unsigned long oops_begin(void); 32extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8e37deb1eb38..0f5213564326 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 142 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
143 unsigned int count, struct kvm_vcpu *vcpu); 143 unsigned int count, struct kvm_vcpu *vcpu);
144 144
145 bool (*get_cached_descriptor)(struct desc_struct *desc, 145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
146 int seg, struct kvm_vcpu *vcpu); 146 int seg, struct kvm_vcpu *vcpu);
147 void (*set_cached_descriptor)(struct desc_struct *desc, 147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
148 int seg, struct kvm_vcpu *vcpu); 148 int seg, struct kvm_vcpu *vcpu);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
239 int interruptibility; 239 int interruptibility;
240 240
241 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn;
242 243
243 bool have_exception; 244 bool have_exception;
244 struct x86_exception exception; 245 struct x86_exception exception;
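Note: the new base3 parameter exists because in IA-32e mode the system-segment descriptors (TSS, LDT) are 16 bytes wide, carrying bits 63:32 of the base outside the classic 8-byte desc_struct; the emulator needs that upper half to reconstruct the full base. A standalone illustration of the reassembly (helper name and values invented for the example):

	#include <stdint.h>
	#include <stdio.h>

	/* Recombine the low 32 bits of a segment base (from the 8-byte
	 * descriptor) with the extra base3 word from the 16-byte form. */
	static uint64_t full_base(uint32_t base_lo,	/* bits 31:0  */
				  uint32_t base3)	/* bits 63:32 */
	{
		return ((uint64_t)base3 << 32) | base_lo;
	}

	int main(void)
	{
		/* prints 0x123456789abcdef */
		printf("%#llx\n",
		       (unsigned long long)full_base(0x89abcdef, 0x01234567));
		return 0;
	}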
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d29187..c8af0991fdf0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
85 85
86#define ASYNC_PF_PER_VCPU 64 86#define ASYNC_PF_PER_VCPU 64
87 87
88extern spinlock_t kvm_lock; 88extern raw_spinlock_t kvm_lock;
89extern struct list_head vm_list; 89extern struct list_head vm_list;
90 90
91struct kvm_vcpu; 91struct kvm_vcpu;
@@ -255,6 +255,8 @@ struct kvm_mmu {
255 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
256 struct kvm_mmu_page *sp); 256 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq);
258 hpa_t root_hpa; 260 hpa_t root_hpa;
259 int root_level; 261 int root_level;
260 int shadow_root_level; 262 int shadow_root_level;
@@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
335 u64 *last_pte_updated; 337 u64 *last_pte_updated;
336 gfn_t last_pte_gfn; 338 gfn_t last_pte_gfn;
337 339
338 struct {
339 gfn_t gfn; /* presumed gfn during guest pte update */
340 pfn_t pfn; /* pfn corresponding to that gfn */
341 unsigned long mmu_seq;
342 } update_pte;
343
344 struct fpu guest_fpu; 340 struct fpu guest_fpu;
345 u64 xcr0; 341 u64 xcr0;
346 342
@@ -448,7 +444,7 @@ struct kvm_arch {
448 444
449 unsigned long irq_sources_bitmap; 445 unsigned long irq_sources_bitmap;
450 s64 kvmclock_offset; 446 s64 kvmclock_offset;
451 spinlock_t tsc_write_lock; 447 raw_spinlock_t tsc_write_lock;
452 u64 last_tsc_nsec; 448 u64 last_tsc_nsec;
453 u64 last_tsc_offset; 449 u64 last_tsc_offset;
454 u64 last_tsc_write; 450 u64 last_tsc_write;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea5..aeff3e89b222 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -13,6 +13,12 @@ typedef struct {
13 int size; 13 int size;
14 struct mutex lock; 14 struct mutex lock;
15 void *vdso; 15 void *vdso;
16
17#ifdef CONFIG_X86_64
18 /* True if mm supports a task running in 32 bit compatibility mode. */
19 unsigned short ia32_compat;
20#endif
21
16} mm_context_t; 22} mm_context_t;
17 23
18#ifdef CONFIG_SMP 24#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 823d48223400..fd5a1f365c95 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -43,6 +43,7 @@
43 43
44#define MSR_MTRRcap 0x000000fe 44#define MSR_MTRRcap 0x000000fe
45#define MSR_IA32_BBL_CR_CTL 0x00000119 45#define MSR_IA32_BBL_CR_CTL 0x00000119
46#define MSR_IA32_BBL_CR_CTL3 0x0000011e
46 47
47#define MSR_IA32_SYSENTER_CS 0x00000174 48#define MSR_IA32_SYSENTER_CS 0x00000174
48#define MSR_IA32_SYSENTER_ESP 0x00000175 49#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 07f46016d3ff..4886a68f267e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -29,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void);
29 * external nmis, because the local ones are more frequent. 29 * external nmis, because the local ones are more frequent.
30 * 30 *
31 * Also setup some default high/normal/low settings for 31 * Also setup some default high/normal/low settings for
32 * subsystems to registers with. Using 4 bits to seperate 32 * subsystems to registers with. Using 4 bits to separate
33 * the priorities. This can go alot higher if needed be. 33 * the priorities. This can go a lot higher if needed be.
34 */ 34 */
35 35
36#define NMI_LOCAL_SHIFT 16 /* randomly picked */ 36#define NMI_LOCAL_SHIFT 16 /* randomly picked */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 6d8723a766cc..af788496020b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -38,7 +38,7 @@
38#define K8_NOP8 K8_NOP4 K8_NOP4 38#define K8_NOP8 K8_NOP4 K8_NOP4
39 39
40/* K7 nops 40/* K7 nops
41 uses eax dependencies (arbitary choice) 41 uses eax dependencies (arbitrary choice)
42 1: nop 42 1: nop
43 2: movl %eax,%eax 43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax 44 3: leal (,%eax,1),%eax
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index f482010350fb..5ca6801b75f3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
20 20
21/* 21/*
22 * OLPC board IDs contain the major build number within the mask 0x0ff0, 22 * OLPC board IDs contain the major build number within the mask 0x0ff0,
23 * and the minor build number withing 0x000f. Pre-builds have a minor 23 * and the minor build number within 0x000f. Pre-builds have a minor
24 * number less than 8, and normal builds start at 8. For example, 0x0B10 24 * number less than 8, and normal builds start at 8. For example, 0x0B10
25 * is a PreB1, and 0x0C18 is a C1. 25 * is a PreB1, and 0x0C18 is a C1.
26 */ 26 */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 7e172955ee57..a09e1f052d84 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -451,6 +451,26 @@ do { \
451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452#endif /* !CONFIG_M386 */ 452#endif /* !CONFIG_M386 */
453 453
454#ifdef CONFIG_X86_CMPXCHG64
455#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
456({ \
457 char __ret; \
458 typeof(o1) __o1 = o1; \
459 typeof(o1) __n1 = n1; \
460 typeof(o2) __o2 = o2; \
461 typeof(o2) __n2 = n2; \
462 typeof(o2) __dummy = n2; \
463 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
464 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
465 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
466 __ret; \
467})
468
469#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
470#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
471#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
472#endif /* CONFIG_X86_CMPXCHG64 */
473
454/* 474/*
455 * Per cpu atomic 64 bit operations are only available under 64 bit. 475 * Per cpu atomic 64 bit operations are only available under 64 bit.
456 * 32 bit must fall back to generic operations. 476 * 32 bit must fall back to generic operations.
@@ -480,6 +500,34 @@ do { \
480#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 500#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
481#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 501#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
482#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 502#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
503
504/*
505 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
506 * is not supported on early AMD64 processors so we must be able to emulate
507 * it in software. The address used in the cmpxchg16 instruction must be
508 * aligned to a 16 byte boundary.
509 */
510#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
511({ \
512 char __ret; \
513 typeof(o1) __o1 = o1; \
514 typeof(o1) __n1 = n1; \
515 typeof(o2) __o2 = o2; \
516 typeof(o2) __n2 = n2; \
517 typeof(o2) __dummy; \
518 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
519 "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \
520 X86_FEATURE_CX16, \
521 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
522 "S" (&pcp1), "b"(__n1), "c"(__n2), \
523 "a"(__o1), "d"(__o2)); \
524 __ret; \
525})
526
527#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
528#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
529#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
530
483#endif 531#endif
484 532
485/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 533/* This is not atomic against other CPUs -- CPU preemption needs to be off */
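Note: the double-word cmpxchg macros above back the generic this_cpu_cmpxchg_double() operation. A hypothetical kernel fragment (names invented here) showing the usage pattern they enable, an adjacent pointer/sequence pair swapped as one unit, which is the shape a lockless per-cpu freelist such as SLUB's wants; the two fields must be adjacent and aligned so they map onto a single cmpxchg8b/cmpxchg16b operand:

	/* Both words live in one per-cpu, double-word-aligned slot. */
	struct pair {
		void *ptr;		/* first word */
		unsigned long seq;	/* second word, adjacent */
	} __aligned(2 * sizeof(void *));

	static DEFINE_PER_CPU(struct pair, demo_pair);

	static int push_demo(void *old_ptr, unsigned long old_seq, void *new_ptr)
	{
		/* Returns nonzero if both words still held the old values and
		 * were replaced as a unit; zero means another path raced us. */
		return this_cpu_cmpxchg_double(demo_pair.ptr, demo_pair.seq,
					       old_ptr, old_seq,
					       new_ptr, old_seq + 1);
	}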
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index cc29086e30cd..56fd9e3abbda 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 */ 3 */
4 4
5#ifndef PERF_EVENT_P4_H 5#ifndef PERF_EVENT_P4_H
@@ -9,7 +9,7 @@
9#include <linux/bitops.h> 9#include <linux/bitops.h>
10 10
11/* 11/*
12 * NetBurst has perfomance MSRs shared between 12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, ie for both logical 13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support 14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its 15 * perf-MSRs are not shared and every thread has its
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 94b979d1b58d..effff47a3c82 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd)
69 69
70static inline void pud_clear(pud_t *pudp) 70static inline void pud_clear(pud_t *pudp)
71{ 71{
72 unsigned long pgd;
73
74 set_pud(pudp, __pud(0)); 72 set_pud(pudp, __pud(0));
75 73
76 /* 74 /*
@@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp)
79 * section 8.1: in PAE mode we explicitly have to flush the 77 * section 8.1: in PAE mode we explicitly have to flush the
80 * TLB via cr3 if the top-level pgd is changed... 78 * TLB via cr3 if the top-level pgd is changed...
81 * 79 *
82 * Make sure the pud entry we're updating is within the 80 * Currently all places where pud_clear() is called either have
83 * current pgd to avoid unnecessary TLB flushes. 81 * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
82 * pud_clear_bad()), so we don't need TLB flush here.
84 */ 83 */
85 pgd = read_cr3();
86 if (__pa(pudp) >= pgd && __pa(pudp) <
87 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
88 write_cr3(pgd);
89} 84}
90 85
91#ifdef CONFIG_SMP 86#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..a898a2b6e10c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -7,7 +7,7 @@
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ 10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ 12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ 13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 52b098a6eebb..7b0a55a88851 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -31,7 +31,7 @@
31#define R12 24 31#define R12 24
32#define RBP 32 32#define RBP 32
33#define RBX 40 33#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save upto here*/ 34/* arguments: interrupts/non tracing syscalls only save up to here*/
35#define R11 48 35#define R11 48
36#define R10 56 36#define R10 56
37#define R9 64 37#define R9 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 78cd1ea94500..1babf8adecdf 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -73,7 +73,7 @@ struct pt_regs {
73 unsigned long r12; 73 unsigned long r12;
74 unsigned long rbp; 74 unsigned long rbp;
75 unsigned long rbx; 75 unsigned long rbx;
76/* arguments: non interrupts/non tracing syscalls only save upto here*/ 76/* arguments: non interrupts/non tracing syscalls only save up to here*/
77 unsigned long r11; 77 unsigned long r11;
78 unsigned long r10; 78 unsigned long r10;
79 unsigned long r9; 79 unsigned long r9;
@@ -103,7 +103,7 @@ struct pt_regs {
103 unsigned long r12; 103 unsigned long r12;
104 unsigned long bp; 104 unsigned long bp;
105 unsigned long bx; 105 unsigned long bx;
106/* arguments: non interrupts/non tracing syscalls only save upto here*/ 106/* arguments: non interrupts/non tracing syscalls only save up to here*/
107 unsigned long r11; 107 unsigned long r11;
108 unsigned long r10; 108 unsigned long r10;
109 unsigned long r9; 109 unsigned long r9;
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..3250e3d605d9 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,7 +18,10 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */
23#define MRR_BIOS 0
24#define MRR_APM 1
22 25
23typedef void (*nmi_shootdown_cb)(int, struct die_args*); 26typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback); 27void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 231f1c1d6607..cd84f7208f76 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,14 +1,16 @@
1#ifndef _ASM_X86_SEGMENT_H 1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H 2#define _ASM_X86_SEGMENT_H
3 3
4#include <linux/const.h>
5
4/* Constructor for a conventional segment GDT (or LDT) entry */ 6/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */ 7/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \ 8#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \ 9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \ 10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \ 11 (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \ 12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
11 (((limit) & 0x0000ffffULL))) 13 (((limit) & _AC(0x0000ffff,ULL))))
12 14
13/* Simple and small GDT entries for booting only */ 15/* Simple and small GDT entries for booting only */
14 16
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 52b5c7ed3608..d7e89c83645d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -47,7 +47,7 @@ struct stacktrace_ops {
47}; 47};
48 48
49void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 49void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
50 unsigned long *stack, 50 unsigned long *stack, unsigned long bp,
51 const struct stacktrace_ops *ops, void *data); 51 const struct stacktrace_ops *ops, void *data);
52 52
53#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
@@ -86,11 +86,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs)
86 86
87extern void 87extern void
88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
89 unsigned long *stack, char *log_lvl); 89 unsigned long *stack, unsigned long bp, char *log_lvl);
90 90
91extern void 91extern void
92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
93 unsigned long *sp, char *log_lvl); 93 unsigned long *sp, unsigned long bp, char *log_lvl);
94 94
95extern unsigned int code_bytes; 95extern unsigned int code_bytes;
96 96
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a0..1f2e61e28981 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -161,8 +161,14 @@ struct thread_info {
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
163 163
164#define alloc_thread_info(tsk) \ 164#define alloc_thread_info_node(tsk, node) \
165 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) 165({ \
166 struct page *page = alloc_pages_node(node, THREAD_FLAGS, \
167 THREAD_ORDER); \
168 struct thread_info *ret = page ? page_address(page) : NULL; \
169 \
170 ret; \
171})
166 172
167#ifdef CONFIG_X86_32 173#ifdef CONFIG_X86_32
168 174
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index f4500fb3b485..feca3118a73b 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,25 +3,36 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE 6#include <linux/types.h>
7#include <asm/io.h>
8
7/* 9/*
8 * Trampoline 80x86 program as an array. 10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
9 */ 13 */
10extern const unsigned char trampoline_data []; 14extern const unsigned char x86_trampoline_start [];
11extern const unsigned char trampoline_end []; 15extern const unsigned char x86_trampoline_end [];
12extern unsigned char *trampoline_base; 16extern unsigned char *x86_trampoline_base;
13 17
14extern unsigned long init_rsp; 18extern unsigned long init_rsp;
15extern unsigned long initial_code; 19extern unsigned long initial_code;
16extern unsigned long initial_gs; 20extern unsigned long initial_gs;
17 21
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
19 30
20extern unsigned long setup_trampoline(void); 31/* Address of the SMP trampoline */
21extern void __init reserve_trampoline_memory(void); 32static inline unsigned long trampoline_address(void)
22#else 33{
23static inline void reserve_trampoline_memory(void) {} 34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
24#endif /* CONFIG_X86_TRAMPOLINE */ 35}
25 36
26#endif /* __ASSEMBLY__ */ 37#endif /* __ASSEMBLY__ */
27 38
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d03..83e2efd181e2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void)
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 /* 37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
40 */ 40 */
41#ifndef CONFIG_X86_TSC 41#ifndef CONFIG_X86_TSC
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index df1da20f4534..8e8c23fef08c 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,22 +1,6 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#define dma_addr_t dma_addr_t
5
6#include <asm-generic/types.h> 4#include <asm-generic/types.h>
7 5
8#ifdef __KERNEL__
9#ifndef __ASSEMBLY__
10
11typedef u64 dma64_addr_t;
12#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
13/* DMA addresses come in 32-bit and 64-bit flavours. */
14typedef u64 dma_addr_t;
15#else
16typedef u32 dma_addr_t;
17#endif
18
19#endif /* __ASSEMBLY__ */
20#endif /* __KERNEL__ */
21
22#endif /* _ASM_X86_TYPES_H */ 6#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index ffaf183c619a..a755ef5e5977 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -349,10 +349,11 @@
349#define __NR_name_to_handle_at 341 349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342 350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343 351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
352 353
353#ifdef __KERNEL__ 354#ifdef __KERNEL__
354 355
355#define NR_syscalls 344 356#define NR_syscalls 345
356 357
357#define __ARCH_WANT_IPC_PARSE_VERSION 358#define __ARCH_WANT_IPC_PARSE_VERSION
358#define __ARCH_WANT_OLD_READDIR 359#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 5466bea670e7..160fa76bd578 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -675,6 +675,8 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) 675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305 676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) 677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs)
678 680
679#ifndef __NO_STUBS 681#ifndef __NO_STUBS
680#define __ARCH_WANT_OLD_READDIR 682#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 1c10c88ee4e1..5d4922ad4b9b 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void);
86 * The privilege level specifies which modes may enter a trap via a software 86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows: 88 * privilege levels as follows:
89 * Level == 0: Noone may enter 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter 90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter 91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter 92 * Level == 3: Everyone may enter
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 62445ba2f8a8..7338ef2218bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
43obj-y += bootflag.o e820.o 43obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o 47obj-y += pci-iommu_table.o
48obj-y += resource.o 48obj-y += resource.o
49 49
50obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 50obj-y += trampoline.o trampoline_$(BITS).o
51obj-y += process.o 51obj-y += process.o
52obj-y += i387.o xsave.o 52obj-y += i387.o xsave.o
53obj-y += ptrace.o 53obj-y += ptrace.o
@@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o
55obj-$(CONFIG_IA32_EMULATION) += tls.o 55obj-$(CONFIG_IA32_EMULATION) += tls.o
56obj-y += step.o 56obj-y += step.o
57obj-$(CONFIG_INTEL_TXT) += tboot.o 57obj-$(CONFIG_INTEL_TXT) += tboot.o
58obj-$(CONFIG_ISA_DMA_API) += i8237.o
58obj-$(CONFIG_STACKTRACE) += stacktrace.o 59obj-$(CONFIG_STACKTRACE) += stacktrace.o
59obj-y += cpu/ 60obj-y += cpu/
60obj-y += acpi/ 61obj-y += acpi/
61obj-y += reboot.o 62obj-y += reboot.o
63obj-$(CONFIG_X86_32) += reboot_32.o
62obj-$(CONFIG_MCA) += mca_32.o 64obj-$(CONFIG_MCA) += mca_32.o
63obj-$(CONFIG_X86_MSR) += msr.o 65obj-$(CONFIG_X86_MSR) += msr.o
64obj-$(CONFIG_X86_CPUID) += cpuid.o 66obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -69,7 +71,6 @@ obj-$(CONFIG_SMP) += smp.o
69obj-$(CONFIG_SMP) += smpboot.o 71obj-$(CONFIG_SMP) += smpboot.o
70obj-$(CONFIG_SMP) += tsc_sync.o 72obj-$(CONFIG_SMP) += tsc_sync.o
71obj-$(CONFIG_SMP) += setup_percpu.o 73obj-$(CONFIG_SMP) += setup_percpu.o
72obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
73obj-$(CONFIG_X86_MPPARSE) += mpparse.o 74obj-$(CONFIG_X86_MPPARSE) += mpparse.o
74obj-y += apic/ 75obj-y += apic/
75obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 76obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..ead21b663117 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 36wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 37wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 38wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 39signature: .long WAKEUP_HEADER_SIGNATURE
34 40
35 .text 41 .text
36 .globl _start
37 .code16 42 .code16
38wakeup_code: 43wakeup_code:
39_start:
40 cli
41 cld 44 cld
42 45
43 /* Apparently some dimwit BIOS programmers don't know how to 46 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,12 +80,12 @@ _start:
77 80
78 /* Check header signature... */ 81 /* Check header signature... */
79 movl signature, %eax 82 movl signature, %eax
80 cmpl $0x51ee1111, %eax 83 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 84 jne bogus_real_magic
82 85
83 /* Check we really have everything... */ 86 /* Check we really have everything... */
84 movl end_signature, %eax 87 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 88 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 89 jne bogus_real_magic
87 90
88 /* Call the C code */ 91 /* Call the C code */
@@ -147,3 +150,7 @@ wakeup_heap:
147wakeup_stack: 150wakeup_stack:
148 .space 2048 151 .space 2048
149wakeup_stack_end: 152wakeup_stack_end:
153
154 .section ".signature","a"
155end_signature:
156 .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..e1828c07e79c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -35,7 +35,8 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 35extern struct wakeup_header wakeup_header;
36#endif 36#endif
37 37
38#define HEADER_OFFSET 0x3f00 38#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 39#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
40#define WAKEUP_END_SIGNATURE 0x65a22c82
40 41
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 42#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 68d1537b8c81..ff93bc1b09c3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -18,37 +18,28 @@
18#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
19#include "sleep.h" 19#include "sleep.h"
20 20
21unsigned long acpi_wakeup_address;
22unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
23 22
24/* address in low memory of the wakeup routine. */
25static unsigned long acpi_realmode;
26
27#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
28static char temp_stack[4096]; 24static char temp_stack[4096];
29#endif 25#endif
30 26
31/** 27/**
32 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
33 * 29 *
  * Create an identity mapped page table and copy the wakeup routine to
  * low memory.
- *
- * Note that this is too late to change acpi_wakeup_address.
  */
-int acpi_save_state_mem(void)
+int acpi_suspend_lowlevel(void)
 {
 	struct wakeup_header *header;
+	/* address in low memory of the wakeup routine. */
+	char *acpi_realmode;
 
-	if (!acpi_realmode) {
-		printk(KERN_ERR "Could not allocate memory during boot, "
-		       "S3 disabled\n");
-		return -ENOMEM;
-	}
-	memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+	acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
 
-	header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
-	if (header->signature != 0x51ee1111) {
+	header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
+	if (header->signature != WAKEUP_HEADER_SIGNATURE) {
 		printk(KERN_ERR "wakeup header does not match\n");
 		return -EINVAL;
 	}
@@ -68,9 +59,7 @@ int acpi_save_state_mem(void)
 	/* GDT[0]: GDT self-pointer */
 	header->wakeup_gdt[0] =
 		(u64)(sizeof(header->wakeup_gdt) - 1) +
-		((u64)(acpi_wakeup_address +
-			((char *)&header->wakeup_gdt - (char *)acpi_realmode))
-				<< 16);
+		((u64)__pa(&header->wakeup_gdt) << 16);
 	/* GDT[1]: big real mode-like code segment */
 	header->wakeup_gdt[1] =
 		GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -96,7 +85,7 @@ int acpi_save_state_mem(void)
 	header->pmode_cr3 = (u32)__pa(&initial_page_table);
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
-	header->trampoline_segment = setup_trampoline() >> 4;
+	header->trampoline_segment = trampoline_address() >> 4;
 #ifdef CONFIG_SMP
 	stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
@@ -107,56 +96,10 @@ int acpi_save_state_mem(void)
 	saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
+	do_suspend_lowlevel();
 	return 0;
 }
 
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-
-/**
- * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_wakeup_memory(void)
-{
-	phys_addr_t mem;
-
-	if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
-		printk(KERN_ERR
-		       "ACPI: Wakeup code way too big, S3 disabled.\n");
-		return;
-	}
-
-	mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
-
-	if (mem == MEMBLOCK_ERROR) {
-		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
-		return;
-	}
-	acpi_realmode = (unsigned long) phys_to_virt(mem);
-	acpi_wakeup_address = mem;
-	memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
-}
-
-int __init acpi_configure_wakeup_memory(void)
-{
-	if (acpi_realmode)
-		set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT);
-
-	return 0;
-}
-arch_initcall(acpi_configure_wakeup_memory);
-
-
 static int __init acpi_sleep_setup(char *str)
 {
 	while ((str != NULL) && (*str != '\0')) {
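The wakeup_gdt[0] entry patched above is a descriptor-table "self-pointer": the table limit (size - 1) sits in the low 16 bits and the table's physical base is packed into the next 32 bits, so the structure can be handed directly to lgdt from the trampoline. A minimal user-space sketch of that packing, using a hypothetical base address and table size (not kernel code):

#include <stdint.h>
#include <stdio.h>

/* limit in bits 15:0, physical base in bits 47:16 */
static uint64_t gdt_self_pointer(uint32_t phys_base, uint16_t size)
{
	return (uint64_t)(size - 1) | ((uint64_t)phys_base << 16);
}

int main(void)
{
	/* assumed values: a 3-entry GDT at physical 0x9c000 */
	printf("%#llx\n",
	       (unsigned long long)gdt_self_pointer(0x9c000, 3 * 8));
	return 0;
}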
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
 
 #include <asm/trampoline.h>
 
-extern char wakeup_code_start, wakeup_code_end;
-
 extern unsigned long saved_video_mode;
 extern long saved_magic;
 
 extern int wakeup_pmode_return;
-extern char swsusp_pg_dir[PAGE_SIZE];
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
  * Wrapper script for the realmode binary as a transport object
  * before copying to low memory.
  */
-	.section ".rodata","a"
-	.globl	wakeup_code_start, wakeup_code_end
-wakeup_code_start:
+#include <asm/page_types.h>
+
+	.section ".x86_trampoline","a"
+	.balign PAGE_SIZE
+	.globl	acpi_wakeup_code
+acpi_wakeup_code:
 	.incbin	"arch/x86/kernel/acpi/realmode/wakeup.bin"
-wakeup_code_end:
-	.size	wakeup_code_start, .-wakeup_code_start
+	.size	acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7038b95d363f..4a234677e213 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -199,7 +199,7 @@ void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
    This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that assymetric systems where
+   self modifying code. This implies that asymmetric systems where
    APs have less capabilities than the boot processor are not handled.
    Tough. Make sure you disable such features by hand. */
 
@@ -620,7 +620,12 @@ static int __kprobes stop_machine_text_poke(void *data)
 		flush_icache_range((unsigned long)p->addr,
 				   (unsigned long)p->addr + p->len);
 	}
-
+	/*
+	 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
+	 * that a core serializing instruction such as "cpuid" should be
+	 * executed on _each_ core before the new instruction is made visible.
+	 */
+	sync_core();
 	return 0;
 }
 
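The sync_core() added above boils down to executing a serializing instruction; cpuid is the classic choice because it is architecturally serializing in all modes. A rough user-space illustration of the same idea, assuming GCC inline-assembly syntax on x86 (the kernel's sync_core() is the real interface and also copes with CPUs where cpuid cannot be used):

/* sketch only: a cpuid-based serializing barrier */
static inline void serialize_cpu(void)
{
	unsigned int eax = 1, ebx, ecx, edx;

	/* cpuid is architecturally serializing on x86 */
	__asm__ __volatile__("cpuid"
			     : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
			     : : "memory");
}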
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index ed3c2e5b714a..6801959a8b2a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -15,7 +15,7 @@ static u32 *flush_words;
 const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
 	{}
 };
 EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -48,7 +48,7 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
 
 int amd_cache_northbridges(void)
 {
-	int i = 0;
+	u16 i = 0;
 	struct amd_northbridge *nb;
 	struct pci_dev *misc, *link;
 
@@ -103,9 +103,11 @@ int amd_cache_northbridges(void)
 }
 EXPORT_SYMBOL_GPL(amd_cache_northbridges);
 
-/* Ignores subdevice/subvendor but as far as I can figure out
-   they're useless anyways */
-int __init early_is_amd_nb(u32 device)
+/*
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways
+ */
+bool __init early_is_amd_nb(u32 device)
 {
 	const struct pci_device_id *id;
 	u32 vendor = device & 0xffff;
@@ -113,8 +115,8 @@ int __init early_is_amd_nb(u32 device)
 	device >>= 16;
 	for (id = amd_nb_misc_ids; id->vendor; id++)
 		if (vendor == id->vendor && device == id->device)
-			return 1;
-	return 0;
+			return true;
+	return false;
 }
 
 int amd_get_subcaches(int cpu)
@@ -176,9 +178,9 @@ int amd_set_subcaches(int cpu, int mask)
 	return 0;
 }
 
-int amd_cache_gart(void)
+static int amd_cache_gart(void)
 {
-	int i;
+	u16 i;
 
 	if (!amd_nb_has_feature(AMD_NB_GART))
 		return 0;
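early_is_amd_nb() above matches a packed vendor/device word against the amd_nb_misc_ids[] table. A self-contained sketch of the same lookup, with made-up device IDs standing in for the real PCI_DEVICE_ID_* constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct id { uint16_t vendor, device; };

/* table terminated by a zero vendor, mirroring amd_nb_misc_ids[] */
static const struct id ids[] = {
	{ 0x1022, 0x1103 },	/* hypothetical NB function 3 */
	{ 0x1022, 0x1203 },	/* hypothetical NB function 3 */
	{ 0, 0 }
};

/* device id in the high 16 bits, vendor in the low 16, as above */
static bool is_listed(uint32_t id_word)
{
	uint16_t vendor = id_word & 0xffff;
	uint16_t device = id_word >> 16;
	const struct id *id;

	for (id = ids; id->vendor; id++)
		if (vendor == id->vendor && device == id->device)
			return true;
	return false;
}

int main(void)
{
	printf("%d %d\n", is_listed(0x11031022), is_listed(0x99991022));
	return 0;
}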
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 7b1e8e10b89c..86d1ad4962a7 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -73,7 +73,7 @@ static u32 __init allocate_aperture(void)
 	/*
 	 * using 512M as goal, in case kexec will load kernel_big
 	 * that will do the on position decompress, and could overlap with
-	 * that positon with gart that is used.
+	 * that position with gart that is used.
 	 * sequende:
 	 * kernel_small
 	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4b5ebd26f565..180ca240e03c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1886,7 +1886,7 @@ void disable_IO_APIC(void)
 	 *
 	 * With interrupt-remapping, for now we will use virtual wire A mode,
 	 * as virtual wire B is little complex (need to configure both
-	 * IOAPIC RTE aswell as interrupt-remapping table entry).
+	 * IOAPIC RTE as well as interrupt-remapping table entry).
 	 * As this gets called during crash dump, keep this simple for now.
 	 */
 	if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2905,7 +2905,7 @@ void __init setup_IO_APIC(void)
 }
 
 /*
- * Called after all the initialization is done. If we didnt find any
+ * Called after all the initialization is done. If we didn't find any
  * APIC bugs then we can allow the modify fast path
  */
 
@@ -3983,7 +3983,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 static __init int bad_ioapic(unsigned long address)
 {
 	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
+		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
 			"(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
 		return 1;
 	}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a746..0b4be431c620 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
  *    1.5: Fix segment register reloading (in case of bad segments saved
  *         across BIOS call).
  *         Stephen Rothwell
- *    1.6: Cope with complier/assembler differences.
+ *    1.6: Cope with compiler/assembler differences.
  *         Only try to turn off the first display device.
  *         Fix OOPS at power off with no APM BIOS by Jan Echternach
  *         <echter@informatik.uni-rostock.de>
@@ -227,6 +227,7 @@
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
+#include <linux/acpi.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -975,20 +976,10 @@ recalc:
 
 static void apm_power_off(void)
 {
-	unsigned char po_bios_call[] = {
-		0xb8, 0x00, 0x10,	/* movw  $0x1000,ax  */
-		0x8e, 0xd0,		/* movw  ax,ss       */
-		0xbc, 0x00, 0xf0,	/* movw  $0xf000,sp  */
-		0xb8, 0x07, 0x53,	/* movw  $0x5307,ax  */
-		0xbb, 0x01, 0x00,	/* movw  $0x0001,bx  */
-		0xb9, 0x03, 0x00,	/* movw  $0x0003,cx  */
-		0xcd, 0x15		/* int   $0x15       */
-	};
-
 	/* Some bioses don't like being called from CPU != 0 */
 	if (apm_info.realmode_power_off) {
 		set_cpus_allowed_ptr(current, cpumask_of(0));
-		machine_real_restart(po_bios_call, sizeof(po_bios_call));
+		machine_real_restart(MRR_APM);
 	} else {
 		(void)set_system_power_state(APM_STATE_OFF);
 	}
@@ -2331,12 +2322,11 @@ static int __init apm_init(void)
 		apm_info.disabled = 1;
 		return -ENODEV;
 	}
-	if (pm_flags & PM_ACPI) {
+	if (!acpi_disabled) {
 		printk(KERN_NOTICE "apm: overridden by ACPI.\n");
 		apm_info.disabled = 1;
 		return -ENODEV;
 	}
-	pm_flags |= PM_APM;
 
 	/*
 	 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2428,7 +2418,6 @@ static void __exit apm_exit(void)
 		kthread_stop(kapmd_task);
 		kapmd_task = NULL;
 	}
-	pm_flags &= ~PM_APM;
 }
 
 module_init(apm_init);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f771ab6b49e9..3ecece0217ef 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -611,6 +611,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		}
 	}
 #endif
+
+	/* As a rule processors have APIC timer running in deep C states */
+	if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+		set_cpu_cap(c, X86_FEATURE_ARAT);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 03162dac6271..cf48cdd6907d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -444,7 +444,7 @@ static int __cpuinit longhaul_get_ranges(void)
 		return -EINVAL;
 	}
 	/* Get max multiplier - as we always did.
-	 * Longhaul MSR is usefull only when voltage scaling is enabled.
+	 * Longhaul MSR is useful only when voltage scaling is enabled.
 	 * C3 is booting at max anyway. */
 	maxmult = mult;
 	/* Get min multiplier */
@@ -1011,7 +1011,7 @@ static void __exit longhaul_exit(void)
  * trigger frequency transition in some cases. */
 module_param(disable_acpi_c3, int, 0644);
 MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage with frequency. Very usefull to save
+/* Change CPU voltage with frequency. Very useful to save
  * power, but most VIA C3 processors aren't supporting it. */
 module_param(scale_voltage, int, 0644);
 MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 4a5a42b842ad..755a31e0f5b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
 
 	input.count = 4;
 	input.pointer = in_params;
-	input.count = 4;
-	input.pointer = in_params;
 	in_params[0].type		= ACPI_TYPE_BUFFER;
 	in_params[0].buffer.length	= 16;
 	in_params[0].buffer.pointer	= OSC_UUID;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index c567dec854f6..2368e38327b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data)
 				data->powernow_table[j].frequency/1000);
 		} else {
 			printk(KERN_INFO PFX
-				"   %d : fid 0x%x (%d MHz), vid 0x%x\n",
-				j,
+				"fid 0x%x (%d MHz), vid 0x%x\n",
 				data->powernow_table[j].index & 0xff,
 				data->powernow_table[j].frequency/1000,
 				data->powernow_table[j].index >> 8);
@@ -1276,7 +1275,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
 	if (powernow_k8_cpu_init_acpi(data)) {
 		/*
-		 * Use the PSB BIOS structure. This is only availabe on
+		 * Use the PSB BIOS structure. This is only available on
 		 * an UP version, and is deprecated by AMD.
 		 */
 		if (num_online_cpus() != 1) {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8abd869baabf..91bc25b67bc1 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 
 	result = speedstep_smi_ownership();
 	if (result) {
-		dprintk("fails in aquiring ownership of a SMI interface.\n");
+		dprintk("fails in acquiring ownership of a SMI interface.\n");
 		return -EINVAL;
 	}
 
@@ -360,7 +360,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
 	int result = speedstep_smi_ownership();
 
 	if (result)
-		dprintk("fails in re-aquiring ownership of a SMI interface.\n");
+		dprintk("fails in re-acquiring ownership of a SMI interface.\n");
 
 	return result;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
 ssize_t apei_read_mce(struct mce *m, u64 *record_id)
 {
 	struct cper_mce_record rcd;
-	ssize_t len;
+	int rc, pos;
 
-	len = erst_read_next(&rcd.hdr, sizeof(rcd));
-	if (len <= 0)
-		return len;
-	/* Can not skip other records in storage via ERST unless clear them */
-	else if (len != sizeof(rcd) ||
-		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
-		if (printk_ratelimit())
-			pr_warning(
-			"MCE-APEI: Can not skip the unknown record in ERST");
-		return -EIO;
-	}
-
+	rc = erst_get_record_id_begin(&pos);
+	if (rc)
+		return rc;
+retry:
+	rc = erst_get_record_id_next(&pos, record_id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+	rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry;
+	else if (rc < 0)
+		goto out;
+	/* try to skip other type records in storage */
+	else if (rc != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+		goto retry;
 	memcpy(m, &rcd.mce, sizeof(*m));
-	*record_id = rcd.hdr.record_id;
+	rc = sizeof(*m);
+out:
+	erst_get_record_id_end();
 
-	return sizeof(*m);
+	return rc;
 }
 
 /* Check whether there is record in ERST */
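The rewritten apei_read_mce() walks record ids and restarts whenever a record vanishes underneath it (erst_read() returning -ENOENT) or turns out to belong to another creator. The same skip-and-retry pattern, reduced to a toy in-memory record store (all names here are illustrative, not kernel APIs):

#include <errno.h>
#include <stdio.h>

/* toy record store standing in for ERST: 0 marks a cleared slot */
static const int store[] = { 0, 41, 0, 42, 43 };
#define NRECS (int)(sizeof(store) / sizeof(store[0]))

static int next_id(int *pos)
{
	return *pos < NRECS ? (*pos)++ : -1;	/* -1: no more records */
}

static int read_rec(int id, int *out)
{
	if (store[id] == 0)
		return -ENOENT;	/* concurrently cleared */
	*out = store[id];
	return 0;
}

static int read_first(int *pos, int *out)
{
	int id, rc;
retry:
	id = next_id(pos);
	if (id < 0)
		return -ENOENT;
	rc = read_rec(id, out);
	if (rc == -ENOENT)	/* skip cleared slots, like the code above */
		goto retry;
	return rc;
}

int main(void)
{
	int pos = 0, v;

	while (read_first(&pos, &v) == 0)
		printf("%d\n", v);	/* prints 41 42 43 */
	return 0;
}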
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a77971979564..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -32,7 +32,7 @@ static void inject_mce(struct mce *m)
 {
 	struct mce *i = &per_cpu(injectm, m->extcpu);
 
-	/* Make sure noone reads partially written injectm */
+	/* Make sure no one reads partially written injectm */
 	i->finished = 0;
 	mb();
 	m->finished = 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183b7f9c..ab1122998dba 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -881,7 +881,7 @@ reset:
  * Check if the address reported by the CPU is in a format we can parse.
  * It would be possible to add code for most other cases, but all would
  * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses upto page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
  */
 static int mce_usable_address(struct mce *m)
 {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9f27228ceffd..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
 /*
  * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
- * because MTRRs can span upto 40 bits (36bits on most modern x86)
+ * because MTRRs can span up to 40 bits (36bits on most modern x86)
  */
 #define DEBUG
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index eb00677ee2ae..eed3673a8656 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1114,7 +1114,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 
 	/*
 	 * If group events scheduling transaction was started,
-	 * skip the schedulability test here, it will be peformed
+	 * skip the schedulability test here, it will be performed
 	 * at commit time (->commit_txn) as a whole
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
@@ -1795,7 +1795,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	perf_callchain_store(entry, regs->ip);
 
-	dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
+	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index d3d7b59841e5..c2520e178d32 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
 /*
- * Netburst Perfomance Events (P4, old Xeon)
+ * Netburst Performance Events (P4, old Xeon)
  *
  * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
  * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -679,7 +679,7 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 */
 
 	/*
-	 * if an event is shared accross the logical threads
+	 * if an event is shared across the logical threads
 	 * the user needs special permissions to be able to use it
 	 */
 	if (p4_ht_active() && p4_event_bind_map[v].shared) {
@@ -791,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
 	 *
 	 * It's still allowed that two threads setup same cache
	 * events so we can't simply clear metrics until we knew
-	 * noone is depending on us, so we need kind of counter
+	 * no one is depending on us, so we need kind of counter
 	 * for "ReplayEvent" users.
 	 *
 	 * What is more complex -- RAW events, if user (for some
 	 * reason) will pass some cache event metric with improper
 	 * event opcode -- it's fine from hardware point of view
-	 * but completely nonsence from "meaning" of such action.
+	 * but completely nonsense from "meaning" of such action.
 	 *
 	 * So at moment let leave metrics turned on forever -- it's
 	 * ok for now but need to be revisited!
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
 }
 
 /*
- * While checking the dmi string infomation, just checking the product
+ * While checking the dmi string information, just checking the product
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index d5cd13945d5a..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
 
 static void *kdump_buf_page;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 static inline bool is_crashed_pfn_valid(unsigned long pfn)
 {
 #ifndef CONFIG_X86_PAE
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 994828899e09..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7a8cebc9ff29..706a9fb46a58 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -65,12 +65,10 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 		return 0;
 	ret = ih->xlate(ih, intspec, intsize, &virq, &type);
 	if (ret)
-		return ret;
+		return 0;
 	if (type == IRQ_TYPE_NONE)
 		return virq;
-	/* set the mask if it is different from current */
-	if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
-		set_irq_type(virq, type);
+	irq_set_irq_type(virq, type);
 	return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 220a1c11cfde..e2a3f0606da4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pS\n", (void *) address,
+	printk(" [<%p>] %s%pB\n", (void *) address,
 			reliable ? "" : "? ", (void *) address);
 }
 
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
 
 void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, char *log_lvl)
+		unsigned long *stack, unsigned long bp, char *log_lvl)
 {
 	printk("%sCall Trace:\n", log_lvl);
-	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack)
+		unsigned long *stack, unsigned long bp)
 {
-	show_trace_log_lvl(task, regs, stack, "");
+	show_trace_log_lvl(task, regs, stack, bp, "");
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-	show_stack_log_lvl(task, NULL, sp, "");
+	show_stack_log_lvl(task, NULL, sp, 0, "");
 }
 
 /*
@@ -197,14 +197,16 @@ void show_stack(struct task_struct *task, unsigned long *sp)
  */
 void dump_stack(void)
 {
+	unsigned long bp;
 	unsigned long stack;
 
+	bp = stack_frame(current, NULL);
 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	show_trace(NULL, NULL, &stack);
+	show_trace(NULL, NULL, &stack, bp);
 }
 EXPORT_SYMBOL(dump_stack);
 
@@ -320,16 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
 static int __init kstack_setup(char *s)
 {
 	if (!s)
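The bp parameter threaded through show_trace_log_lvl() and dump_trace() above seeds a frame-pointer walk: each stack frame begins with the saved caller frame pointer, followed by the return address. A minimal user-space sketch of such a walk, assuming the binary is compiled with frame pointers (-O0 or -fno-omit-frame-pointer) and using the GCC __builtin_frame_address() builtin; a real unwinder needs the bounds checks the kernel code performs:

#include <stdio.h>

struct frame {
	struct frame *next;	/* saved caller frame pointer */
	void *ret;		/* return address */
};

static void backtrace_here(void)
{
	struct frame *f = __builtin_frame_address(0);
	int depth;

	/* cap the depth: the topmost frame link may be garbage */
	for (depth = 0; f && depth < 16; f = f->next, depth++)
		printf(" [<%p>]\n", f->ret);
}

int main(void)
{
	backtrace_here();
	return 0;
}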
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 74cc1eda384b..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,12 +17,11 @@
 #include <asm/stacktrace.h>
 
 
-void dump_trace(struct task_struct *task,
-		struct pt_regs *regs, unsigned long *stack,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
 	int graph = 0;
-	unsigned long bp;
 
 	if (!task)
 		task = current;
@@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task,
 		stack = (unsigned long *)task->thread.sp;
 	}
 
-	bp = stack_frame(task, regs);
+	if (!bp)
+		bp = stack_frame(task, regs);
+
 	for (;;) {
 		struct thread_info *context;
 
@@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, char *log_lvl)
+		   unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;
@@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		touch_nmi_watchdog();
 	}
 	printk(KERN_CONT "\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 
@@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs)
 	u8 *ip;
 
 	printk(KERN_EMERG "Stack:\n");
-	show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
+	show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
 	printk(KERN_EMERG "Code: ");
 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a6b6fcf7f0ae..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-void dump_trace(struct task_struct *task,
-		struct pt_regs *regs, unsigned long *stack,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
@@ -150,7 +150,6 @@ void dump_trace(struct task_struct *task,
 	struct thread_info *tinfo;
 	int graph = 0;
 	unsigned long dummy;
-	unsigned long bp;
 
 	if (!task)
 		task = current;
@@ -161,7 +160,8 @@ void dump_trace(struct task_struct *task,
 		stack = (unsigned long *)task->thread.sp;
 	}
 
-	bp = stack_frame(task, regs);
+	if (!bp)
+		bp = stack_frame(task, regs);
 	/*
 	 * Print function call entries in all stacks, starting at the
 	 * current stack address. If the stacks consist of nested
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, char *log_lvl)
+		   unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *irq_stack_end;
 	unsigned long *irq_stack;
@@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	preempt_enable();
 
 	printk(KERN_CONT "\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 void show_registers(struct pt_regs *regs)
@@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs)
 
 	printk(KERN_EMERG "Stack:\n");
 	show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-			   KERN_EMERG);
+			   0, KERN_EMERG);
 
 	printk(KERN_EMERG "Code: ");
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cdf5bfd9d4d5..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/crash_dump.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9efbdcc56425..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -159,7 +159,12 @@ static void __init ati_bugs_contd(int num, int slot, int func)
 	if (rev >= 0x40)
 		acpi_fix_pin2_polarity = 1;
 
-	if (rev > 0x13)
+	/*
+	 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+	 * SB700: revisions 0x39, 0x3a, ...
+	 * SB800: revisions 0x40, 0x41, ...
+	 */
+	if (rev >= 0x39)
 		return;
 
 	if (acpi_use_timer_override)
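The quirk above now gates on the revision ranges listed in the new comment: the timer-override fixups are only applied on SB600 parts, i.e. revisions below the first SB700 revision 0x39. A trivial sketch of that gating, with the cutoff taken from the comment (illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* revisions below 0x39 are SB600, per the comment above */
static bool needs_timer_override_quirk(unsigned int rev)
{
	return rev < 0x39;
}

int main(void)
{
	printf("rev 0x13: %d\n", needs_timer_override_quirk(0x13)); /* 1 */
	printf("rev 0x39: %d\n", needs_timer_override_quirk(0x39)); /* 0 */
	printf("rev 0x40: %d\n", needs_timer_override_quirk(0x40)); /* 0 */
	return 0;
}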
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index fa41f7298c84..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1414,7 +1414,7 @@ ENTRY(async_page_fault)
 	pushl_cfi $do_async_page_fault
 	jmp error_code
 	CFI_ENDPROC
-END(apf_page_fault)
+END(async_page_fault)
 #endif
 
 /*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b72b4a6466a9..8a445a0c989e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
  * A note on terminology:
  * - top of stack: Architecture defined interrupt frame from SS to RIP
  * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers upto R11.
+ * - partial stack frame: partially saved registers up to R11.
  * - full stack frame: Like partial stack frame, but all register saved.
  *
  * Some macro usage:
@@ -422,7 +422,7 @@ ENTRY(ret_from_fork)
 END(ret_from_fork)
 
 /*
- * System call entry. Upto 6 arguments in registers are supported.
+ * System call entry. Up to 6 arguments in registers are supported.
  *
  * SYSCALL does not save anything on the stack and does not change the
  * stack pointer.
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 7f138b3c3c52..d6d6bb361931 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,15 +34,6 @@ void __init i386_start_kernel(void)
 {
 	memblock_init();
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
-#endif
-
 	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* Make NULL pointers segfault */
 	zap_identity_mappings();
 
-	/* Cleanup the over mapped high alias */
-	cleanup_highmap();
-
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-#ifdef CONFIG_X86_TRAMPOLINE
+	/* Fixup trampoline */
 	addq	%rbp, trampoline_level4_pgt + 0(%rip)
 	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
-#endif
 
 	/* Due to ENTRY(), sometimes the empty space gets filled with
 	 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e60c38cc0eed..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  * The _current_ task is using the FPU for the first time
  * so initialize it and set the mxcsr to its default
  * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
+ * remember the current task has used the FPU.
  */
 int init_fpu(struct task_struct *tsk)
 {
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9974d21048fd..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -172,7 +172,7 @@ asmlinkage void do_softirq(void)
 
 		call_on_stack(__do_softirq, isp);
 		/*
-		 * Shouldnt happen, we returned above if in_interrupt():
+		 * Shouldn't happen, we returned above if in_interrupt():
 		 */
 		WARN_ON_ONCE(softirq_count());
 	}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7c64c420a9f6..dba0b36941a5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno)
 		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
 		if (dbg_release_bp_slot(*pevent))
 			/*
-			 * The debugger is responisble for handing the retry on
+			 * The debugger is responsible for handling the retry on
 			 * remove failure.
 			 */
 			return -1;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 	native_smp_prepare_boot_cpu();
 }
 
-static void kvm_guest_cpu_online(void *dummy)
+static void __cpuinit kvm_guest_cpu_online(void *dummy)
 {
 	kvm_guest_cpu_init();
 }
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
 	/*
 	 * WARNING: Be careful when making changes here. Putting an adapter
 	 * and the motherboard simultaneously into setup mode may result in
-	 * damage to chips (according to The Indispensible PC Hardware Book
+	 * damage to chips (according to The Indispensable PC Hardware Book
 	 * by Hans-Peter Messmer). Also, we disable system interrupts (so
 	 * that we are not disturbed in the middle of this).
 	 */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 01b0f6d06451..5a532ce646bf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
 		*nr_m_spare += 1;
 	}
 }
-#else /* CONFIG_X86_IO_APIC */
-static
-inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
-#endif /* CONFIG_X86_IO_APIC */
 
 static int
 check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
@@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 
 	return ret;
 }
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
 
 static int __init replace_intsrc_all(struct mpc_table *mpc,
 				     unsigned long mpc_new_phys,
@@ -883,7 +883,7 @@ static int __init update_mp_table(void)
 
 	if (!mpc_new_phys) {
 		unsigned char old, new;
-		/* check if we can change the postion */
+		/* check if we can change the position */
 		mpc->checksum = 0;
 		old = mpf_checksum((unsigned char *)mpc, mpc->length);
 		mpc->checksum = 0xff;
@@ -892,7 +892,7 @@ static int __init update_mp_table(void)
 			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
 			return 0;
 		}
-		printk(KERN_INFO "use in-positon replacing\n");
+		printk(KERN_INFO "use in-position replacing\n");
 	} else {
 		mpf->physptr = mpc_new_phys;
 		mpc_new = phys_to_virt(mpc_new_phys);
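The mpf_checksum() calls above rely on the MP-table convention that a valid table's bytes, including the checksum byte itself, sum to zero mod 256. A self-contained sketch of that idea (the 16-byte layout and checksum offset are assumptions for illustration, loosely modelled on struct mpf_intel):

#include <stdio.h>

static unsigned char mp_checksum(const unsigned char *p, int len)
{
	unsigned char sum = 0;

	while (len--)
		sum += *p++;
	return sum;	/* 0 means the table checksums correctly */
}

int main(void)
{
	unsigned char table[16] = { 0x5f, 0x4d, 0x50, 0x5f };	/* "_MP_" */

	/* patch the (assumed) checksum byte so the table sums to zero */
	table[10] = -mp_checksum(table, sizeof(table));
	printf("sum=%u\n", mp_checksum(table, sizeof(table)));	/* 0 */
	return 0;
}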
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index f56a117cef68..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
 
 	if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
 		/*
-		 * FIXME: properly scan for devices accross the
+		 * FIXME: properly scan for devices across the
 		 * PCI-to-PCI bridge on every CalIOC2 port.
 		 */
 		return 1;
@@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
 
 /*
  * calgary_init_bitmap_from_tce_table():
- * Funtion for kdump case. In the second/kdump kernel initialize
+ * Function for kdump case. In the second/kdump kernel initialize
  * the bitmap based on the tce table entries obtained from first kernel
 */
 static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 99fa3adf0141..d46cbe46b7ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -87,7 +87,7 @@ void exit_thread(void)
 void show_regs(struct pt_regs *regs)
 {
 	show_registers(regs);
-	show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
+	show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
 }
 
 void show_regs_common(void)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bd387e8f73b4..6c9dd922ac0d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -501,6 +501,10 @@ void set_personality_64bit(void)
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
 
+	/* Ensure the corresponding mm is not marked. */
+	if (current->mm)
+		current->mm->context.ia32_compat = 0;
+
 	/* TBD: overwrites user setup. Should have two bits.
 	   But 64bit processes have always behaved this way,
 	   so it's not too bad. The main problem is just that
@@ -516,6 +520,10 @@ void set_personality_ia32(void)
 	set_thread_flag(TIF_IA32);
 	current->personality |= force_personality32;
 
+	/* Mark the associated mm as containing 32-bit tasks. */
+	if (current->mm)
+		current->mm->context.ia32_compat = 1;
+
 	/* Prepare the first "return" to user space */
 	current_thread_info()->status |= TS_COMPAT;
 }
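The ia32_compat flag above is recorded on the shared mm rather than only in per-thread state, so any code that holds a reference to the address space (not a particular thread) can test it. A toy illustration of that design choice, with made-up structures:

#include <stdio.h>

/* address space shared by all threads of a process */
struct mm {
	int ia32_compat;
};

/* per-thread state; every thread points at the same mm */
struct task {
	struct mm *mm;
};

static void set_personality_ia32_like(struct task *t)
{
	if (t->mm)
		t->mm->ia32_compat = 1;	/* visible to all threads */
}

int main(void)
{
	struct mm m = { 0 };
	struct task t1 = { &m }, t2 = { &m };

	set_personality_ia32_like(&t1);
	printf("seen from t2: %d\n", t2.mm->ia32_compat);	/* 1 */
	return 0;
}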
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 715037caeb43..d3ce37edb54d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -303,68 +303,16 @@ static int __init reboot_init(void)
 }
 core_initcall(reboot_init);
 
-/* The following code and data reboots the machine by switching to real
-   mode and jumping to the BIOS reset entry point, as if the CPU has
-   really been reset.  The previous version asked the keyboard
-   controller to pulse the CPU reset line, which is more thorough, but
-   doesn't work with at least one type of 486 motherboard.  It is easy
-   to stop this code working; hence the copious comments. */
-static const unsigned long long
-real_mode_gdt_entries [3] =
-{
-	0x0000000000000000ULL,	/* Null descriptor */
-	0x00009b000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
-	0x000093000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
-};
-
-static const struct desc_ptr
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, 0 };
-
-/* This is 16-bit protected mode code to disable paging and the cache,
-   switch to real mode and jump to the BIOS reset code.
-
-   The instruction that switches to real mode by writing to CR0 must be
-   followed immediately by a far jump instruction, which set CS to a
-   valid value for real mode, and flushes the prefetch queue to avoid
-   running instructions that have already been decoded in protected
-   mode.
-
-   Clears all the flags except ET, especially PG (paging), PE
-   (protected-mode enable) and TS (task switch for coprocessor state
-   save).  Flushes the TLB after paging has been disabled.  Sets CD and
-   NW, to disable the cache on a 486, and invalidates the cache.  This
-   is more like the state of a 486 after reset.  I don't know if
-   something else should be done for other chips.
-
-   More could be done here to set up the registers as if a CPU reset had
-   occurred; hopefully real BIOSs don't assume much. */
-static const unsigned char real_mode_switch [] =
-{
-	0x66, 0x0f, 0x20, 0xc0,			/* movl %cr0,%eax        */
-	0x66, 0x83, 0xe0, 0x11,			/* andl $0x00000011,%eax */
-	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/* orl  $0x60000000,%eax */
-	0x66, 0x0f, 0x22, 0xc0,			/* movl %eax,%cr0        */
-	0x66, 0x0f, 0x22, 0xd8,			/* movl %eax,%cr3        */
-	0x66, 0x0f, 0x20, 0xc3,			/* movl %cr0,%ebx        */
-	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/* andl $0x60000000,%ebx */
-	0x74, 0x02,				/* jz   f                */
-	0x0f, 0x09,				/* wbinvd                */
-	0x24, 0x10,				/* f: andb $0x10,al      */
-	0x66, 0x0f, 0x22, 0xc0			/* movl %eax,%cr0        */
-};
-static const unsigned char jump_to_bios [] =
-{
-	0xea, 0x00, 0x00, 0xff, 0xff		/* ljmp $0xffff,$0x0000 */
-};
-
-/*
- * Switch to real mode and then execute the code
- * specified by the code and length parameters.
- * We assume that length will aways be less that 100!
- */
-void machine_real_restart(const unsigned char *code, int length)
-{
+extern const unsigned char machine_real_restart_asm[];
+extern const u64 machine_real_restart_gdt[3];
+
+void machine_real_restart(unsigned int type)
+{
+	void *restart_va;
+	unsigned long restart_pa;
+	void (*restart_lowmem)(unsigned int);
+	u64 *lowmem_gdt;
+
 	local_irq_disable();
 
 	/* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -392,41 +340,23 @@ void machine_real_restart(const unsigned char *code, int length)
 	   too. */
 	*((unsigned short *)0x472) = reboot_mode;
 
-	/* For the switch to real mode, copy some code to low memory.  It has
-	   to be in the first 64k because it is running in 16-bit mode, and it
-	   has to have the same physical and virtual address, because it turns
-	   off paging.  Copy it near the end of the first page, out of the way
-	   of BIOS variables. */
-	memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
-		real_mode_switch, sizeof (real_mode_switch));
-	memcpy((void *)(0x1000 - 100), code, length);
-
-	/* Set up the IDT for real mode. */
-	load_idt(&real_mode_idt);
-
-	/* Set up a GDT from which we can load segment descriptors for real
-	   mode.  The GDT is not used in real mode; it is just needed here to
-	   prepare the descriptors. */
-	load_gdt(&real_mode_gdt);
-
-	/* Load the data segment registers, and thus the descriptors ready for
-	   real mode.  The base address of each segment is 0x100, 16 times the
-	   selector value being loaded here.  This is so that the segment
-	   registers don't have to be reloaded after switching to real mode:
-	   the values are consistent for real mode operation already. */
-	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
-				"\tmovl %%eax,%%ds\n"
-				"\tmovl %%eax,%%es\n"
-				"\tmovl %%eax,%%fs\n"
-				"\tmovl %%eax,%%gs\n"
-				"\tmovl %%eax,%%ss" : : : "eax");
-
-	/* Jump to the 16-bit code that we copied earlier.  It disables paging
-	   and the cache, switches to real mode, and jumps to the BIOS reset
-	   entry point. */
-	__asm__ __volatile__ ("ljmp $0x0008,%0"
-				:
-				: "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
+	/* Patch the GDT in the low memory trampoline */
+	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
+
+	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
+	restart_pa = virt_to_phys(restart_va);
+	restart_lowmem = (void (*)(unsigned int))restart_pa;
+
+	/* GDT[0]: GDT self-pointer */
+	lowmem_gdt[0] =
+		(u64)(sizeof(machine_real_restart_gdt) - 1) +
+		((u64)virt_to_phys(lowmem_gdt) << 16);
+	/* GDT[1]: 64K real mode code segment */
+	lowmem_gdt[1] =
+		GDT_ENTRY(0x009b, restart_pa, 0xffff);
+
+	/* Jump to the identity-mapped low memory code */
+	restart_lowmem(type);
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(machine_real_restart);
@@ -581,7 +511,7 @@ static void native_machine_emergency_restart(void)
 
 #ifdef CONFIG_X86_32
 	case BOOT_BIOS:
-		machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
+		machine_real_restart(MRR_BIOS);
 
 		reboot_type = BOOT_KBD;
 		break;
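machine_real_restart() above patches GDT[1] with GDT_ENTRY(0x009b, restart_pa, 0xffff): a 16-bit code segment whose base is the trampoline's physical address. A user-space sketch of how an x86 segment descriptor packs base, limit and flags into 64 bits, following the same bit layout as the kernel's GDT_ENTRY() macro in <asm/segment.h>:

#include <stdint.h>
#include <stdio.h>

/* base[31:24] | flags | limit[19:16] | base[23:0] | limit[15:0] */
static uint64_t gdt_entry(uint32_t flags, uint32_t base, uint32_t limit)
{
	return (((uint64_t)base  & 0xff000000) << (56 - 24)) |
	       (((uint64_t)flags & 0x0000f0ff) << 40)        |
	       (((uint64_t)limit & 0x000f0000) << (48 - 16)) |
	       (((uint64_t)base  & 0x00ffffff) << 16)        |
	        ((uint64_t)limit & 0x0000ffff);
}

int main(void)
{
	/* hypothetical trampoline at physical 0x98000 */
	printf("%#018llx\n",
	       (unsigned long long)gdt_entry(0x009b, 0x98000, 0xffff));
	return 0;
}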
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..29092b38d816
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $1b, %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw dispatch_table(%ebx,%eax,2),%ax
32 movw %ax, 101f(%ebx)
33 movw %cx, 102f(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl machine_real_restart_idt(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl machine_real_restart_gdt(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
 61 * followed immediately by a far jump instruction, which sets CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
113	/* These must match <asm/reboot.h> */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
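
The CR0 sequence at label 1 above is easier to audit with the bits named. A close C transliteration, using the kernel's read_cr0()/write_cr0() accessors and the X86_CR0_* definitions from <asm/processor-flags.h> (the %cr3 TLB flush and the conditional wbinvd are noted as comments; this is a sketch, not the .S code):

    static void enter_real_mode_cr0(void)
    {
            unsigned long cr0 = read_cr0();

            cr0 &= X86_CR0_PE | X86_CR0_ET;   /* andl $0x00000011,%eax: PG off */
            cr0 |= X86_CR0_CD | X86_CR0_NW;   /* orl  $0x60000000,%eax: cache off */
            write_cr0(cr0);
            /* movl %ecx,%cr3 then flushes the TLB; wbinvd runs only when
             * CD/NW actually stuck, i.e. not on a cacheless 386 */
            cr0 &= ~(unsigned long)X86_CR0_PE; /* andb $0x10,%al: keep ET only */
            write_cr0(cr0);                   /* the CPU is now in real mode */
    }
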
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b176f2b1f45d..5a0484a95ad6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -294,30 +294,11 @@ static void __init init_gbpages(void)
294 else 294 else
295 direct_gbpages = 0; 295 direct_gbpages = 0;
296} 296}
297
298static void __init cleanup_highmap_brk_end(void)
299{
300 pud_t *pud;
301 pmd_t *pmd;
302
303 mmu_cr4_features = read_cr4();
304
305 /*
306 * _brk_end cannot change anymore, but it and _end may be
307 * located on different 2M pages. cleanup_highmap(), however,
308 * can only consider _end when it runs, so destroy any
309 * mappings beyond _brk_end here.
310 */
311 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
312 pmd = pmd_offset(pud, _brk_end - 1);
313 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
314 pmd_clear(pmd);
315}
316#else 297#else
317static inline void init_gbpages(void) 298static inline void init_gbpages(void)
318{ 299{
319} 300}
320static inline void cleanup_highmap_brk_end(void) 301static void __init cleanup_highmap(void)
321{ 302{
322} 303}
323#endif 304#endif
@@ -330,8 +311,6 @@ static void __init reserve_brk(void)
330 /* Mark brk area as locked down and no longer taking any 311 /* Mark brk area as locked down and no longer taking any
331 new allocations */ 312 new allocations */
332 _brk_start = 0; 313 _brk_start = 0;
333
334 cleanup_highmap_brk_end();
335} 314}
336 315
337#ifdef CONFIG_BLK_DEV_INITRD 316#ifdef CONFIG_BLK_DEV_INITRD
@@ -640,28 +619,6 @@ void __init reserve_standard_io_resources(void)
640 619
641} 620}
642 621
643/*
644 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
645 * is_kdump_kernel() to determine if we are booting after a panic. Hence
646 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
647 */
648
649#ifdef CONFIG_CRASH_DUMP
650/* elfcorehdr= specifies the location of elf core header
651 * stored by the crashed kernel. This option will be passed
652 * by kexec loader to the capture kernel.
653 */
654static int __init setup_elfcorehdr(char *arg)
655{
656 char *end;
657 if (!arg)
658 return -EINVAL;
659 elfcorehdr_addr = memparse(arg, &end);
660 return end > arg ? 0 : -EINVAL;
661}
662early_param("elfcorehdr", setup_elfcorehdr);
663#endif
664
665static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
666{ 623{
667 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -950,6 +907,8 @@ void __init setup_arch(char **cmdline_p)
950 */ 907 */
951 reserve_brk(); 908 reserve_brk();
952 909
910 cleanup_highmap();
911
953 memblock.current_limit = get_max_mapped(); 912 memblock.current_limit = get_max_mapped();
954 memblock_x86_fill(); 913 memblock_x86_fill();
955 914
@@ -963,15 +922,8 @@ void __init setup_arch(char **cmdline_p)
963 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 922 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
964 max_pfn_mapped<<PAGE_SHIFT); 923 max_pfn_mapped<<PAGE_SHIFT);
965 924
966 reserve_trampoline_memory(); 925 setup_trampolines();
967 926
968#ifdef CONFIG_ACPI_SLEEP
969 /*
970 * Reserve low memory region for sleep support.
971 * even before init_memory_mapping
972 */
973 acpi_reserve_wakeup_memory();
974#endif
975 init_gbpages(); 927 init_gbpages();
976 928
977 /* max_pfn_mapped is updated here */ 929 /* max_pfn_mapped is updated here */
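
The setup_elfcorehdr() handler removed above is the canonical shape of an early_param hook: memparse() consumes "<value>[KMG]" and advances *end past any suffix, so "end > arg" distinguishes a parsed value from an empty string. The same pattern for an arbitrary parameter, as a sketch (the parameter name and destination variable are hypothetical):

    #include <linux/init.h>
    #include <linux/kernel.h>               /* memparse() */

    static unsigned long long example_addr; /* hypothetical destination */

    static int __init setup_example_addr(char *arg)
    {
            char *end;

            if (!arg)                       /* "example_addr=" given bare */
                    return -EINVAL;
            example_addr = memparse(arg, &end);
            return end > arg ? 0 : -EINVAL; /* fail if nothing was consumed */
    }
    early_param("example_addr", setup_example_addr);
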
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e9efdfd51c8d..c2871d3c71b6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -711,7 +711,7 @@ do_rest:
711 stack_start = c_idle.idle->thread.sp; 711 stack_start = c_idle.idle->thread.sp;
712 712
713 /* start_ip had better be page-aligned! */ 713 /* start_ip had better be page-aligned! */
714 start_ip = setup_trampoline(); 714 start_ip = trampoline_address();
715 715
716 /* So we see what's up */ 716 /* So we see what's up */
717 announce_cpu(cpu, apicid); 717 announce_cpu(cpu, apicid);
@@ -721,6 +721,8 @@ do_rest:
721 * the targeted processor. 721 * the targeted processor.
722 */ 722 */
723 723
724 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
725
724 atomic_set(&init_deasserted, 0); 726 atomic_set(&init_deasserted, 0);
725 727
726 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 728 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -774,8 +776,8 @@ do_rest:
774 pr_debug("CPU%d: has booted.\n", cpu); 776 pr_debug("CPU%d: has booted.\n", cpu);
775 else { 777 else {
776 boot_error = 1; 778 boot_error = 1;
777 if (*((volatile unsigned char *)trampoline_base) 779 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
778 == 0xA5) 780 == 0xA5A5A5A5)
779 /* trampoline started but...? */ 781 /* trampoline started but...? */
780 pr_err("CPU%d: Stuck ??\n", cpu); 782 pr_err("CPU%d: Stuck ??\n", cpu);
781 else 783 else
@@ -801,7 +803,7 @@ do_rest:
801 } 803 }
802 804
803 /* mark "stuck" area as not stuck */ 805 /* mark "stuck" area as not stuck */
804 *((volatile unsigned long *)trampoline_base) = 0; 806 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
805 807
806 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 808 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
807 /* 809 /*
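
The boot handshake changes shape in these hunks: instead of reusing the first bytes of trampoline_data as an implicit flag, the AP writes the 32-bit marker 0xA5A5A5A5 into a dedicated trampoline_status word, and the boot CPU reaches it through TRAMPOLINE_SYM(), which (judging by the usage here) translates a link-time trampoline symbol into its relocated low-memory copy. The boot-CPU side in outline, as a sketch:

    static void check_trampoline_handshake(int cpu)
    {
            volatile u32 *status = TRAMPOLINE_SYM(trampoline_status);

            if (*status == 0xA5A5A5A5)
                    /* the AP ran the trampoline but never reported in */
                    pr_err("CPU%d: stuck in trampoline?\n", cpu);

            *status = 0;    /* reset the marker for the next boot attempt */
    }
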
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 938c8e10a19a..6515733a289d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,7 +73,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
@@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, regs, NULL, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 5f181742e8f9..abce34d5c79d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -343,3 +343,4 @@ ENTRY(sys_call_table)
343 .long sys_name_to_handle_at 343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at 344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime 345 .long sys_clock_adjtime
346 .long sys_syncfs
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a375616d77f7..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,39 +2,41 @@
2#include <linux/memblock.h> 2#include <linux/memblock.h>
3 3
4#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
5#include <asm/pgtable.h> 6#include <asm/pgtable.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 phys_addr_t mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == MEMBLOCK_ERROR) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base); 38
39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
40 return 0;
40} 41}
42arch_initcall(configure_trampolines);
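
The two-phase split above is deliberate: setup_trampolines() runs from setup_arch(), before the kernel page tables exist, so it can only reserve and copy; set_memory_x() edits PTEs in the direct mapping and therefore has to wait, which is what the arch_initcall() buys. A sketch of the permission step in isolation (buffer and helper name are stand-ins):

    #include <asm/cacheflush.h>     /* set_memory_x() */

    /* Mark an already-mapped buffer executable, page-granular; this
     * mirrors what configure_trampolines() does above. */
    static int __init make_buf_executable(void *buf, size_t size)
    {
            return set_memory_x((unsigned long)buf,
                                PAGE_ALIGN(size) >> PAGE_SHIFT);
    }
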
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 075d130efcf9..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -129,6 +128,7 @@ no_longmode:
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
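
The new fixup is idempotent where the old one was not: "addl %esi, vector" modifies the trampoline in place and is only correct if each copy is patched exactly once, whereas computing the absolute address with leal and storing it with movl can be repeated safely. The same arithmetic in C (parameters stand in for the .S symbols):

    #include <stdint.h>

    /* seg is the real-mode segment derived from %cs; each off is the
     * link-time "symbol - r_base" delta of startup_32, startup_64, tgdt. */
    static void fixup_trampoline_vectors(uint32_t seg,
                                         uint32_t *vec32, uint32_t off32,
                                         uint32_t *vec64, uint32_t off64,
                                         uint32_t *gdt_base, uint32_t offgdt)
    {
            uint32_t base = seg << 4;       /* shll $4, %esi */

            *vec32 = base + off32;          /* leal ...(%esi),%eax; movl */
            *vec64 = base + off64;
            *gdt_base = base + offgdt;      /* gdt pointer's base field */
    }
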
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ffe5755caa8b..9335bf7dd2e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
427 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
428 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
429 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
430 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
431 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
432 * then we discard the result and have another try. 432 * then we discard the result and have another try.
433 * 433 *
@@ -900,7 +900,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
900 * timer based, instead of loop based, we don't block the boot 900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done. 901 * process while this longer calibration is done.
902 * 902 *
903 * If there are any calibration anomolies (too many SMIs, etc), 903 * If there are any calibration anomalies (too many SMIs, etc),
904 * or the refined calibration is off by 1% of the fast early 904 * or the refined calibration is off by 1% of the fast early
905 * calibration, we throw out the new calibration and use the 905 * calibration, we throw out the new calibration and use the
906 * early calibration. 906 * early calibration.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 0edefc19a113..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -18,7 +18,7 @@
18 * This file is expected to run in 32bit code. Currently: 18 * This file is expected to run in 32bit code. Currently:
19 * 19 *
20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verfication 21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup 22 * arch/x86/kernel/head_32.S: processor startup
23 * 23 *
24 * verify_cpu, returns the status of longmode and SSE in register %eax. 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0381e1f3baed..624a2016198e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -231,7 +231,7 @@ SECTIONS
231 * output PHDR, so the next output section - .init.text - should 231 * output PHDR, so the next output section - .init.text - should
232 * start another segment - init. 232 * start another segment - init.
233 */ 233 */
234 PERCPU_VADDR(0, :percpu) 234 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
235#endif 235#endif
236 236
237 INIT_TEXT_SECTION(PAGE_SIZE) 237 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -241,6 +241,18 @@ SECTIONS
241 241
242 INIT_DATA_SECTION(16) 242 INIT_DATA_SECTION(16)
243 243
244 /*
245 * Code and data for a variety of lowlevel trampolines, to be
246 * copied into base memory (< 1 MiB) during initialization.
247 * Since it is copied early, the main copy can be discarded
248 * afterwards.
249 */
250 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
251 x86_trampoline_start = .;
252 *(.x86_trampoline)
253 x86_trampoline_end = .;
254 }
255
244 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 256 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
245 __x86_cpu_dev_start = .; 257 __x86_cpu_dev_start = .;
246 *(.x86_cpu_dev.init) 258 *(.x86_cpu_dev.init)
@@ -292,6 +304,7 @@ SECTIONS
292 *(.iommu_table) 304 *(.iommu_table)
293 __iommu_table_end = .; 305 __iommu_table_end = .;
294 } 306 }
307
295 . = ALIGN(8); 308 . = ALIGN(8);
296 /* 309 /*
297 * .exit.text is discard at runtime, not link time, to deal with 310 * .exit.text is discard at runtime, not link time, to deal with
@@ -306,7 +319,7 @@ SECTIONS
306 } 319 }
307 320
308#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
309 PERCPU(PAGE_SIZE) 322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
310#endif 323#endif
311 324
312 . = ALIGN(PAGE_SIZE); 325 . = ALIGN(PAGE_SIZE);
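
x86_trampoline_start and x86_trampoline_end are linker-defined symbols; on the C side they are conventionally declared as arrays so that subtracting them yields the section size computed in trampoline.c above. The assumed declarations, as a sketch:

    /* Presumed declarations matching the linker-script section: */
    extern const unsigned char x86_trampoline_start[];
    extern const unsigned char x86_trampoline_end[];

    static inline size_t trampoline_image_size(void)
    {
            return x86_trampoline_end - x86_trampoline_start;
    }
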
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 547128546cc3..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
53 53
54 /* 54 /*
55 * None of the feature bits are in init state. So nothing else 55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is upto date. 56 * to do for us, as the memory layout is up to date.
57 */ 57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask) 58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return; 59 return;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
78/* Misc flags */ 78/* Misc flags */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */
79#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
80#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
81#define Undefined (1<<25) /* No Such Instruction */ 82#define Undefined (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
877 if (selector & 1 << 2) { 878 if (selector & 1 << 2) {
878 struct desc_struct desc; 879 struct desc_struct desc;
879 memset (dt, 0, sizeof *dt); 880 memset (dt, 0, sizeof *dt);
880 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
882 ctxt->vcpu))
881 return; 883 return;
882 884
883 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
929 return ret; 931 return ret;
930} 932}
931 933
934/* Does not support long mode */
932static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 935static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
933 struct x86_emulate_ops *ops, 936 struct x86_emulate_ops *ops,
934 u16 selector, int seg) 937 u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1040 } 1043 }
1041load: 1044load:
1042 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1045 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1043 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1044 return X86EMUL_CONTINUE; 1047 return X86EMUL_CONTINUE;
1045exception: 1048exception:
1046 emulate_exception(ctxt, err_vec, err_code, true); 1049 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1560 struct desc_struct *ss) 1563 struct desc_struct *ss)
1561{ 1564{
1562 memset(cs, 0, sizeof(struct desc_struct)); 1565 memset(cs, 0, sizeof(struct desc_struct));
1563 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
1564 memset(ss, 0, sizeof(struct desc_struct)); 1567 memset(ss, 0, sizeof(struct desc_struct));
1565 1568
1566 cs->l = 0; /* will be adjusted later */ 1569 cs->l = 0; /* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1607 cs.d = 0; 1610 cs.d = 0;
1608 cs.l = 1; 1611 cs.l = 1;
1609 } 1612 }
1610 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1611 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1612 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1613 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1614 1617
1615 c->regs[VCPU_REGS_RCX] = c->eip; 1618 c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1679 cs.l = 1; 1682 cs.l = 1;
1680 } 1683 }
1681 1684
1682 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1683 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1684 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1685 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1686 1689
1687 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1736 cs_sel |= SELECTOR_RPL_MASK; 1739 cs_sel |= SELECTOR_RPL_MASK;
1737 ss_sel |= SELECTOR_RPL_MASK; 1740 ss_sel |= SELECTOR_RPL_MASK;
1738 1741
1739 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1740 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1741 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1742 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1743 1746
1744 c->eip = c->regs[VCPU_REGS_RDX]; 1747 c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1764 u16 port, u16 len) 1767 u16 port, u16 len)
1765{ 1768{
1766 struct desc_struct tr_seg; 1769 struct desc_struct tr_seg;
1770 u32 base3;
1767 int r; 1771 int r;
1768 u16 io_bitmap_ptr; 1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
1769 u8 perm, bit_idx = port & 0x7;
1770 unsigned mask = (1 << len) - 1; 1773 unsigned mask = (1 << len) - 1;
1774 unsigned long base;
1771 1775
1772 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
1773 if (!tr_seg.p) 1777 if (!tr_seg.p)
1774 return false; 1778 return false;
1775 if (desc_limit_scaled(&tr_seg) < 103) 1779 if (desc_limit_scaled(&tr_seg) < 103)
1776 return false; 1780 return false;
1777 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 1781 base = get_desc_base(&tr_seg);
1778 ctxt->vcpu, NULL); 1782#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32;
1784#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
1779 if (r != X86EMUL_CONTINUE) 1786 if (r != X86EMUL_CONTINUE)
1780 return false; 1787 return false;
1781 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1782 return false; 1789 return false;
1783 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
1784 &perm, 1, ctxt->vcpu, NULL); 1791 NULL);
1785 if (r != X86EMUL_CONTINUE) 1792 if (r != X86EMUL_CONTINUE)
1786 return false; 1793 return false;
1787 if ((perm >> bit_idx) & mask) 1794 if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2126 } 2133 }
2127 2134
2128 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2129 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
2130 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); 2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2131 2138
2132 if (has_error_code) { 2139 if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
2365 D(SrcMem16 | ModRM | Mov | Priv), 2372 D(SrcMem16 | ModRM | Mov | Priv),
2366 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2367}, { 2374}, {
2368 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), 2375 D(SrcNone | ModRM | Priv | VendorSpecific), N,
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific),
2369 D(SrcNone | ModRM | DstMem | Mov), N, 2377 D(SrcNone | ModRM | DstMem | Mov), N,
2370 D(SrcMem16 | ModRM | Mov | Priv), N, 2378 D(SrcMem16 | ModRM | Mov | Priv), N,
2371} }; 2379} };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
2489static struct opcode twobyte_table[256] = { 2497static struct opcode twobyte_table[256] = {
2490 /* 0x00 - 0x0F */ 2498 /* 0x00 - 0x0F */
2491 N, GD(0, &group7), N, N, 2499 N, GD(0, &group7), N, N,
2492 N, D(ImplicitOps), D(ImplicitOps | Priv), N, 2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
2493 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2494 N, D(ImplicitOps | ModRM), N, N, 2502 N, D(ImplicitOps | ModRM), N, N,
2495 /* 0x10 - 0x1F */ 2503 /* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
2502 /* 0x30 - 0x3F */ 2510 /* 0x30 - 0x3F */
2503 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2504 D(ImplicitOps | Priv), N, 2512 D(ImplicitOps | Priv), N,
2505 D(ImplicitOps), D(ImplicitOps | Priv), N, N, 2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N,
2506 N, N, N, N, N, N, N, N, 2515 N, N, N, N, N, N, N, N,
2507 /* 0x40 - 0x4F */ 2516 /* 0x40 - 0x4F */
2508 X16(D(DstReg | SrcMem | ModRM | Mov)), 2517 X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
2741 if (c->d == 0 || (c->d & Undefined)) 2750 if (c->d == 0 || (c->d & Undefined))
2742 return -1; 2751 return -1;
2743 2752
2753 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
2754 return -1;
2755
2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2756 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2745 c->op_bytes = 8; 2757 c->op_bytes = 8;
2746 2758
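
Two fixes travel together in emulator_io_port_access_allowed(): in long mode the TR descriptor is 16 bytes, carrying bits 63:32 of the base in an extra dword (surfaced here as the new base3 out-parameter), and the permission read widens from u8 to u16 so an access whose permission bits straddle a byte boundary is still fully checked. The base reassembly in isolation, as a sketch:

    #include <stdint.h>

    /* Combine the classic 32-bit descriptor base with the high dword of
     * a 16-byte system descriptor (TR/LDT) in long mode. */
    static uint64_t system_desc_base(uint32_t base_lo, uint32_t base3)
    {
            uint64_t base = base_lo;        /* get_desc_base(): bits 31:0 */
    #ifdef CONFIG_X86_64
            base |= (uint64_t)base3 << 32;  /* bits 63:32 */
    #endif
            return base;
    }
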
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
70 kvm_make_request(KVM_REQ_EVENT, found); 67 kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
75static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
76{ 73{
77 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
78 s->isr_ack |= (1 << irq);
79 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
80 irq += 8; 76 irq += 8;
81 /* 77 /*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
89 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
90} 86}
91 87
92void kvm_pic_clear_isr_ack(struct kvm *kvm)
93{
94 struct kvm_pic *s = pic_irqchip(kvm);
95
96 pic_lock(s);
97 s->pics[0].isr_ack = 0xff;
98 s->pics[1].isr_ack = 0xff;
99 pic_unlock(s);
100}
101
102/* 88/*
103 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
104 */ 90 */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
281 s->irr = 0; 267 s->irr = 0;
282 s->imr = 0; 268 s->imr = 0;
283 s->isr = 0; 269 s->isr = 0;
284 s->isr_ack = 0xff;
285 s->priority_add = 0; 270 s->priority_add = 0;
286 s->irq_base = 0; 271 s->irq_base = 0;
287 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
545 */ 530 */
546static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
547{ 532{
548 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
549 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
550 int irq = pic_get_irq(&s->pics[0]);
551 534
552 s->output = level; 535 if (!s->output)
553 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
554 s->pics[0].isr_ack &= ~(1 << irq);
555 s->wakeup_needed = true; 536 s->wakeup_needed = true;
556 } 537 s->output = level;
557} 538}
558 539
559static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 557 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 558 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
580 559
581 /* 560 /*
582 * Initialize PIO device 561 * Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
417 case APIC_DM_INIT: 417 case APIC_DM_INIT:
418 if (level) { 418 if (level) {
419 result = 1; 419 result = 1;
420 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
421 printk(KERN_DEBUG
422 "INIT on a runnable vcpu %d\n",
423 vcpu->vcpu_id);
424 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
425 kvm_make_request(KVM_REQ_EVENT, vcpu); 421 kvm_make_request(KVM_REQ_EVENT, vcpu);
426 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
875 871
876 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
877 873
878 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
879 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
880 876
881 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
882} 878}
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1065 1061
1066 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1067 1063
1068 apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1069 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1070 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1071 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1072 goto nomem_free_apic; 1068 goto nomem_free_apic;
1073 } 1069 }
1074 apic->regs = page_address(apic->regs_page);
1075 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1076 1071
1077 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..22fae7593ee7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
111#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113 113
114#define PT64_LEVEL_MASK(level) \
115 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
116
117#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
118 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
119 116
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
123#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
124 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
125 122
126#define PT32_LEVEL_MASK(level) \
127 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
129 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
130 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
379static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
380 int min) 375 int min)
381{ 376{
382 struct page *page; 377 void *page;
383 378
384 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
385 return 0; 380 return 0;
386 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
387 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
388 if (!page) 383 if (!page)
389 return -ENOMEM; 384 return -ENOMEM;
390 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
391 } 386 }
392 return 0; 387 return 0;
393} 388}
@@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 549 return ret;
555} 550}
556 551
557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
558{ 555{
559 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
560 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 557
561 if (slot && slot->dirty_bitmap) 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
562 return true; 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
563 return false; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
564} 569}
565 570
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1032 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
1033 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
1034 list_del(&sp->link); 1039 list_del(&sp->link);
1035 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
1036 if (!sp->role.direct) 1041 if (!sp->role.direct)
1037 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
1038 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
1039 kvm_mod_used_mmu_pages(kvm, -1); 1044 kvm_mod_used_mmu_pages(kvm, -1);
1040} 1045}
@@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1199{ 1204{
1200} 1205}
1201 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq)
1210{
1211 WARN_ON(1);
1212}
1213
1202#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1203 1215
1204struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2150{ 2162{
2151} 2163}
2152 2164
2153static struct kvm_memory_slot *
2154pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2155{
2156 struct kvm_memory_slot *slot;
2157
2158 slot = gfn_to_memslot(vcpu->kvm, gfn);
2159 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2160 (no_dirty_log && slot->dirty_bitmap))
2161 slot = NULL;
2162
2163 return slot;
2164}
2165
2166static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2167 bool no_dirty_log) 2166 bool no_dirty_log)
2168{ 2167{
2169 struct kvm_memory_slot *slot; 2168 struct kvm_memory_slot *slot;
2170 unsigned long hva; 2169 unsigned long hva;
2171 2170
2172 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); 2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2173 if (!slot) { 2172 if (!slot) {
2174 get_page(bad_page); 2173 get_page(bad_page);
2175 return page_to_pfn(bad_page); 2174 return page_to_pfn(bad_page);
@@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2190 gfn_t gfn; 2189 gfn_t gfn;
2191 2190
2192 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2193 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) 2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2194 return -1; 2193 return -1;
2195 2194
2196 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); 2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2804 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2805 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2806 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2807 context->root_level = 0; 2807 context->root_level = 0;
2808 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2809 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
@@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2933 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2936 context->free = paging_free; 2937 context->free = paging_free;
2937 context->root_level = level; 2938 context->root_level = level;
2938 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
@@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2961 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2962 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2963 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2964 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2965 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2966 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
@@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2985 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2986 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2987 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2988 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2989 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2990 context->direct_map = true; 2993 context->direct_map = true;
@@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3089 3092
3090static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3091{ 3094{
3092 vcpu->arch.update_pte.pfn = bad_pfn;
3093
3094 if (mmu_is_nested(vcpu)) 3095 if (mmu_is_nested(vcpu))
3095 return init_kvm_nested_mmu(vcpu); 3096 return init_kvm_nested_mmu(vcpu);
3096 else if (tdp_enabled) 3097 else if (tdp_enabled)
@@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3164static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3165 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp,
3166 u64 *spte, 3167 u64 *spte,
3167 const void *new) 3168 const void *new, unsigned long mmu_seq)
3168{ 3169{
3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3170 ++vcpu->kvm->stat.mmu_pde_zapped; 3171 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3172 } 3173 }
3173 3174
3174 ++vcpu->kvm->stat.mmu_pte_updated; 3175 ++vcpu->kvm->stat.mmu_pte_updated;
3175 if (!sp->role.cr4_pae) 3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
3176 paging32_update_pte(vcpu, sp, spte, new);
3177 else
3178 paging64_update_pte(vcpu, sp, spte, new);
3179} 3177}
3180 3178
3181static bool need_remote_flush(u64 old, u64 new) 3179static bool need_remote_flush(u64 old, u64 new)
@@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3210 return !!(spte && (*spte & shadow_accessed_mask)); 3208 return !!(spte && (*spte & shadow_accessed_mask));
3211} 3209}
3212 3210
3213static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3214 u64 gpte)
3215{
3216 gfn_t gfn;
3217 pfn_t pfn;
3218
3219 if (!is_present_gpte(gpte))
3220 return;
3221 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3222
3223 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3224 smp_rmb();
3225 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3226
3227 if (is_error_pfn(pfn)) {
3228 kvm_release_pfn_clean(pfn);
3229 return;
3230 }
3231 vcpu->arch.update_pte.gfn = gfn;
3232 vcpu->arch.update_pte.pfn = pfn;
3233}
3234
3235static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3211static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3236{ 3212{
3237 u64 *spte = vcpu->arch.last_pte_updated; 3213 u64 *spte = vcpu->arch.last_pte_updated;
@@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3253 struct kvm_mmu_page *sp; 3229 struct kvm_mmu_page *sp;
3254 struct hlist_node *node; 3230 struct hlist_node *node;
3255 LIST_HEAD(invalid_list); 3231 LIST_HEAD(invalid_list);
3256 u64 entry, gentry; 3232 unsigned long mmu_seq;
3257 u64 *spte; 3233 u64 entry, gentry, *spte;
3258 unsigned offset = offset_in_page(gpa); 3234 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3259 unsigned pte_size; 3235 int level, npte, invlpg_counter, r, flooded = 0;
3260 unsigned page_offset;
3261 unsigned misaligned;
3262 unsigned quadrant;
3263 int level;
3264 int flooded = 0;
3265 int npte;
3266 int r;
3267 int invlpg_counter;
3268 bool remote_flush, local_flush, zap_page; 3236 bool remote_flush, local_flush, zap_page;
3269 3237
3270 zap_page = remote_flush = local_flush = false; 3238 zap_page = remote_flush = local_flush = false;
3239 offset = offset_in_page(gpa);
3271 3240
3272 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3241 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3273 3242
@@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3275 3244
3276 /* 3245 /*
3277 * Assume that the pte write on a page table of the same type 3246 * Assume that the pte write on a page table of the same type
3278 * as the current vcpu paging mode. This is nearly always true 3247 * as the current vcpu paging mode since we update the sptes only
3279 * (might be false while changing modes). Note it is verified later 3248 * when they have the same mode.
3280 * by update_pte().
3281 */ 3249 */
3282 if ((is_pae(vcpu) && bytes == 4) || !new) { 3250 if ((is_pae(vcpu) && bytes == 4) || !new) {
3283 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3251 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3303 break; 3271 break;
3304 } 3272 }
3305 3273
3306 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3307 spin_lock(&vcpu->kvm->mmu_lock); 3277 spin_lock(&vcpu->kvm->mmu_lock);
3308 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3309 gentry = 0; 3279 gentry = 0;
3310 kvm_mmu_access_page(vcpu, gfn);
3311 kvm_mmu_free_some_pages(vcpu); 3280 kvm_mmu_free_some_pages(vcpu);
3312 ++vcpu->kvm->stat.mmu_pte_write; 3281 ++vcpu->kvm->stat.mmu_pte_write;
3313 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3282 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3314 if (guest_initiated) { 3283 if (guest_initiated) {
3284 kvm_mmu_access_page(vcpu, gfn);
3315 if (gfn == vcpu->arch.last_pt_write_gfn 3285 if (gfn == vcpu->arch.last_pt_write_gfn
3316 && !last_updated_pte_accessed(vcpu)) { 3286 && !last_updated_pte_accessed(vcpu)) {
3317 ++vcpu->arch.last_pt_write_count; 3287 ++vcpu->arch.last_pt_write_count;
@@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3375 if (gentry && 3345 if (gentry &&
3376 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3377 & mask.word)) 3347 & mask.word))
3378 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
3349 mmu_seq);
3379 if (!remote_flush && need_remote_flush(entry, *spte)) 3350 if (!remote_flush && need_remote_flush(entry, *spte))
3380 remote_flush = true; 3351 remote_flush = true;
3381 ++spte; 3352 ++spte;
@@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3385 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3356 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3357 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3387 spin_unlock(&vcpu->kvm->mmu_lock); 3358 spin_unlock(&vcpu->kvm->mmu_lock);
3388 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3389 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3390 vcpu->arch.update_pte.pfn = bad_pfn;
3391 }
3392} 3359}
3393 3360
3394int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3361int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3538 if (!test_bit(slot, sp->slot_bitmap)) 3505 if (!test_bit(slot, sp->slot_bitmap))
3539 continue; 3506 continue;
3540 3507
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3544 pt = sp->spt; 3508 pt = sp->spt;
3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3509 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3510 if (!is_shadow_present_pte(pt[i]) ||
3511 !is_last_spte(pt[i], sp->role.level))
3512 continue;
3513
3514 if (is_large_pte(pt[i])) {
3515 drop_spte(kvm, &pt[i],
3516 shadow_trap_nonpresent_pte);
3517 --kvm->stat.lpages;
3518 continue;
3519 }
3520
3546 /* avoid RMW */ 3521 /* avoid RMW */
3547 if (is_writable_pte(pt[i])) 3522 if (is_writable_pte(pt[i]))
3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3523 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3524 }
3549 } 3525 }
3550 kvm_flush_remote_tlbs(kvm); 3526 kvm_flush_remote_tlbs(kvm);
3551} 3527}
@@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3583 if (nr_to_scan == 0) 3559 if (nr_to_scan == 0)
3584 goto out; 3560 goto out;
3585 3561
3586 spin_lock(&kvm_lock); 3562 raw_spin_lock(&kvm_lock);
3587 3563
3588 list_for_each_entry(kvm, &vm_list, vm_list) { 3564 list_for_each_entry(kvm, &vm_list, vm_list) {
3589 int idx, freed_pages; 3565 int idx, freed_pages;
@@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3606 if (kvm_freed) 3582 if (kvm_freed)
3607 list_move_tail(&kvm_freed->vm_list, &vm_list); 3583 list_move_tail(&kvm_freed->vm_list, &vm_list);
3608 3584
3609 spin_unlock(&kvm_lock); 3585 raw_spin_unlock(&kvm_lock);
3610 3586
3611out: 3587out:
3612 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 3588 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
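
The "if (!sp->role.cr4_pae)" branch deleted from mmu_pte_write_new_pte() turns into a per-context hook: each init path wires the flavor matching its paging mode once, and nonpaging/TDP contexts get a WARN_ON(1) stub since direct maps never see guest PTE writes. The hook's shape as inferred from these hunks (a sketch, not the full struct):

    /* Member added to struct kvm_mmu (assumed declaration): */
    void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                       u64 *spte, const void *pte, unsigned long mmu_seq);

    /* mmu_pte_write_new_pte() then reduces to one indirect call:
     *   vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);        */
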
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..c6397795d865 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ no_present:
327} 325}
328 326
329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 u64 *spte, const void *pte) 328 u64 *spte, const void *pte, unsigned long mmu_seq)
331{ 329{
332 pt_element_t gpte; 330 pt_element_t gpte;
333 unsigned pte_access; 331 unsigned pte_access;
@@ -339,16 +337,16 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
339 337
340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 338 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 339 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 340 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
341 if (is_error_pfn(pfn)) {
342 kvm_release_pfn_clean(pfn);
343 return; 343 return;
344 pfn = vcpu->arch.update_pte.pfn; 344 }
345 if (is_error_pfn(pfn)) 345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
348 return; 346 return;
349 kvm_get_pfn(pfn); 347
350 /* 348 /*
351 * we call mmu_set_spte() with host_writable = true beacuse that 349 * we call mmu_set_spte() with host_writable = true because that
352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 350 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
353 */ 351 */
354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 352 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
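The hunk above replaces the prefetched vcpu->arch.update_pte state with an on-demand atomic gfn-to-pfn lookup guarded by the MMU-notifier sequence counter. A minimal sketch of that retry pattern follows; update_one_spte() is a hypothetical placeholder and locking is elided, the remaining helpers are real KVM functions:

    static void update_pte_sketch(struct kvm_vcpu *vcpu, gfn_t gfn)
    {
        unsigned long mmu_seq;
        pfn_t pfn;

        /* Snapshot the invalidation counter before the lookup... */
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();

        pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);  /* may not sleep */
        if (is_error_pfn(pfn)) {
            kvm_release_pfn_clean(pfn);
            return;
        }

        /* ...and drop the update if an invalidation ran in between. */
        if (mmu_notifier_retry(vcpu, mmu_seq))
            return;
        update_one_spte(vcpu, pfn);  /* hypothetical spte writer */
    }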
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
829#undef FNAME 827#undef FNAME
830#undef PT_BASE_ADDR_MASK 828#undef PT_BASE_ADDR_MASK
831#undef PT_INDEX 829#undef PT_INDEX
832#undef PT_LEVEL_MASK
833#undef PT_LVL_ADDR_MASK 830#undef PT_LVL_ADDR_MASK
834#undef PT_LVL_OFFSET_MASK 831#undef PT_LVL_OFFSET_MASK
835#undef PT_LEVEL_BITS 832#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 63fec1531e89..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
135 135
136 u32 *msrpm; 136 u32 *msrpm;
137 137
138 ulong nmi_iret_rip;
139
138 struct nested_state nested; 140 struct nested_state nested;
139 141
140 bool nmi_singlestep; 142 bool nmi_singlestep;
@@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1153 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); 1155 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1154 load_gs_index(svm->host.gs); 1156 load_gs_index(svm->host.gs);
1155#else 1157#else
1158#ifdef CONFIG_X86_32_LAZY_GS
1156 loadsegment(gs, svm->host.gs); 1159 loadsegment(gs, svm->host.gs);
1157#endif 1160#endif
1161#endif
1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1162 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1163 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1160} 1164}
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
2653 ++svm->vcpu.stat.nmi_window_exits; 2657 ++svm->vcpu.stat.nmi_window_exits;
2654 clr_intercept(svm, INTERCEPT_IRET); 2658 clr_intercept(svm, INTERCEPT_IRET);
2655 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2659 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2660 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2656 return 1; 2661 return 1;
2657} 2662}
2658 2663
@@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3474 3479
3475 svm->int3_injected = 0; 3480 svm->int3_injected = 0;
3476 3481
3477 if (svm->vcpu.arch.hflags & HF_IRET_MASK) { 3482 /*
3483 * If we've made progress since setting HF_IRET_MASK, we've
3484 * executed an IRET and can allow NMI injection.
3485 */
3486 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3487 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3478 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3488 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3479 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3489 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3480 } 3490 }
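Taken together, the two SVM hunks above implement a "did the IRET actually retire?" test. Condensed into one place (field names taken from the diff; illustrative only, not the full functions):

    /* IRET intercept: note where the guest stopped. */
    svm->vcpu.arch.hflags |= HF_IRET_MASK;
    svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);

    /* Exit processing: unmask NMIs only if RIP has moved past that
     * point, i.e. the IRET completed instead of faulting first. */
    if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
        kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
        svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
    }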
@@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3641 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 3651 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3642#else 3652#else
3643 loadsegment(fs, svm->host.fs); 3653 loadsegment(fs, svm->host.fs);
3654#ifndef CONFIG_X86_32_LAZY_GS
3655 loadsegment(gs, svm->host.gs);
3656#endif
3644#endif 3657#endif
3645 3658
3646 reload_tss(vcpu); 3659 reload_tss(vcpu);
3647 3660
3648 local_irq_disable(); 3661 local_irq_disable();
3649 3662
3650 stgi();
3651
3652 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3663 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3653 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3664 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3654 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3665 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3655 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3666 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3656 3667
3668 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3669 kvm_before_handle_nmi(&svm->vcpu);
3670
3671 stgi();
3672
3673 /* Any pending NMI will happen here */
3674
3675 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3676 kvm_after_handle_nmi(&svm->vcpu);
3677
3657 sync_cr8_to_lapic(vcpu); 3678 sync_cr8_to_lapic(vcpu);
3658 3679
3659 svm->next_rip = 0; 3680 svm->next_rip = 0;
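The reordering above keeps GIF clear while the exit state is captured, and only then executes stgi() inside the kvm_before_handle_nmi()/kvm_after_handle_nmi() bracket, so that perf's NMI handler can attribute the NMI to the guest. A sketch of the resulting window:

    bool nmi_exit = svm->vmcb->control.exit_code == SVM_EXIT_NMI;

    if (unlikely(nmi_exit))
        kvm_before_handle_nmi(&svm->vcpu);  /* mark "NMI belongs to guest" */

    stgi();  /* GIF=1: an NMI held pending since clgi() fires here */

    if (unlikely(nmi_exit))
        kvm_after_handle_nmi(&svm->vcpu);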
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index fc7a101c4a35..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..5b4cdcbd154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
93 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
94 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
95 * executions of PAUSE in a loop. Also indicate if ple enabled. 95 * executions of PAUSE in a loop. Also indicate if ple enabled.
96 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
97 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
98 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
99 * less than 2^12 cycles 99 * less than 2^12 cycles
100 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
101 * refer SDM volume 3b section 21.6.13 & 22.1.3. 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
102 */ 102 */
103#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
106module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
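For reference, ple_gap and ple_window are only consulted at VMCS setup time; elsewhere in vmx.c (outside this diff) they are programmed roughly like the sketch below, so raising KVM_VMX_DEFAULT_PLE_GAP to 128 changes what every newly created vcpu gets:

    if (ple_gap) {  /* ple_gap == 0 disables Pause-Loop Exiting */
        vmcs_write32(PLE_GAP, ple_gap);
        vmcs_write32(PLE_WINDOW, ple_window);
    }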
@@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 176 return container_of(vcpu, struct vcpu_vmx, vcpu);
177} 177}
178 178
179static int init_rmode(struct kvm *kvm);
180static u64 construct_eptp(unsigned long root_hpa); 179static u64 construct_eptp(unsigned long root_hpa);
181static void kvm_cpu_vmxon(u64 addr); 180static void kvm_cpu_vmxon(u64 addr);
182static void kvm_cpu_vmxoff(void); 181static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 182static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
183static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 184
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
1333 1333
1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1335 if (msr & FEATURE_CONTROL_LOCKED) { 1335 if (msr & FEATURE_CONTROL_LOCKED) {
1336 /* launched w/ TXT and VMX disabled */
1336 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1337 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1337 && tboot_enabled()) 1338 && tboot_enabled())
1338 return 1; 1339 return 1;
1340 /* launched w/o TXT and VMX only enabled w/ TXT */
1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1341 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1342 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1340 && !tboot_enabled()) { 1343 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 1344 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n"); 1345 "activate TXT before enabling KVM\n");
1343 return 1; 1346 return 1;
1344 } 1347 }
1348 /* launched w/o TXT and VMX disabled */
1349 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1350 && !tboot_enabled())
1351 return 1;
1345 } 1352 }
1346 1353
1347 return 0; 1354 return 0;
1348 /* locked but not enabled */
1349} 1355}
1350 1356
1351static void kvm_cpu_vmxon(u64 addr) 1357static void kvm_cpu_vmxon(u64 addr)
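The rewritten lock test above now covers three "disabled" combinations of the feature-control bits. Restated as a sketch (equivalent decision logic, not the kernel's code; the warn-case printk is omitted):

    static int disabled_by_bios_sketch(u64 msr)
    {
        bool in_smx  = msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
        bool out_smx = msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        if (!(msr & FEATURE_CONTROL_LOCKED))
            return 0;           /* unlocked: kvm can set the bits itself */
        if (tboot_enabled())
            return !in_smx;     /* TXT launch needs the SMX bit */
        if (in_smx && !out_smx)
            return 1;           /* VMX enabled only w/ TXT: the warn case */
        return !out_smx;        /* plain launch needs the non-SMX bit */
    }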
@@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1683 vmx->emulation_required = 1; 1689 vmx->emulation_required = 1;
1684 vmx->rmode.vm86_active = 0; 1690 vmx->rmode.vm86_active = 0;
1685 1691
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1686 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1687 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1688 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1695 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1756 vmx->emulation_required = 1; 1763 vmx->emulation_required = 1;
1757 vmx->rmode.vm86_active = 1; 1764 vmx->rmode.vm86_active = 1;
1758 1765
1766 /*
1767 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1768 * vcpu. Call it here with phys address pointing 16M below 4G.
1769 */
1770 if (!vcpu->kvm->arch.tss_addr) {
1771 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1772 "called before entering vcpu\n");
1773 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1774 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 }
1777
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1759 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1760 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1761 1781
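The fallback above exists for VMMs that never issued KVM_SET_TSS_ADDR. A well-behaved userspace reserves the three real-mode TSS pages itself, along the lines of this sketch (0xfffbd000 is the address QEMU traditionally picks; any unused guest-physical range below 4G works):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_tss(int vm_fd)
    {
        /* vm_fd is the fd returned by KVM_CREATE_VM */
        return ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL);
    }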
@@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1794 1814
1795continue_rmode: 1815continue_rmode:
1796 kvm_mmu_reset_context(vcpu); 1816 kvm_mmu_reset_context(vcpu);
1797 init_rmode(vcpu->kvm);
1798} 1817}
1799 1818
1800static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1819static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2030 vmcs_writel(GUEST_CR4, hw_cr4); 2049 vmcs_writel(GUEST_CR4, hw_cr4);
2031} 2050}
2032 2051
2033static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2034{
2035 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2036
2037 return vmcs_readl(sf->base);
2038}
2039
2040static void vmx_get_segment(struct kvm_vcpu *vcpu, 2052static void vmx_get_segment(struct kvm_vcpu *vcpu,
2041 struct kvm_segment *var, int seg) 2053 struct kvm_segment *var, int seg)
2042{ 2054{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu);
2043 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save;
2044 u32 ar; 2058 u32 ar;
2045 2059
2060 if (vmx->rmode.vm86_active
2061 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2062 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2063 || seg == VCPU_SREG_GS)
2064 && !emulate_invalid_guest_state) {
2065 switch (seg) {
2066 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2067 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2068 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2069 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2070 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2071 default: BUG();
2072 }
2073 var->selector = save->selector;
2074 var->base = save->base;
2075 var->limit = save->limit;
2076 ar = save->ar;
2077 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector))
2079 goto use_saved_rmode_seg;
2080 }
2046 var->base = vmcs_readl(sf->base); 2081 var->base = vmcs_readl(sf->base);
2047 var->limit = vmcs_read32(sf->limit); 2082 var->limit = vmcs_read32(sf->limit);
2048 var->selector = vmcs_read16(sf->selector); 2083 var->selector = vmcs_read16(sf->selector);
2049 ar = vmcs_read32(sf->ar_bytes); 2084 ar = vmcs_read32(sf->ar_bytes);
2085use_saved_rmode_seg:
2050 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2051 ar = 0; 2087 ar = 0;
2052 var->type = ar & 15; 2088 var->type = ar & 15;
@@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2060 var->unusable = (ar >> 16) & 1; 2096 var->unusable = (ar >> 16) & 1;
2061} 2097}
2062 2098
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s;
2103
2104 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg);
2106 return s.base;
2107 }
2108 return vmcs_readl(sf->base);
2109}
2110
2063static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2111static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2064{ 2112{
2065 if (!is_protmode(vcpu)) 2113 if (!is_protmode(vcpu))
@@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2101 u32 ar; 2149 u32 ar;
2102 2150
2103 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector);
2104 vmx->rmode.tr.selector = var->selector; 2153 vmx->rmode.tr.selector = var->selector;
2105 vmx->rmode.tr.base = var->base; 2154 vmx->rmode.tr.base = var->base;
2106 vmx->rmode.tr.limit = var->limit; 2155 vmx->rmode.tr.limit = var->limit;
@@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2361 2410
2362static int init_rmode_tss(struct kvm *kvm) 2411static int init_rmode_tss(struct kvm *kvm)
2363{ 2412{
2364 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2413 gfn_t fn;
2365 u16 data = 0; 2414 u16 data = 0;
2366 int ret = 0; 2415 int r, idx, ret = 0;
2367 int r;
2368 2416
2417 idx = srcu_read_lock(&kvm->srcu);
2418 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2369 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2419 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2370 if (r < 0) 2420 if (r < 0)
2371 goto out; 2421 goto out;
@@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
2389 2439
2390 ret = 1; 2440 ret = 1;
2391out: 2441out:
2442 srcu_read_unlock(&kvm->srcu, idx);
2392 return ret; 2443 return ret;
2393} 2444}
2394 2445
2395static int init_rmode_identity_map(struct kvm *kvm) 2446static int init_rmode_identity_map(struct kvm *kvm)
2396{ 2447{
2397 int i, r, ret; 2448 int i, idx, r, ret;
2398 pfn_t identity_map_pfn; 2449 pfn_t identity_map_pfn;
2399 u32 tmp; 2450 u32 tmp;
2400 2451
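Both init helpers now take the SRCU read side themselves, since after this patch they can run outside vcpu ioctl context. Any path that touches guest memory through the memslots follows the same pattern; a minimal sketch:

    static int touch_guest_page(struct kvm *kvm, gfn_t gfn)
    {
        u8 byte = 0;
        int idx, r;

        idx = srcu_read_lock(&kvm->srcu);  /* pin the memslot array */
        r = kvm_write_guest_page(kvm, gfn, &byte, 0, 1);
        srcu_read_unlock(&kvm->srcu, idx);
        return r;
    }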
@@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2409 return 1; 2460 return 1;
2410 ret = 0; 2461 ret = 0;
2411 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2462 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2463 idx = srcu_read_lock(&kvm->srcu);
2412 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2464 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2413 if (r < 0) 2465 if (r < 0)
2414 goto out; 2466 goto out;
@@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2424 kvm->arch.ept_identity_pagetable_done = true; 2476 kvm->arch.ept_identity_pagetable_done = true;
2425 ret = 1; 2477 ret = 1;
2426out: 2478out:
2479 srcu_read_unlock(&kvm->srcu, idx);
2427 return ret; 2480 return ret;
2428} 2481}
2429 2482
@@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2699 return 0; 2752 return 0;
2700} 2753}
2701 2754
2702static int init_rmode(struct kvm *kvm)
2703{
2704 int idx, ret = 0;
2705
2706 idx = srcu_read_lock(&kvm->srcu);
2707 if (!init_rmode_tss(kvm))
2708 goto exit;
2709 if (!init_rmode_identity_map(kvm))
2710 goto exit;
2711
2712 ret = 1;
2713exit:
2714 srcu_read_unlock(&kvm->srcu, idx);
2715 return ret;
2716}
2717
2718static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2755static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2719{ 2756{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu); 2757 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2722 int ret; 2759 int ret;
2723 2760
2724 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2761 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2725 if (!init_rmode(vmx->vcpu.kvm)) {
2726 ret = -ENOMEM;
2727 goto out;
2728 }
2729 2762
2730 vmx->rmode.vm86_active = 0; 2763 vmx->rmode.vm86_active = 0;
2731 2764
@@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2805 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2838 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2806 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2839 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2807 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2840 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2808 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2841 __pa(vmx->vcpu.arch.apic->regs));
2809 vmcs_write32(TPR_THRESHOLD, 0); 2842 vmcs_write32(TPR_THRESHOLD, 0);
2810 } 2843 }
2811 2844
@@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2971 if (ret) 3004 if (ret)
2972 return ret; 3005 return ret;
2973 kvm->arch.tss_addr = addr; 3006 kvm->arch.tss_addr = addr;
3007 if (!init_rmode_tss(kvm))
3008 return -ENOMEM;
3009
2974 return 0; 3010 return 0;
2975} 3011}
2976 3012
@@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3962#define Q "l" 3998#define Q "l"
3963#endif 3999#endif
3964 4000
3965static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4001static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3966{ 4002{
3967 struct vcpu_vmx *vmx = to_vmx(vcpu); 4003 struct vcpu_vmx *vmx = to_vmx(vcpu);
3968 4004
@@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3991 asm( 4027 asm(
3992 /* Store host registers */ 4028 /* Store host registers */
3993 "push %%"R"dx; push %%"R"bp;" 4029 "push %%"R"dx; push %%"R"bp;"
4030 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3994 "push %%"R"cx \n\t" 4031 "push %%"R"cx \n\t"
3995 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4032 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3996 "je 1f \n\t" 4033 "je 1f \n\t"
@@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4069 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
4033 ".Lkvm_vmx_return: " 4070 ".Lkvm_vmx_return: "
4034 /* Save guest registers, load host registers, keep flags */ 4071 /* Save guest registers, load host registers, keep flags */
4035 "xchg %0, (%%"R"sp) \n\t" 4072 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4073 "pop %0 \n\t"
4036 "mov %%"R"ax, %c[rax](%0) \n\t" 4074 "mov %%"R"ax, %c[rax](%0) \n\t"
4037 "mov %%"R"bx, %c[rbx](%0) \n\t" 4075 "mov %%"R"bx, %c[rbx](%0) \n\t"
4038 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4076 "pop"Q" %c[rcx](%0) \n\t"
4039 "mov %%"R"dx, %c[rdx](%0) \n\t" 4077 "mov %%"R"dx, %c[rdx](%0) \n\t"
4040 "mov %%"R"si, %c[rsi](%0) \n\t" 4078 "mov %%"R"si, %c[rsi](%0) \n\t"
4041 "mov %%"R"di, %c[rdi](%0) \n\t" 4079 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4053 "mov %%cr2, %%"R"ax \n\t" 4091 "mov %%cr2, %%"R"ax \n\t"
4054 "mov %%"R"ax, %c[cr2](%0) \n\t" 4092 "mov %%"R"ax, %c[cr2](%0) \n\t"
4055 4093
4056 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4094 "pop %%"R"bp; pop %%"R"dx \n\t"
4057 "setbe %c[fail](%0) \n\t" 4095 "setbe %c[fail](%0) \n\t"
4058 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4096 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4059 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4097 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4076 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4114 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4077 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4115 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4078#endif 4116#endif
4079 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4117 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4118 [wordsize]"i"(sizeof(ulong))
4080 : "cc", "memory" 4119 : "cc", "memory"
4081 , R"ax", R"bx", R"di", R"si" 4120 , R"ax", R"bx", R"di", R"si"
4082#ifdef CONFIG_X86_64 4121#ifdef CONFIG_X86_64
@@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4183 if (!kvm->arch.ept_identity_map_addr) 4222 if (!kvm->arch.ept_identity_map_addr)
4184 kvm->arch.ept_identity_map_addr = 4223 kvm->arch.ept_identity_map_addr =
4185 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4224 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4225 err = -ENOMEM;
4186 if (alloc_identity_pagetable(kvm) != 0) 4226 if (alloc_identity_pagetable(kvm) != 0)
4187 goto free_vmcs; 4227 goto free_vmcs;
4228 if (!init_rmode_identity_map(kvm))
4229 goto free_vmcs;
4188 } 4230 }
4189 4231
4190 return &vmx->vcpu; 4232 return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..58f517b59645 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
81 * - enable LME and LMA per default on 64 bit KVM 81 * - enable LME and LMA per default on 64 bit KVM
82 */ 82 */
83#ifdef CONFIG_X86_64 83#ifdef CONFIG_X86_64
84static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 84static
85u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
85#else 86#else
86static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
87#endif 88#endif
88 89
89#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
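The new initializers are bit-for-bit identical to the magic numbers they replace; spelling the masks out:

    /*
     * EFER_SCE = 1 << 0, EFER_LME = 1 << 8, EFER_LMA = 1 << 10
     * SCE | LME | LMA     = 0x0000000000000501
     * ~(SCE | LME | LMA)  = 0xfffffffffffffafe   (old 64-bit literal)
     * ~SCE                = 0xfffffffffffffffe   (old 32-bit literal)
     */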
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
360 361
361void kvm_inject_nmi(struct kvm_vcpu *vcpu) 362void kvm_inject_nmi(struct kvm_vcpu *vcpu)
362{ 363{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
363 kvm_make_request(KVM_REQ_EVENT, vcpu); 365 kvm_make_request(KVM_REQ_EVENT, vcpu);
364 vcpu->arch.nmi_pending = 1;
365} 366}
366EXPORT_SYMBOL_GPL(kvm_inject_nmi); 367EXPORT_SYMBOL_GPL(kvm_inject_nmi);
367 368
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
525 526
526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 kvm_x86_ops->set_cr0(vcpu, cr0);
527 528
528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
529 kvm_clear_async_pf_completion_queue(vcpu); 530 kvm_clear_async_pf_completion_queue(vcpu);
531 kvm_async_pf_hash_reset(vcpu);
532 }
530 533
531 if ((cr0 ^ old_cr0) & update_bits) 534 if ((cr0 ^ old_cr0) & update_bits)
532 kvm_mmu_reset_context(vcpu); 535 kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1017 unsigned long flags; 1020 unsigned long flags;
1018 s64 sdiff; 1021 s64 sdiff;
1019 1022
1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1021 offset = data - native_read_tsc(); 1024 offset = data - native_read_tsc();
1022 ns = get_kernel_ns(); 1025 ns = get_kernel_ns();
1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1026 elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1028,7 +1031,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1028 /* 1031 /*
1029 * Special case: close write to TSC within 5 seconds of 1032 * Special case: close write to TSC within 5 seconds of
1030 * another CPU is interpreted as an attempt to synchronize 1033 * another CPU is interpreted as an attempt to synchronize
1031 * The 5 seconds is to accomodate host load / swapping as 1034 * The 5 seconds is to accommodate host load / swapping as
1032 * well as any reset of TSC during the boot process. 1035 * well as any reset of TSC during the boot process.
1033 * 1036 *
1034 * In that case, for a reliable TSC, we can match TSC offsets, 1037 * In that case, for a reliable TSC, we can match TSC offsets,
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1050 kvm->arch.last_tsc_write = data; 1053 kvm->arch.last_tsc_write = data;
1051 kvm->arch.last_tsc_offset = offset; 1054 kvm->arch.last_tsc_offset = offset;
1052 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1055 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1053 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1056 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1054 1057
1055 /* Reset of TSC must disable overshoot protection below */ 1058 /* Reset of TSC must disable overshoot protection below */
1056 vcpu->arch.hv_clock.tsc_timestamp = 0; 1059 vcpu->arch.hv_clock.tsc_timestamp = 0;
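The heuristic described in the comment above boils down to two 5-second windows, one on the written TSC delta and one on wall-clock time. Roughly (a sketch that elides the nsec_to_cycles() conversion the real code applies to the value delta):

    static bool writes_look_synced(s64 elapsed_ns, s64 tsc_delta)
    {
        if (tsc_delta < 0)
            tsc_delta = -tsc_delta;
        return tsc_delta < 5LL * NSEC_PER_SEC &&  /* ~5s worth of cycles */
               elapsed_ns < 5LL * NSEC_PER_SEC;
    }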
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1453 return 0; 1456 return 0;
1454} 1457}
1455 1458
1459static void kvmclock_reset(struct kvm_vcpu *vcpu)
1460{
1461 if (vcpu->arch.time_page) {
1462 kvm_release_page_dirty(vcpu->arch.time_page);
1463 vcpu->arch.time_page = NULL;
1464 }
1465}
1466
1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1467int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1457{ 1468{
1458 switch (msr) { 1469 switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1510 break; 1521 break;
1511 case MSR_KVM_SYSTEM_TIME_NEW: 1522 case MSR_KVM_SYSTEM_TIME_NEW:
1512 case MSR_KVM_SYSTEM_TIME: { 1523 case MSR_KVM_SYSTEM_TIME: {
1513 if (vcpu->arch.time_page) { 1524 kvmclock_reset(vcpu);
1514 kvm_release_page_dirty(vcpu->arch.time_page);
1515 vcpu->arch.time_page = NULL;
1516 }
1517 1525
1518 vcpu->arch.time = data; 1526 vcpu->arch.time = data;
1519 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1527 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1592 } else 1600 } else
1593 return set_msr_hyperv(vcpu, msr, data); 1601 return set_msr_hyperv(vcpu, msr, data);
1594 break; 1602 break;
1603 case MSR_IA32_BBL_CR_CTL3:
1604 /* Drop writes to this legacy MSR -- see rdmsr
1605 * counterpart for further detail.
1606 */
1607 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1608 break;
1595 default: 1609 default:
1596 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1610 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1597 return xen_hvm_config(vcpu, data); 1611 return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 } else 1860 } else
1847 return get_msr_hyperv(vcpu, msr, pdata); 1861 return get_msr_hyperv(vcpu, msr, pdata);
1848 break; 1862 break;
1863 case MSR_IA32_BBL_CR_CTL3:
1864 /* This legacy MSR exists but isn't fully documented in current
1865 * silicon. It is however accessed by winxp in very narrow
1866 * scenarios where it sets bit #19, itself documented as
1867 * a "reserved" bit. Best effort attempt to source coherent
1868 * read data here should the balance of the register be
1869 * interpreted by the guest:
1870 *
1871 * L2 cache control register 3: 64GB range, 256KB size,
1872 * enabled, latency 0x1, configured
1873 */
1874 data = 0xbe702111;
1875 break;
1849 default: 1876 default:
1850 if (!ignore_msrs) { 1877 if (!ignore_msrs) {
1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
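The net effect of the two MSR_IA32_BBL_CR_CTL3 cases is that writes are swallowed and reads return the fixed pattern 0xbe702111. A guest-side probe would observe it like this (sketch; 0x11e is the MSR number given in msr-index.h):

    static u64 read_bbl_cr_ctl3(void)
    {
        u32 lo, hi;

        asm volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(0x11e));
        return ((u64)hi << 32) | lo;  /* == 0xbe702111 under kvm */
    }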
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2100 if (check_tsc_unstable()) { 2127 if (check_tsc_unstable()) {
2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2128 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2102 vcpu->arch.tsc_catchup = 1; 2129 vcpu->arch.tsc_catchup = 1;
2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2104 } 2130 }
2131 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2105 if (vcpu->cpu != cpu) 2132 if (vcpu->cpu != cpu)
2106 kvm_migrate_timers(vcpu); 2133 kvm_migrate_timers(vcpu);
2107 vcpu->cpu = cpu; 2134 vcpu->cpu = cpu;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2575 if (mce->status & MCI_STATUS_UC) { 2602 if (mce->status & MCI_STATUS_UC) {
2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2603 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2604 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2578 printk(KERN_DEBUG "kvm: set_mce: "
2579 "injects mce exception while "
2580 "previous one is in progress!\n");
2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2605 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2582 return 0; 2606 return 0;
2583 } 2607 }
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2672 vcpu->arch.interrupt.pending = events->interrupt.injected;
2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2673 vcpu->arch.interrupt.nr = events->interrupt.nr;
2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2674 vcpu->arch.interrupt.soft = events->interrupt.soft;
2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2652 kvm_pic_clear_isr_ack(vcpu->kvm);
2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2675 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2676 kvm_x86_ops->set_interrupt_shadow(vcpu,
2655 events->interrupt.shadow); 2677 events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
4140 return get_segment_base(vcpu, seg); 4162 return get_segment_base(vcpu, seg);
4141} 4163}
4142 4164
4143static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4144 struct kvm_vcpu *vcpu) 4166 int seg, struct kvm_vcpu *vcpu)
4145{ 4167{
4146 struct kvm_segment var; 4168 struct kvm_segment var;
4147 4169
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4154 var.limit >>= 12; 4176 var.limit >>= 12;
4155 set_desc_limit(desc, var.limit); 4177 set_desc_limit(desc, var.limit);
4156 set_desc_base(desc, (unsigned long)var.base); 4178 set_desc_base(desc, (unsigned long)var.base);
4179#ifdef CONFIG_X86_64
4180 if (base3)
4181 *base3 = var.base >> 32;
4182#endif
4157 desc->type = var.type; 4183 desc->type = var.type;
4158 desc->s = var.s; 4184 desc->s = var.s;
4159 desc->dpl = var.dpl; 4185 desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4166 return true; 4192 return true;
4167} 4193}
4168 4194
4169static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4170 struct kvm_vcpu *vcpu) 4196 int seg, struct kvm_vcpu *vcpu)
4171{ 4197{
4172 struct kvm_segment var; 4198 struct kvm_segment var;
4173 4199
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
4175 kvm_get_segment(vcpu, &var, seg); 4201 kvm_get_segment(vcpu, &var, seg);
4176 4202
4177 var.base = get_desc_base(desc); 4203 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32;
4206#endif
4178 var.limit = get_desc_limit(desc); 4207 var.limit = get_desc_limit(desc);
4179 if (desc->g) 4208 if (desc->g)
4180 var.limit = (var.limit << 12) | 0xfff; 4209 var.limit = (var.limit << 12) | 0xfff;
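The new base3 parameter exists because TSS and LDT descriptors are 16 bytes wide in long mode, while struct desc_struct only models the low 8; bits 63:32 of the base travel separately. Reassembly is a two-line sketch:

    static u64 full_desc_base(const struct desc_struct *desc, u32 base3)
    {
        u64 base = get_desc_base(desc);  /* bits 31:0 */
    #ifdef CONFIG_X86_64
        base |= (u64)base3 << 32;        /* bits 63:32 */
    #endif
        return base;
    }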
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4390 vcpu->arch.emulate_ctxt.have_exception = false; 4419 vcpu->arch.emulate_ctxt.have_exception = false;
4391 vcpu->arch.emulate_ctxt.perm_ok = false; 4420 vcpu->arch.emulate_ctxt.perm_ok = false;
4392 4421
4422 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4423 = emulation_type & EMULTYPE_TRAP_UD;
4424
4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4425 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4394 if (r == X86EMUL_PROPAGATE_FAULT)
4395 goto done;
4396 4426
4397 trace_kvm_emulate_insn_start(vcpu); 4427 trace_kvm_emulate_insn_start(vcpu);
4398
4399 /* Only allow emulation of specific instructions on #UD
4400 * (namely VMMCALL, sysenter, sysexit, syscall)*/
4401 if (emulation_type & EMULTYPE_TRAP_UD) {
4402 if (!c->twobyte)
4403 return EMULATE_FAIL;
4404 switch (c->b) {
4405 case 0x01: /* VMMCALL */
4406 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4407 return EMULATE_FAIL;
4408 break;
4409 case 0x34: /* sysenter */
4410 case 0x35: /* sysexit */
4411 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4412 return EMULATE_FAIL;
4413 break;
4414 case 0x05: /* syscall */
4415 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4416 return EMULATE_FAIL;
4417 break;
4418 default:
4419 return EMULATE_FAIL;
4420 }
4421
4422 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
4423 return EMULATE_FAIL;
4424 }
4425
4426 ++vcpu->stat.insn_emulation; 4428 ++vcpu->stat.insn_emulation;
4427 if (r) { 4429 if (r) {
4430 if (emulation_type & EMULTYPE_TRAP_UD)
4431 return EMULATE_FAIL;
4428 if (reexecute_instruction(vcpu, cr2)) 4432 if (reexecute_instruction(vcpu, cr2))
4429 return EMULATE_DONE; 4433 return EMULATE_DONE;
4430 if (emulation_type & EMULTYPE_SKIP) 4434 if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
4452 return handle_emulation_failure(vcpu); 4456 return handle_emulation_failure(vcpu);
4453 } 4457 }
4454 4458
4455done:
4456 if (vcpu->arch.emulate_ctxt.have_exception) { 4459 if (vcpu->arch.emulate_ctxt.have_exception) {
4457 inject_emulated_exception(vcpu); 4460 inject_emulated_exception(vcpu);
4458 r = EMULATE_DONE; 4461 r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4562 4565
4563 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4566 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4564 4567
4565 spin_lock(&kvm_lock); 4568 raw_spin_lock(&kvm_lock);
4566 list_for_each_entry(kvm, &vm_list, vm_list) { 4569 list_for_each_entry(kvm, &vm_list, vm_list) {
4567 kvm_for_each_vcpu(i, vcpu, kvm) { 4570 kvm_for_each_vcpu(i, vcpu, kvm) {
4568 if (vcpu->cpu != freq->cpu) 4571 if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4572 send_ipi = 1; 4575 send_ipi = 1;
4573 } 4576 }
4574 } 4577 }
4575 spin_unlock(&kvm_lock); 4578 raw_spin_unlock(&kvm_lock);
4576 4579
4577 if (freq->old < freq->new && send_ipi) { 4580 if (freq->old < freq->new && send_ipi) {
4578 /* 4581 /*
@@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5185 r = 1; 5188 r = 1;
5186 goto out; 5189 goto out;
5187 } 5190 }
5191 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5192 vcpu->arch.nmi_pending = true;
5188 } 5193 }
5189 5194
5190 r = kvm_mmu_reload(vcpu); 5195 r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5213 kvm_load_guest_fpu(vcpu); 5218 kvm_load_guest_fpu(vcpu);
5214 kvm_load_guest_xcr0(vcpu); 5219 kvm_load_guest_xcr0(vcpu);
5215 5220
5216 atomic_set(&vcpu->guest_mode, 1); 5221 vcpu->mode = IN_GUEST_MODE;
5217 smp_wmb(); 5222
 	5223 	/* We should set ->mode before checking ->requests,
5224 * see the comment in make_all_cpus_request.
5225 */
5226 smp_mb();
5218 5227
5219 local_irq_disable(); 5228 local_irq_disable();
5220 5229
5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5230 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5222 || need_resched() || signal_pending(current)) { 5231 || need_resched() || signal_pending(current)) {
5223 atomic_set(&vcpu->guest_mode, 0); 5232 vcpu->mode = OUTSIDE_GUEST_MODE;
5224 smp_wmb(); 5233 smp_wmb();
5225 local_irq_enable(); 5234 local_irq_enable();
5226 preempt_enable(); 5235 preempt_enable();
@@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5256 5265
5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5266 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5258 5267
5259 atomic_set(&vcpu->guest_mode, 0); 5268 vcpu->mode = OUTSIDE_GUEST_MODE;
5260 smp_wmb(); 5269 smp_wmb();
5261 local_irq_enable(); 5270 local_irq_enable();
5262 5271
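The vcpu->mode transitions above pair with the kick path changed at the end of this file's diff. The ordering contract, condensed into a sketch (abort_entry() is a hypothetical bail-out; a full barrier on each side guarantees at least one side observes the other, so a posted request is never missed):

    /* entry side */
    vcpu->mode = IN_GUEST_MODE;
    smp_mb();
    if (vcpu->requests || vcpu->mode == EXITING_GUEST_MODE)
        abort_entry();

    /* kicker side, another cpu */
    kvm_make_request(KVM_REQ_EVENT, vcpu);
    smp_mb();
    if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
        smp_send_reschedule(vcpu->cpu);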
@@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5574 struct kvm_sregs *sregs) 5583 struct kvm_sregs *sregs)
5575{ 5584{
5576 int mmu_reset_needed = 0; 5585 int mmu_reset_needed = 0;
5577 int pending_vec, max_bits; 5586 int pending_vec, max_bits, idx;
5578 struct desc_ptr dt; 5587 struct desc_ptr dt;
5579 5588
5580 dt.size = sregs->idt.limit; 5589 dt.size = sregs->idt.limit;
@@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5612 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5613 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu); 5614 update_cpuid(vcpu);
5615
5616 idx = srcu_read_lock(&vcpu->kvm->srcu);
5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5617 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5618 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5608 mmu_reset_needed = 1; 5619 mmu_reset_needed = 1;
5609 } 5620 }
5621 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5610 5622
5611 if (mmu_reset_needed) 5623 if (mmu_reset_needed)
5612 kvm_mmu_reset_context(vcpu); 5624 kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5617 if (pending_vec < max_bits) { 5629 if (pending_vec < max_bits) {
5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5630 kvm_queue_interrupt(vcpu, pending_vec, false);
5619 pr_debug("Set back pending irq %d\n", pending_vec); 5631 pr_debug("Set back pending irq %d\n", pending_vec);
5620 if (irqchip_in_kernel(vcpu->kvm))
5621 kvm_pic_clear_isr_ack(vcpu->kvm);
5622 } 5632 }
5623 5633
5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5634 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5814 5824
5815void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5825void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5816{ 5826{
5817 if (vcpu->arch.time_page) { 5827 kvmclock_reset(vcpu);
5818 kvm_release_page_dirty(vcpu->arch.time_page);
5819 vcpu->arch.time_page = NULL;
5820 }
5821 5828
5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5829 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5823 fx_free(vcpu); 5830 fx_free(vcpu);
@@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5885 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0; 5886 vcpu->arch.apf.msr_val = 0;
5880 5887
5888 kvmclock_reset(vcpu);
5889
5881 kvm_clear_async_pf_completion_queue(vcpu); 5890 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu); 5891 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false; 5892 vcpu->arch.apf.halted = false;
@@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6014 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6015 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6007 6016
6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6017 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6009 6018
6010 return 0; 6019 return 0;
6011} 6020}
@@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6103 int user_alloc) 6112 int user_alloc)
6104{ 6113{
6105 6114
6106 int npages = mem->memory_size >> PAGE_SHIFT; 6115 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6107 6116
6108 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6117 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6109 int ret; 6118 int ret;
@@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6118 "failed to munmap memory\n"); 6127 "failed to munmap memory\n");
6119 } 6128 }
6120 6129
6130 if (!kvm->arch.n_requested_mmu_pages)
6131 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6132
6121 spin_lock(&kvm->mmu_lock); 6133 spin_lock(&kvm->mmu_lock);
6122 if (!kvm->arch.n_requested_mmu_pages) { 6134 if (nr_mmu_pages)
6123 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6124 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6135 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6125 }
6126
6127 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6136 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6128 spin_unlock(&kvm->mmu_lock); 6137 spin_unlock(&kvm->mmu_lock);
6129} 6138}
@@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6157 6166
6158 me = get_cpu(); 6167 me = get_cpu();
6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6168 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6169 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6161 smp_send_reschedule(cpu); 6170 smp_send_reschedule(cpu);
6162 put_cpu(); 6171 put_cpu();
6163} 6172}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index b9ec1c74943c..1cd608973ce5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void)
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 397 * instead we just use the real "cpuid" instruction. Then I pretty much turned
398 * off feature bits until the Guest booted. (Don't say that: you'll damage 398 * off feature bits until the Guest booted. (Don't say that: you'll damage
399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
400 * hardly future proof.) Noone's listening! They don't like you anyway, 400 * hardly future proof.) No one's listening! They don't like you anyway,
401 * parenthetic weirdo! 401 * parenthetic weirdo!
402 * 402 *
403 * Replacing the cpuid so we can turn features off is great for the kernel, but 403 * Replacing the cpuid so we can turn features off is great for the kernel, but
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
42 lib-y += memmove_64.o memset_64.o 42 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o 44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o
45endif 46endif
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..3e8b08a6de2b
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,59 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8#include <linux/linkage.h>
9#include <asm/alternative-asm.h>
10#include <asm/frame.h>
11#include <asm/dwarf2.h>
12
13.text
14
15/*
16 * Inputs:
17 * %rsi : memory location to compare
18 * %rax : low 64 bits of old value
19 * %rdx : high 64 bits of old value
20 * %rbx : low 64 bits of new value
21 * %rcx : high 64 bits of new value
22 * %al : Operation successful
23 */
24ENTRY(this_cpu_cmpxchg16b_emu)
25CFI_STARTPROC
26
27#
28# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
29# via the ZF. Caller will access %al to get result.
30#
31# Note that this is only useful for a cpuops operation. Meaning that we
32# do *not* have a fully atomic operation but just an operation that is
33# *atomic* on a single cpu (as provided by the this_cpu_xx class of
34# macros).
35#
36this_cpu_cmpxchg16b_emu:
37 pushf
38 cli
39
40 cmpq %gs:(%rsi), %rax
41 jne not_same
42 cmpq %gs:8(%rsi), %rdx
43 jne not_same
44
45 movq %rbx, %gs:(%rsi)
46 movq %rcx, %gs:8(%rsi)
47
48 popf
49 mov $1, %al
50 ret
51
52 not_same:
53 popf
54 xor %al,%al
55 ret
56
57CFI_ENDPROC
58
59ENDPROC(this_cpu_cmpxchg16b_emu)
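this_cpu_cmpxchg16b_emu backs the this_cpu_cmpxchg_double() percpu operation on 64-bit CPUs that lack cmpxchg16b; where X86_FEATURE_CX16 is present, the alternatives machinery patches in the real instruction instead. A usage sketch modelled on how SLUB uses the operation (the struct and names here are hypothetical; the two fields must be adjacent and 16-byte aligned):

    struct pair {
        void *ptr;
        unsigned long seq;
    } __attribute__((aligned(16)));

    static DEFINE_PER_CPU(struct pair, my_pair);

    static bool publish(void *old_ptr, unsigned long old_seq, void *new_ptr)
    {
        /* Both fields are compared and swapped as one 16-byte unit,
         * atomic only with respect to the local cpu. */
        return this_cpu_cmpxchg_double(my_pair.ptr, my_pair.seq,
                                       old_ptr, old_seq,
                                       new_ptr, old_seq + 1);
    }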
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..99e482615195 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -117,7 +117,7 @@ ENDPROC(bad_from_user)
117 * rdx count 117 * rdx count
118 * 118 *
119 * Output: 119 * Output:
120 * eax uncopied bytes or 0 if successfull. 120 * eax uncopied bytes or 0 if successful.
121 */ 121 */
122ENTRY(copy_user_generic_unrolled) 122ENTRY(copy_user_generic_unrolled)
123 CFI_STARTPROC 123 CFI_STARTPROC
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 * 3 *
4 * This file is subject to the terms and conditions of the GNU General Public 4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
11 11
12/* 12/*
13 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed. 15 * destination is zeroed.
16 * 16 *
17 * Input 17 * Input
18 * rdi source 18 * rdi source
19 * rsi destination 19 * rsi destination
20 * edx len (32bit) 20 * edx len (32bit)
21 * ecx sum (32bit) 21 * ecx sum (32bit)
22 * r8 src_err_ptr (int) 22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int) 23 * r9 dst_err_ptr (int)
24 * 24 *
25 * Output 25 * Output
26 * eax 64bit sum. undefined in case of exception. 26 * eax 64bit sum. undefined in case of exception.
27 * 27 *
28 * Wrappers need to take care of valid exception sum and zeroing. 28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes. 29 * They also should align source or destination to 8 bytes.
30 */ 30 */
31 31
32 .macro source 32 .macro source
3310: 3310:
34 .section __ex_table,"a" 34 .section __ex_table, "a"
35 .align 8 35 .align 8
36 .quad 10b,.Lbad_source 36 .quad 10b, .Lbad_source
37 .previous 37 .previous
38 .endm 38 .endm
39 39
40 .macro dest 40 .macro dest
4120: 4120:
42 .section __ex_table,"a" 42 .section __ex_table, "a"
43 .align 8 43 .align 8
44 .quad 20b,.Lbad_dest 44 .quad 20b, .Lbad_dest
45 .previous 45 .previous
46 .endm 46 .endm
47 47
48 .macro ignore L=.Lignore 48 .macro ignore L=.Lignore
4930: 4930:
50 .section __ex_table,"a" 50 .section __ex_table, "a"
51 .align 8 51 .align 8
52 .quad 30b,\L 52 .quad 30b, \L
53 .previous 53 .previous
54 .endm 54 .endm
55 55
56 56
57ENTRY(csum_partial_copy_generic) 57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC 58 CFI_STARTPROC
59 cmpl $3*64,%edx 59 cmpl $3*64, %edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8, %rsp
64 CFI_ADJUST_CFA_OFFSET 7*8 64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp) 65 movq %rbx, 2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8 66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp) 67 movq %r12, 3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8 68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp) 69 movq %r14, 4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8 70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp) 71 movq %r13, 5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8 72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp) 73 movq %rbp, 6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8 74 CFI_REL_OFFSET rbp, 6*8
75 75
76 movq %r8,(%rsp) 76 movq %r8, (%rsp)
77 movq %r9,1*8(%rsp) 77 movq %r9, 1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81 78
82 xorl %r9d,%r9d 79 movl %ecx, %eax
83 movq %rcx,%r12 80 movl %edx, %ecx
84 81
85 shrq $6,%r12 82 xorl %r9d, %r9d
86 jz .Lhandle_tail /* < 64 */ 83 movq %rcx, %r12
84
85 shrq $6, %r12
86 jz .Lhandle_tail /* < 64 */
87 87
88 clc 88 clc
89 89
90 /* main loop. clear in 64 byte blocks */ 90 /* main loop. clear in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ 91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */ 92 /* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
94 .p2align 4 94 .p2align 4
95.Lloop: 95.Lloop:
96 source 96 source
97 movq (%rdi),%rbx 97 movq (%rdi), %rbx
98 source 98 source
99 movq 8(%rdi),%r8 99 movq 8(%rdi), %r8
100 source 100 source
101 movq 16(%rdi),%r11 101 movq 16(%rdi), %r11
102 source 102 source
103 movq 24(%rdi),%rdx 103 movq 24(%rdi), %rdx
104 104
105 source 105 source
106 movq 32(%rdi),%r10 106 movq 32(%rdi), %r10
107 source 107 source
108 movq 40(%rdi),%rbp 108 movq 40(%rdi), %rbp
109 source 109 source
110 movq 48(%rdi),%r14 110 movq 48(%rdi), %r14
111 source 111 source
112 movq 56(%rdi),%r13 112 movq 56(%rdi), %r13
113 113
114 ignore 2f 114 ignore 2f
115 prefetcht0 5*64(%rdi) 115 prefetcht0 5*64(%rdi)
1162: 1162:
117 adcq %rbx,%rax 117 adcq %rbx, %rax
118 adcq %r8,%rax 118 adcq %r8, %rax
119 adcq %r11,%rax 119 adcq %r11, %rax
120 adcq %rdx,%rax 120 adcq %rdx, %rax
121 adcq %r10,%rax 121 adcq %r10, %rax
-        adcq %rbp,%rax
-        adcq %r14,%rax
-        adcq %r13,%rax
+        adcq %rbp, %rax
+        adcq %r14, %rax
+        adcq %r13, %rax
 
         decl %r12d
 
         dest
-        movq %rbx,(%rsi)
+        movq %rbx, (%rsi)
         dest
-        movq %r8,8(%rsi)
+        movq %r8, 8(%rsi)
         dest
-        movq %r11,16(%rsi)
+        movq %r11, 16(%rsi)
         dest
-        movq %rdx,24(%rsi)
+        movq %rdx, 24(%rsi)
 
         dest
-        movq %r10,32(%rsi)
+        movq %r10, 32(%rsi)
         dest
-        movq %rbp,40(%rsi)
+        movq %rbp, 40(%rsi)
         dest
-        movq %r14,48(%rsi)
+        movq %r14, 48(%rsi)
         dest
-        movq %r13,56(%rsi)
+        movq %r13, 56(%rsi)
 
 3:
 
-        leaq 64(%rdi),%rdi
-        leaq 64(%rsi),%rsi
+        leaq 64(%rdi), %rdi
+        leaq 64(%rsi), %rsi
 
         jnz .Lloop
 
-        adcq %r9,%rax
+        adcq %r9, %rax
 
-        /* do last upto 56 bytes */
+        /* do last up to 56 bytes */
 .Lhandle_tail:
         /* ecx: count */
-        movl %ecx,%r10d
-        andl $63,%ecx
-        shrl $3,%ecx
+        movl %ecx, %r10d
+        andl $63, %ecx
+        shrl $3, %ecx
         jz .Lfold
         clc
         .p2align 4
 .Lloop_8:
         source
-        movq (%rdi),%rbx
-        adcq %rbx,%rax
+        movq (%rdi), %rbx
+        adcq %rbx, %rax
         decl %ecx
         dest
-        movq %rbx,(%rsi)
-        leaq 8(%rsi),%rsi /* preserve carry */
-        leaq 8(%rdi),%rdi
+        movq %rbx, (%rsi)
+        leaq 8(%rsi), %rsi /* preserve carry */
+        leaq 8(%rdi), %rdi
         jnz .Lloop_8
-        adcq %r9,%rax /* add in carry */
+        adcq %r9, %rax /* add in carry */
 
 .Lfold:
         /* reduce checksum to 32bits */
-        movl %eax,%ebx
-        shrq $32,%rax
-        addl %ebx,%eax
-        adcl %r9d,%eax
+        movl %eax, %ebx
+        shrq $32, %rax
+        addl %ebx, %eax
+        adcl %r9d, %eax
 
-        /* do last upto 6 bytes */
+        /* do last up to 6 bytes */
 .Lhandle_7:
-        movl %r10d,%ecx
-        andl $7,%ecx
-        shrl $1,%ecx
+        movl %r10d, %ecx
+        andl $7, %ecx
+        shrl $1, %ecx
         jz .Lhandle_1
-        movl $2,%edx
-        xorl %ebx,%ebx
+        movl $2, %edx
+        xorl %ebx, %ebx
         clc
         .p2align 4
 .Lloop_1:
         source
-        movw (%rdi),%bx
-        adcl %ebx,%eax
+        movw (%rdi), %bx
+        adcl %ebx, %eax
         decl %ecx
         dest
-        movw %bx,(%rsi)
-        leaq 2(%rdi),%rdi
-        leaq 2(%rsi),%rsi
+        movw %bx, (%rsi)
+        leaq 2(%rdi), %rdi
+        leaq 2(%rsi), %rsi
         jnz .Lloop_1
-        adcl %r9d,%eax /* add in carry */
+        adcl %r9d, %eax /* add in carry */
 
         /* handle last odd byte */
 .Lhandle_1:
-        testl $1,%r10d
+        testl $1, %r10d
         jz .Lende
-        xorl %ebx,%ebx
+        xorl %ebx, %ebx
         source
-        movb (%rdi),%bl
+        movb (%rdi), %bl
         dest
-        movb %bl,(%rsi)
-        addl %ebx,%eax
-        adcl %r9d,%eax /* carry */
+        movb %bl, (%rsi)
+        addl %ebx, %eax
+        adcl %r9d, %eax /* carry */
 
         CFI_REMEMBER_STATE
 .Lende:
-        movq 2*8(%rsp),%rbx
+        movq 2*8(%rsp), %rbx
         CFI_RESTORE rbx
-        movq 3*8(%rsp),%r12
+        movq 3*8(%rsp), %r12
         CFI_RESTORE r12
-        movq 4*8(%rsp),%r14
+        movq 4*8(%rsp), %r14
         CFI_RESTORE r14
-        movq 5*8(%rsp),%r13
+        movq 5*8(%rsp), %r13
         CFI_RESTORE r13
-        movq 6*8(%rsp),%rbp
+        movq 6*8(%rsp), %rbp
         CFI_RESTORE rbp
-        addq $7*8,%rsp
+        addq $7*8, %rsp
         CFI_ADJUST_CFA_OFFSET -7*8
         ret
         CFI_RESTORE_STATE
 
         /* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
-        movq (%rsp),%rax
-        testq %rax,%rax
+        movq (%rsp), %rax
+        testq %rax, %rax
         jz .Lende
-        movl $-EFAULT,(%rax)
+        movl $-EFAULT, (%rax)
         jmp .Lende
 
 .Lbad_dest:
-        movq 8(%rsp),%rax
-        testq %rax,%rax
+        movq 8(%rsp), %rax
+        testq %rax, %rax
         jz .Lende
-        movl $-EFAULT,(%rax)
+        movl $-EFAULT, (%rax)
         jmp .Lende
         CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
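
Note on the tail logic above: the .Lfold and .Lhandle_7 blocks reduce the running 64-bit ones'-complement sum to 32 bits while re-adding carries. Below is a minimal userspace C sketch of the same folding idea, for readers following the assembly; the function name is illustrative and this is not the kernel's helper.

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit ones'-complement accumulator to 16 bits, re-adding the
 * carries at each step, mirroring what .Lfold does in assembly. */
static uint16_t csum_fold64(uint64_t sum)
{
        /* 64 -> 32: add high and low halves, then fold the carry back in */
        uint64_t s = (uint32_t)sum + (uint32_t)(sum >> 32);
        s = (s & 0xffffffffu) + (s >> 32);

        /* 32 -> 16: same trick twice to absorb all carries */
        s = (s & 0xffffu) + (s >> 16);
        s = (s & 0xffffu) + (s >> 16);
        return (uint16_t)~s;    /* Internet checksum is the complement */
}

int main(void)
{
        printf("0x%04x\n", csum_fold64(0x1234567899aabbccULL));
        return 0;
}
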
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
 			count64--;
 		}
 
-		/* last upto 7 8byte blocks */
+		/* last up to 7 8byte blocks */
 		count %= 8;
 		while (count) {
 			asm("addq %1,%0\n\t"
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..d4203988504a 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -326,7 +326,7 @@ try_again:
 	if (mm->free_area_cache < len)
 		goto fail;
 
-	/* either no address requested or cant fit in requested address hole */
+	/* either no address requested or can't fit in requested address hole */
 	addr = (mm->free_area_cache - len) & huge_page_mask(h);
 	do {
 		/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 73ad7ebd6e9c..80088f994193 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -917,7 +917,7 @@ static void mark_nxdata_nx(void)
 {
 	/*
 	 * When this called, init has already been executed and released,
-	 * so everything past _etext sould be NX.
+	 * so everything past _etext should be NX.
 	 */
 	unsigned long start = PFN_ALIGN(_etext);
 	/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a08a62cb136e..794233587287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,8 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <asm/uv/uv.h>
+#include <asm/setup.h>
 
 static int __init parse_direct_gbpages_off(char *arg)
 {
@@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
  * to the compile time generated pmds. This results in invalid pmds up
  * to the point where we hit the physaddr 0 mapping.
  *
- * We limit the mappings to the region from _text to _end.  _end is
- * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
  * well, as they are located before _text:
  */
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
-	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 
-	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
 		if (pmd_none(*pmd))
 			continue;
 		if (vaddr < (unsigned long) _text || vaddr > end)
@@ -860,18 +862,18 @@ static struct vm_area_struct gate_vma = {
 	.vm_flags	= VM_READ | VM_EXEC
 };
 
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
 #ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(tsk, TIF_IA32))
+	if (!mm || mm->context.ia32_compat)
 		return NULL;
 #endif
 	return &gate_vma;
 }
 
-int in_gate_area(struct task_struct *task, unsigned long addr)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
 {
-	struct vm_area_struct *vma = get_gate_vma(task);
+	struct vm_area_struct *vma = get_gate_vma(mm);
 
 	if (!vma)
 		return 0;
@@ -880,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
 }
 
 /*
- * Use this when you have no reliable task/vma, typically from interrupt
- * context. It is less reliable than using the task's vma and may give
- * false positives:
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
  */
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
 {
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
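
Note on the mm-based gate-area API above: callers that used to pass a task_struct now pass a struct mm_struct, which may be NULL (e.g. for a kernel thread); interrupt context uses in_gate_area_no_mm() instead. A hedged sketch of a migrated caller, using only the signatures shown in this diff (the function name is illustrative):

static int addr_is_in_gate(struct mm_struct *mm, unsigned long addr)
{
        /* get_gate_vma(NULL) returns NULL, so no explicit mm check needed */
        struct vm_area_struct *vma = get_gate_vma(mm);

        if (!vma)
                return 0;
        return addr >= vma->vm_start && addr < vma->vm_end;
}
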
@@ -898,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 	return NULL;
 }
 
+#ifdef CONFIG_X86_UV
+#define MIN_MEMORY_BLOCK_SIZE	(1 << SECTION_SIZE_BITS)
+
+unsigned long memory_block_size_bytes(void)
+{
+	if (is_uv_system()) {
+		printk(KERN_INFO "UV: memory block size 2GB\n");
+		return 2UL * 1024 * 1024 * 1024;
+	}
+	return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9ec0f209a6a4..e8c00cc72033 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -446,7 +446,7 @@ static int __init numa_alloc_distance(void)
  * @distance: NUMA distance
  *
  * Set the distance from node @from to @to to @distance.  If distance table
- * doesn't exist, one which is large enough to accomodate all the currently
+ * doesn't exist, one which is large enough to accommodate all the currently
  * known nodes will be created.
  *
  * If such table cannot be allocated, a warning is printed and further
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 90825f2eb0f4..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -310,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 * these shared mappings are made of small page mappings.
 	 * Thus this don't enforce !RW mapping for small page kernel
 	 * text mapping logic will help Linux Xen parvirt guest boot
-	 * aswell.
+	 * as well.
 	 */
 	if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 		pgprot_val(forbidden) |= _PAGE_RW;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0113d19c8aa6..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -168,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 	 * section 8.1: in PAE mode we explicitly have to flush the
 	 * TLB via cr3 if the top-level pgd is changed...
 	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
+	flush_tlb_mm(mm);
 }
 #else  /* !CONFIG_X86_PAE */
 
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 72cbec14d783..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 	if (!user_mode_vm(regs)) {
 		unsigned long stack = kernel_stack_pointer(regs);
 		if (depth)
-			dump_trace(NULL, regs, (unsigned long *)stack,
+			dump_trace(NULL, regs, (unsigned long *)stack, 0,
 				   &backtrace_ops, &depth);
 		return;
 	}
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 9fadec074142..98ab13058f89 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -50,7 +50,7 @@ static inline void setup_num_counters(void)
 #endif
 }
 
-static int inline addr_increment(void)
+static inline int addr_increment(void)
 {
 #ifdef CONFIG_SMP
 	return smp_num_siblings == 2 ? 2 : 1;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b1805b78842f..494f2e7ea2b4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void)
 	e820_reserve_resources_late();
 	/*
 	 * Insert the IO APIC resources after PCI initialization has
-	 * occured to handle IO APICS that are mapped in on a BAR in
+	 * occurred to handle IO APICS that are mapped in on a BAR in
 	 * PCI space, but before trying to assign unassigned pci res.
 	 */
 	ioapic_insert_resources();
@@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	/*
 	 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
 	 * To avoid attribute conflicts, request UC MINUS here
-	 * aswell.
+	 * as well.
 	 */
 	prot |= _PAGE_CACHE_UC_MINUS;
 
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 87e6c8323117..8201165bae28 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		return 1;
 	}
 
-	if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) &&
-	    (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) {
+	if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
+	     device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
+	    || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
+	     device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
+	    || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
+	     device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
 		return 1;
 	}
 
-	if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
-	    (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
-		r->name = "PIIX/ICH";
-		r->get = pirq_piix_get;
-		r->set = pirq_piix_set;
-		return 1;
-	}
 	return 0;
 }
 
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 8c4085a95ef1..e37b407a0ee8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -50,7 +50,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
 		name = "ioapic-level";
 	}
 
-	irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name);
+	irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
 
 	printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
 
@@ -237,6 +237,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
 	int rc;
 	int share = 1;
+	int pirq;
 	u8 gsi;
 
 	rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
@@ -246,13 +247,21 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
 		return rc;
 	}
 
+	rc = xen_allocate_pirq_gsi(gsi);
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
+			 gsi, rc);
+		return rc;
+	}
+	pirq = rc;
+
 	if (gsi < NR_IRQS_LEGACY)
 		share = 0;
 
-	rc = xen_allocate_pirq(gsi, share, "pcifront");
+	rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
 	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n",
-			 gsi, rc);
+		dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+			 gsi, pirq, rc);
 		return rc;
 	}
 
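
Note on the pcifront change above: PIRQ setup is now split into an allocation step and a binding step, each with its own error path. A hedged sketch of the resulting pattern, using only the helpers named in this diff (the wrapper name is illustrative):

static int setup_gsi(u32 gsi, int shareable, const char *name)
{
        int pirq, irq;

        /* step 1: reserve a PIRQ for this GSI */
        pirq = xen_allocate_pirq_gsi(gsi);
        if (pirq < 0)
                return pirq;

        /* step 2: bind the PIRQ to a Linux IRQ number */
        irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
        if (irq < 0)
                return irq;

        return irq;     /* Linux IRQ to hand back to the caller */
}
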
@@ -309,7 +318,7 @@ int __init pci_xen_hvm_init(void)
 #ifdef CONFIG_XEN_DOM0
 static int xen_register_pirq(u32 gsi, int triggering)
 {
-	int rc, irq;
+	int rc, pirq, irq = -1;
 	struct physdev_map_pirq map_irq;
 	int shareable = 0;
 	char *name;
@@ -325,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering)
 		name = "ioapic-level";
 	}
 
-	irq = xen_allocate_pirq(gsi, shareable, name);
-
-	printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
+	pirq = xen_allocate_pirq_gsi(gsi);
+	if (pirq < 0)
+		goto out;
 
+	irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
 	if (irq < 0)
 		goto out;
 
+	printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
+
 	map_irq.domid = DOMID_SELF;
 	map_irq.type = MAP_PIRQ_TYPE_GSI;
 	map_irq.index = gsi;
-	map_irq.pirq = irq;
+	map_irq.pirq = pirq;
 
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
 	if (rc) {
@@ -422,13 +434,18 @@ static int __init pci_xen_initial_domain(void)
 
 void __init xen_setup_pirqs(void)
 {
-	int irq;
+	int pirq, irq;
 
 	pci_xen_initial_domain();
 
 	if (0 == nr_ioapics) {
-		for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
-			xen_allocate_pirq(irq, 0, "xt-pic");
+		for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+			pirq = xen_allocate_pirq_gsi(irq);
+			if (WARN(pirq < 0,
+			    "Could not allocate PIRQ for legacy interrupt\n"))
+				break;
+			irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
+		}
 		return;
 	}
 
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6c..99513642a0e6 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/mfd/core.h>
 
 #include <asm/io.h>
 #include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
 static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 {
 	struct resource *res;
+	int err;
 
 	/* don't run on non-XOs */
 	if (!machine_is_olpc())
 		return -ENODEV;
 
+	err = mfd_cell_enable(pdev);
+	if (err)
+		return err;
+
 	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "can't fetch device resource info\n");
 		return -EIO;
 	}
-
-	if (!request_region(res->start, resource_size(res), DRV_NAME)) {
-		dev_err(&pdev->dev, "can't request region\n");
-		return -EIO;
-	}
-
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = res->start;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
 		acpi_base = res->start;
 
 	/* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 
 static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 {
-	struct resource *r;
-
-	r = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	release_region(r->start, resource_size(r));
+	mfd_cell_disable(pdev);
 
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = 0;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
 		acpi_base = 0;
 
 	pm_power_off = NULL;
@@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 
 static struct platform_driver cs5535_pms_drv = {
 	.driver = {
-		.name = "cs5535-pms",
+		.name = "olpc-xo1-pms",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
 
 static struct platform_driver cs5535_acpi_drv = {
 	.driver = {
-		.name = "cs5535-acpi",
+		.name = "olpc-xo1-acpi",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void)
 {
 	int r;
 
-	r = platform_driver_register(&cs5535_pms_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms");
 	if (r)
 		return r;
 
-	r = platform_driver_register(&cs5535_acpi_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_acpi_drv,
+						"cs5535-acpi");
 	if (r)
-		platform_driver_unregister(&cs5535_pms_drv);
+		mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 
 	return r;
 }
 
 static void __exit olpc_xo1_exit(void)
 {
-	platform_driver_unregister(&cs5535_acpi_drv);
-	platform_driver_unregister(&cs5535_pms_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_acpi_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 }
 
 MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:olpc-xo1");
+MODULE_ALIAS("platform:cs5535-pms");
 
 module_init(olpc_xo1_init);
 module_exit(olpc_xo1_exit);
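
Note on the olpc-xo1 conversion above: the driver becomes an MFD-cell client, so the MFD core claims the I/O region when the cell is enabled and the driver drops its own request_region()/release_region() calls. A hedged sketch of that driver shape, using only the mfd_* helpers named in this diff (everything else is illustrative):

static int example_cell_probe(struct platform_device *pdev)
{
        int err;

        err = mfd_cell_enable(pdev);    /* runs the cell's enable hook */
        if (err)
                return err;

        /* ... look up I/O resources and set up driver state ... */
        return 0;
}

static int example_cell_remove(struct platform_device *pdev)
{
        /* ... tear down driver state ... */
        mfd_cell_disable(pdev);         /* runs the cell's disable hook */
        return 0;
}
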
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 36df991985b2..468d591dde31 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 	return NULL;
 }
 
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
-	struct mm_struct *mm = tsk->mm;
-
-	/* Check to see if this task was created in compat vdso mode */
+	/*
+	 * Check to see if the corresponding task was created in compat vdso
+	 * mode.
+	 */
 	if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
 		return &gate_vma;
 	return NULL;
 }
 
-int in_gate_area(struct task_struct *task, unsigned long addr)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
 {
-	const struct vm_area_struct *vma = get_gate_vma(task);
+	const struct vm_area_struct *vma = get_gate_vma(mm);
 
 	return vma && addr >= vma->vm_start && addr < vma->vm_end;
 }
 
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
 {
 	return 0;
 }
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index e4343fe488ed..1c7121ba18ff 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -38,7 +38,7 @@ config XEN_MAX_DOMAIN_MEMORY
 
 config XEN_SAVE_RESTORE
 	bool
-	depends on XEN && PM
+	depends on XEN
 	default y
 
 config XEN_DEBUG_FS
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f6f3347aa17..c82df6c9c0f0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -79,8 +79,7 @@
 
 /*
  * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
+ * Also protects non-atomic updates of current_pages and balloon lists.
  */
 DEFINE_SPINLOCK(xen_reservation_lock);
 
@@ -1488,10 +1487,12 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
-	 * early_ioremap fixmap slot, make sure it is RO.
+	 * early_ioremap fixmap slot as a freshly allocated page, make sure
+	 * it is RO.
 	 */
-	if (!is_early_ioremap_ptep(ptep) &&
-	    pfn >= pgt_buf_start && pfn < pgt_buf_end)
+	if (((!is_early_ioremap_ptep(ptep) &&
+	    pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+	    (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
 		pte = pte_wrprotect(pte);
 
 	return pte;
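
Note on the mask_rw_pte condition above, restated as a standalone predicate for clarity (illustrative only; the names are not kernel API): a pfn is write-protected when it is a non-fixmap page inside the freshly built pagetable region, or a fixmap page other than the most recently allocated one.

static int make_readonly(int is_fixmap_pte, unsigned long pfn,
                         unsigned long pgt_start, unsigned long pgt_end)
{
        if (!is_fixmap_pte)
                return pfn >= pgt_start && pfn < pgt_end;
        return pfn != pgt_end - 1;
}
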
@@ -1701,9 +1702,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 	for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
 		pte_t pte;
 
-		if (pfn > max_pfn_mapped)
-			max_pfn_mapped = pfn;
-
 		if (!pte_none(pte_page[pteidx]))
 			continue;
 
@@ -1745,7 +1743,7 @@ static void convert_pfn_mfn(void *v)
 }
 
 /*
- * Set up the inital kernel pagetable.
+ * Set up the initial kernel pagetable.
  *
  * We can construct this by grafting the Xen provided pagetable into
  * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1761,6 +1759,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 	pud_t *l3;
 	pmd_t *l2;
 
+	/* max_pfn_mapped is the last pfn mapped in the initial memory
+	 * mappings. Considering that on Xen after the kernel mappings we
+	 * have the mappings of some pages that don't exist in pfn space, we
+	 * set max_pfn_mapped to the last real pfn mapped. */
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+
 	/* Zap identity mapping */
 	init_level4_pgt[0] = __pgd(0);
 
@@ -1865,9 +1869,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 	initial_kernel_pmd =
 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
-				  xen_start_info->nr_pt_frames * PAGE_SIZE +
-				  512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);