author      Ingo Molnar <mingo@elte.hu>    2009-08-23 05:18:47 -0400
committer   Ingo Molnar <mingo@elte.hu>    2009-08-23 05:18:47 -0400
commit      8a517c514d5893602cf85c1b4c47afbbc04d2198 (patch)
tree        e7c40f68ef97bb2bdb4c366c0b45437bc049feb1 /arch/x86
parent      c64b04fe6e0cb7c78e01751a44ef56cf20344e87 (diff)
parent      422bef879e84104fee6dc68ded0e371dbeb5f88e (diff)
Merge commit 'v2.6.31-rc7' into x86/cpu
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 18
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/boot/Makefile | 1
-rw-r--r--  arch/x86/boot/bioscall.S | 2
-rw-r--r--  arch/x86/boot/compressed/Makefile | 1
-rw-r--r--  arch/x86/boot/video-bios.c | 3
-rw-r--r--  arch/x86/boot/video-vesa.c | 4
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 5
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 4
-rw-r--r--  arch/x86/crypto/fpu.c | 4
-rw-r--r--  arch/x86/include/asm/acpi.h | 1
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 2
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 186
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 42
-rw-r--r--  arch/x86/include/asm/boot.h | 6
-rw-r--r--  arch/x86/include/asm/desc.h | 26
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 168
-rw-r--r--  arch/x86/include/asm/efi.h | 5
-rw-r--r--  arch/x86/include/asm/fixmap.h | 10
-rw-r--r--  arch/x86/include/asm/io_apic.h | 2
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/irqflags.h | 8
-rw-r--r--  arch/x86/include/asm/kmap_types.h | 23
-rw-r--r--  arch/x86/include/asm/kmemcheck.h | 42
-rw-r--r--  arch/x86/include/asm/lguest.h | 3
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 18
-rw-r--r--  arch/x86/include/asm/mce.h | 63
-rw-r--r--  arch/x86/include/asm/msr-index.h | 4
-rw-r--r--  arch/x86/include/asm/msr.h | 7
-rw-r--r--  arch/x86/include/asm/nmi.h | 1
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 2
-rw-r--r--  arch/x86/include/asm/pci.h | 3
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 3
-rw-r--r--  arch/x86/include/asm/percpu.h | 10
-rw-r--r--  arch/x86/include/asm/perf_counter.h | 8
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 25
-rw-r--r--  arch/x86/include/asm/pgtable.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 9
-rw-r--r--  arch/x86/include/asm/proto.h | 11
-rw-r--r--  arch/x86/include/asm/spinlock.h | 4
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 2
-rw-r--r--  arch/x86/include/asm/string_32.h | 8
-rw-r--r--  arch/x86/include/asm/string_64.h | 8
-rw-r--r--  arch/x86/include/asm/therm_throt.h | 9
-rw-r--r--  arch/x86/include/asm/thread_info.h | 6
-rw-r--r--  arch/x86/include/asm/timer.h | 6
-rw-r--r--  arch/x86/include/asm/timex.h | 4
-rw-r--r--  arch/x86/include/asm/uaccess.h | 6
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 10
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 2
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 9
-rw-r--r--  arch/x86/include/asm/xor.h | 5
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 80
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 16
-rw-r--r--  arch/x86/kernel/acpi/processor.c | 13
-rw-r--r--  arch/x86/kernel/acpi/realmode/Makefile | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 20
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 39
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 3
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 62
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 3
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 3
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 11
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 44
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/common.c | 61
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 221
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 60
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 93
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 272
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 38
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c (renamed from arch/x86/kernel/cpu/mcheck/mce_amd_64.c) | 0
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 250
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 248
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 48
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 129
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 445
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 17
-rw-r--r--  arch/x86/kernel/cpuid.c | 6
-rw-r--r--  arch/x86/kernel/crash.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 22
-rw-r--r--  arch/x86/kernel/e820.c | 23
-rw-r--r--  arch/x86/kernel/efi.c | 35
-rw-r--r--  arch/x86/kernel/efi_64.c | 6
-rw-r--r--  arch/x86/kernel/entry_32.S | 66
-rw-r--r--  arch/x86/kernel/entry_64.S | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 6
-rw-r--r--  arch/x86/kernel/head_32.S | 7
-rw-r--r--  arch/x86/kernel/head_64.S | 1
-rw-r--r--  arch/x86/kernel/hpet.c | 3
-rw-r--r--  arch/x86/kernel/i8253.c | 1
-rw-r--r--  arch/x86/kernel/init_task.c | 1
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 2
-rw-r--r--  arch/x86/kernel/microcode_core.c | 1
-rw-r--r--  arch/x86/kernel/msr.c | 6
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 2
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/pvclock.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 50
-rw-r--r--  arch/x86/kernel/setup.c | 29
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 221
-rw-r--r--  arch/x86/kernel/stacktrace.c | 7
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 10
-rw-r--r--  arch/x86/kernel/traps.c | 11
-rw-r--r--  arch/x86/kernel/tsc.c | 46
-rw-r--r--  arch/x86/kernel/vmi_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 23
-rw-r--r--  arch/x86/kvm/i8254.c | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 54
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 2
-rw-r--r--  arch/x86/kvm/svm.c | 6
-rw-r--r--  arch/x86/kvm/vmx.c | 23
-rw-r--r--  arch/x86/kvm/x86.c | 45
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 2
-rw-r--r--  arch/x86/lguest/boot.c | 510
-rw-r--r--  arch/x86/lguest/i386_head.S | 112
-rw-r--r--  arch/x86/lib/Makefile | 1
-rw-r--r--  arch/x86/lib/atomic64_32.c | 230
-rw-r--r--  arch/x86/lib/clear_page_64.S | 5
-rw-r--r--  arch/x86/lib/copy_user_64.S | 1
-rw-r--r--  arch/x86/lib/delay.c | 3
-rw-r--r--  arch/x86/lib/msr.c | 26
-rw-r--r--  arch/x86/lib/usercopy_32.c | 2
-rw-r--r--  arch/x86/lib/usercopy_64.c | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/fault.c | 34
-rw-r--r--  arch/x86/mm/gup.c | 67
-rw-r--r--  arch/x86/mm/highmem_32.c | 1
-rw-r--r--  arch/x86/mm/init.c | 20
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 15
-rw-r--r--  arch/x86/mm/kmemcheck/Makefile | 1
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 228
-rw-r--r--  arch/x86/mm/kmemcheck/error.h | 15
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 640
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c | 106
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.h | 9
-rw-r--r--  arch/x86/mm/kmemcheck/pte.c | 22
-rw-r--r--  arch/x86/mm/kmemcheck/pte.h | 10
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c | 69
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.h | 6
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 162
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h | 16
-rw-r--r--  arch/x86/mm/pageattr.c | 106
-rw-r--r--  arch/x86/mm/pgtable.c | 19
-rw-r--r--  arch/x86/mm/srat_64.c | 6
-rw-r--r--  arch/x86/mm/tlb.c | 21
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 2
-rw-r--r--  arch/x86/pci/acpi.c | 70
-rw-r--r--  arch/x86/pci/amd_bus.c | 8
-rw-r--r--  arch/x86/pci/i386.c | 24
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 65
-rw-r--r--  arch/x86/power/Makefile | 2
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 1
178 files changed, 4487 insertions, 2038 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 356d2ec8e2fb..13ffa5df37d7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,15 +24,18 @@ config X86
24 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
25 select HAVE_IDE 25 select HAVE_IDE
26 select HAVE_OPROFILE 26 select HAVE_OPROFILE
27 select HAVE_PERF_COUNTERS if (!M386 && !M486)
27 select HAVE_IOREMAP_PROT 28 select HAVE_IOREMAP_PROT
28 select HAVE_KPROBES 29 select HAVE_KPROBES
29 select ARCH_WANT_OPTIONAL_GPIOLIB 30 select ARCH_WANT_OPTIONAL_GPIOLIB
30 select ARCH_WANT_FRAME_POINTERS 31 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS
31 select HAVE_KRETPROBES 33 select HAVE_KRETPROBES
32 select HAVE_FTRACE_MCOUNT_RECORD 34 select HAVE_FTRACE_MCOUNT_RECORD
33 select HAVE_DYNAMIC_FTRACE 35 select HAVE_DYNAMIC_FTRACE
34 select HAVE_FUNCTION_TRACER 36 select HAVE_FUNCTION_TRACER
35 select HAVE_FUNCTION_GRAPH_TRACER 37 select HAVE_FUNCTION_GRAPH_TRACER
38 select HAVE_FUNCTION_GRAPH_FP_TEST
36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
37 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE 40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
38 select HAVE_FTRACE_SYSCALLS 41 select HAVE_FTRACE_SYSCALLS
@@ -46,6 +49,7 @@ config X86
46 select HAVE_KERNEL_GZIP 49 select HAVE_KERNEL_GZIP
47 select HAVE_KERNEL_BZIP2 50 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_ARCH_KMEMCHECK
49 53
50config OUTPUT_FORMAT 54config OUTPUT_FORMAT
51 string 55 string
@@ -739,7 +743,6 @@ config X86_UP_IOAPIC
739config X86_LOCAL_APIC 743config X86_LOCAL_APIC
740 def_bool y 744 def_bool y
741 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 745 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
742 select HAVE_PERF_COUNTERS if (!M386 && !M486)
743 746
744config X86_IO_APIC 747config X86_IO_APIC
745 def_bool y 748 def_bool y
@@ -1910,25 +1913,26 @@ config DMAR_DEFAULT_ON
1910 recommended you say N here while the DMAR code remains 1913 recommended you say N here while the DMAR code remains
1911 experimental. 1914 experimental.
1912 1915
1913config DMAR_GFX_WA 1916config DMAR_BROKEN_GFX_WA
1914 def_bool y 1917 def_bool n
1915 prompt "Support for Graphics workaround" 1918 prompt "Workaround broken graphics drivers (going away soon)"
1916 depends on DMAR 1919 depends on DMAR
1917 ---help--- 1920 ---help---
1918 Current Graphics drivers tend to use physical address 1921 Current Graphics drivers tend to use physical address
1919 for DMA and avoid using DMA APIs. Setting this config 1922 for DMA and avoid using DMA APIs. Setting this config
1920 option permits the IOMMU driver to set a unity map for 1923 option permits the IOMMU driver to set a unity map for
1921 all the OS-visible memory. Hence the driver can continue 1924 all the OS-visible memory. Hence the driver can continue
1922 to use physical addresses for DMA. 1925 to use physical addresses for DMA, at least until this
1926 option is removed in the 2.6.32 kernel.
1923 1927
1924config DMAR_FLOPPY_WA 1928config DMAR_FLOPPY_WA
1925 def_bool y 1929 def_bool y
1926 depends on DMAR 1930 depends on DMAR
1927 ---help--- 1931 ---help---
1928 Floppy disk drivers are know to bypass DMA API calls 1932 Floppy disk drivers are known to bypass DMA API calls
1929 thereby failing to work when IOMMU is enabled. This 1933 thereby failing to work when IOMMU is enabled. This
1930 workaround will setup a 1:1 mapping for the first 1934 workaround will setup a 1:1 mapping for the first
1931 16M to make floppy (an ISA device) work. 1935 16MiB to make floppy (an ISA device) work.
1932 1936
1933config INTR_REMAP 1937config INTR_REMAP
1934 bool "Support for Interrupt Remapping (EXPERIMENTAL)" 1938 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
81 endif 81 endif
82endif 82endif
83 83
84# Don't unroll struct assignments with kmemcheck enabled
85ifeq ($(CONFIG_KMEMCHECK),y)
86 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
87endif
88
84# Stackpointer is addressed different for 32 bit and 64 bit x86 89# Stackpointer is addressed different for 32 bit and 64 bit x86
85sp-$(CONFIG_X86_32) := esp 90sp-$(CONFIG_X86_32) := esp
86sp-$(CONFIG_X86_64) := rsp 91sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 8d16ada25048..ec749c2bfdd7 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -70,6 +70,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
70 $(call cc-option, -mpreferred-stack-boundary=2) 70 $(call cc-option, -mpreferred-stack-boundary=2)
71KBUILD_CFLAGS += $(call cc-option, -m32) 71KBUILD_CFLAGS += $(call cc-option, -m32)
72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
73GCOV_PROFILE := n
73 74
74$(obj)/bzImage: asflags-y := $(SVGA_MODE) 75$(obj)/bzImage: asflags-y := $(SVGA_MODE)
75 76
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 507793739ea5..1dfbf64e52a2 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -13,7 +13,7 @@
13 * touching registers they shouldn't be. 13 * touching registers they shouldn't be.
14 */ 14 */
15 15
16 .code16 16 .code16gcc
17 .text 17 .text
18 .globl intcall 18 .globl intcall
19 .type intcall, @function 19 .type intcall, @function
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 49c8a4c37d7c..e2ff504b4ddc 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -15,6 +15,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
15KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) 15KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
16 16
17KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 17KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
18GCOV_PROFILE := n
18 19
19LDFLAGS := -m elf_$(UTS_MACHINE) 20LDFLAGS := -m elf_$(UTS_MACHINE)
20LDFLAGS_vmlinux := -T 21LDFLAGS_vmlinux := -T
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index d660be492363..49e0c18833e0 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode)
37 ireg.al = mode; /* AH=0x00 Set Video Mode */ 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
38 intcall(0x10, &ireg, NULL); 38 intcall(0x10, &ireg, NULL);
39 39
40
41 ireg.ah = 0x0f; /* Get Current Video Mode */ 40 ireg.ah = 0x0f; /* Get Current Video Mode */
42 intcall(0x10, &ireg, &oreg); 41 intcall(0x10, &ireg, &oreg);
43 42
44 do_restore = 1; /* Assume video contents were lost */ 43 do_restore = 1; /* Assume video contents were lost */
45 44
46 /* Not all BIOSes are clean with the top bit */ 45 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f; 46 new_mode = oreg.al & 0x7f;
48 47
49 if (new_mode == mode) 48 if (new_mode == mode)
50 return 0; /* Mode change OK */ 49 return 0; /* Mode change OK */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index c700147d6ffb..275dd177f198 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -45,7 +45,7 @@ static int vesa_probe(void)
45 ireg.di = (size_t)&vginfo; 45 ireg.di = (size_t)&vginfo;
46 intcall(0x10, &ireg, &oreg); 46 intcall(0x10, &ireg, &oreg);
47 47
48 if (ireg.ax != 0x004f || 48 if (oreg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -70,7 +70,7 @@ static int vesa_probe(void)
70 ireg.di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 intcall(0x10, &ireg, &oreg); 71 intcall(0x10, &ireg, &oreg);
72 72
73 if (ireg.ax != 0x004f) 73 if (oreg.ax != 0x004f)
74 continue; 74 continue;
75 75
76 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
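
Note: the video-bios.c and video-vesa.c hunks above fix the same slip. intcall() reports BIOS results through a separate output-register block, so the status has to be read from oreg, not from the untouched input block ireg. A minimal user-space sketch of that in/out pattern (the fake_intcall() stub below is purely illustrative; the real biosregs layout and intcall() are in the boot code):

#include <stdio.h>
#include <stdint.h>

struct regs { uint16_t ax, bx, cx, dx; };

/* Illustrative stand-in for intcall(): works on a private copy of the
 * inputs and reports results only through *out. */
static void fake_intcall(uint8_t vector, const struct regs *in, struct regs *out)
{
	struct regs r = *in;

	(void)vector;
	r.ax = 0x004f;		/* pretend the BIOS call succeeded */
	if (out)
		*out = r;
}

int main(void)
{
	struct regs ireg = { .ax = 0x4f00 }, oreg = { 0 };

	fake_intcall(0x10, &ireg, &oreg);

	/* The status lives in the output block; ireg.ax still holds 0x4f00. */
	if (oreg.ax != 0x004f)
		printf("VESA call failed\n");
	else
		printf("VESA call ok (ireg.ax is still 0x%04x)\n", ireg.ax);
	return 0;
}
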
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index caba99601703..eb0566e83319 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc)
845 */ 845 */
846ENTRY(aesni_cbc_dec) 846ENTRY(aesni_cbc_dec)
847 cmp $16, LEN 847 cmp $16, LEN
848 jb .Lcbc_dec_ret 848 jb .Lcbc_dec_just_ret
849 mov 480(KEYP), KLEN 849 mov 480(KEYP), KLEN
850 add $240, KEYP 850 add $240, KEYP
851 movups (IVP), IV 851 movups (IVP), IV
@@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec)
891 add $16, OUTP 891 add $16, OUTP
892 cmp $16, LEN 892 cmp $16, LEN
893 jge .Lcbc_dec_loop1 893 jge .Lcbc_dec_loop1
894 movups IV, (IVP)
895.Lcbc_dec_ret: 894.Lcbc_dec_ret:
895 movups IV, (IVP)
896.Lcbc_dec_just_ret:
896 ret 897 ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 4e663398f77f..c580c5ec1cad 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -198,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
198 198
199 blkcipher_walk_init(&walk, dst, src, nbytes); 199 blkcipher_walk_init(&walk, dst, src, nbytes);
200 err = blkcipher_walk_virt(desc, &walk); 200 err = blkcipher_walk_virt(desc, &walk);
201 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
201 202
202 kernel_fpu_begin(); 203 kernel_fpu_begin();
203 while ((nbytes = walk.nbytes)) { 204 while ((nbytes = walk.nbytes)) {
@@ -221,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
221 222
222 blkcipher_walk_init(&walk, dst, src, nbytes); 223 blkcipher_walk_init(&walk, dst, src, nbytes);
223 err = blkcipher_walk_virt(desc, &walk); 224 err = blkcipher_walk_virt(desc, &walk);
225 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
224 226
225 kernel_fpu_begin(); 227 kernel_fpu_begin();
226 while ((nbytes = walk.nbytes)) { 228 while ((nbytes = walk.nbytes)) {
@@ -266,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
266 268
267 blkcipher_walk_init(&walk, dst, src, nbytes); 269 blkcipher_walk_init(&walk, dst, src, nbytes);
268 err = blkcipher_walk_virt(desc, &walk); 270 err = blkcipher_walk_virt(desc, &walk);
271 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
269 272
270 kernel_fpu_begin(); 273 kernel_fpu_begin();
271 while ((nbytes = walk.nbytes)) { 274 while ((nbytes = walk.nbytes)) {
@@ -289,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
289 292
290 blkcipher_walk_init(&walk, dst, src, nbytes); 293 blkcipher_walk_init(&walk, dst, src, nbytes);
291 err = blkcipher_walk_virt(desc, &walk); 294 err = blkcipher_walk_virt(desc, &walk);
295 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
292 296
293 kernel_fpu_begin(); 297 kernel_fpu_begin();
294 while ((nbytes = walk.nbytes)) { 298 while ((nbytes = walk.nbytes)) {
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 5f9781a3815f..daef6cd2b45d 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -48,7 +48,7 @@ static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
48 struct blkcipher_desc desc = { 48 struct blkcipher_desc desc = {
49 .tfm = child, 49 .tfm = child,
50 .info = desc_in->info, 50 .info = desc_in->info,
51 .flags = desc_in->flags, 51 .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
52 }; 52 };
53 53
54 kernel_fpu_begin(); 54 kernel_fpu_begin();
@@ -67,7 +67,7 @@ static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
67 struct blkcipher_desc desc = { 67 struct blkcipher_desc desc = {
68 .tfm = child, 68 .tfm = child,
69 .info = desc_in->info, 69 .info = desc_in->info,
70 .flags = desc_in->flags, 70 .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
71 }; 71 };
72 72
73 kernel_fpu_begin(); 73 kernel_fpu_begin();
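
Note: the aesni-intel_glue.c and fpu.c hunks share one idea. Everything between kernel_fpu_begin() and kernel_fpu_end() runs with preemption disabled, so the blkcipher walk must not be allowed to sleep; masking off CRYPTO_TFM_REQ_MAY_SLEEP before entering the FPU section enforces that. A condensed kernel-style sketch of the pattern, using only names that appear in the hunks above (not a standalone program):

#include <crypto/algapi.h>
#include <asm/i387.h>

static int ecb_crypt_sketch(struct blkcipher_desc *desc,
			    struct scatterlist *dst, struct scatterlist *src,
			    unsigned int nbytes)
{
	struct blkcipher_walk walk;
	int err;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);

	/* No sleeping once the FPU section starts. */
	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;

	kernel_fpu_begin();
	while ((nbytes = walk.nbytes)) {
		/* ... process walk.src/walk.dst with AES-NI here ... */
		err = blkcipher_walk_done(desc, &walk, 0);
	}
	kernel_fpu_end();

	return err;
}
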
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..20d1465a2ab0 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,6 +144,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
144 144
145#else /* !CONFIG_ACPI */ 145#else /* !CONFIG_ACPI */
146 146
147#define acpi_disabled 1
147#define acpi_lapic 0 148#define acpi_lapic 0
148#define acpi_ioapic 0 149#define acpi_ioapic 0
149static inline void acpi_noirq_set(void) { } 150static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 262e02820049..bdf96f119f06 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -29,9 +29,11 @@ extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void); 30extern void amd_iommu_flush_all_domains(void);
31extern void amd_iommu_flush_all_devices(void); 31extern void amd_iommu_flush_all_devices(void);
32extern void amd_iommu_shutdown(void);
32#else 33#else
33static inline int amd_iommu_init(void) { return -ENODEV; } 34static inline int amd_iommu_init(void) { return -ENODEV; }
34static inline void amd_iommu_detect(void) { } 35static inline void amd_iommu_detect(void) { }
36static inline void amd_iommu_shutdown(void) { }
35#endif 37#endif
36 38
37#endif /* _ASM_X86_AMD_IOMMU_H */ 39#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 8cb9c814e120..dc5a667ff791 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -19,7 +19,10 @@
19 * 19 *
20 * Atomically reads the value of @v. 20 * Atomically reads the value of @v.
21 */ 21 */
22#define atomic_read(v) ((v)->counter) 22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
23 26
24/** 27/**
25 * atomic_set - set atomic variable 28 * atomic_set - set atomic variable
@@ -28,7 +31,10 @@
28 * 31 *
29 * Atomically sets the value of @v to @i. 32 * Atomically sets the value of @v to @i.
30 */ 33 */
31#define atomic_set(v, i) (((v)->counter) = (i)) 34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
32 38
33/** 39/**
34 * atomic_add - add integer to atomic variable 40 * atomic_add - add integer to atomic variable
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v)
200 return atomic_add_return(-i, v); 206 return atomic_add_return(-i, v);
201} 207}
202 208
203#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
204#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
205 218
206/** 219/**
207 * atomic_add_unless - add unless the number is already a given value 220 * atomic_add_unless - add unless the number is already a given value
@@ -250,67 +263,22 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
250/* An 64bit atomic type */ 263/* An 64bit atomic type */
251 264
252typedef struct { 265typedef struct {
253 unsigned long long counter; 266 u64 __aligned(8) counter;
254} atomic64_t; 267} atomic64_t;
255 268
256#define ATOMIC64_INIT(val) { (val) } 269#define ATOMIC64_INIT(val) { (val) }
257 270
258/** 271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292 272
293/** 273/**
294 * atomic64_xchg - xchg atomic64 variable 274 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t 275 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign 276 * @new_val: value to assign
297 * @old_val: old value that was there
298 * 277 *
299 * Atomically xchgs the value of @ptr to @new_val and returns 278 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value. 279 * the old value.
301 */ 280 */
302 281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = atomic_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314 282
315/** 283/**
316 * atomic64_set - set atomic64 variable 284 * atomic64_set - set atomic64 variable
@@ -319,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
319 * 287 *
320 * Atomically sets the value of @ptr to @new_val. 288 * Atomically sets the value of @ptr to @new_val.
321 */ 289 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) 290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
323{
324 atomic64_xchg(ptr, new_val);
325}
326 291
327/** 292/**
328 * atomic64_read - read atomic64 variable 293 * atomic64_read - read atomic64 variable
@@ -330,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
330 * 295 *
331 * Atomically reads the value of @ptr and returns it. 296 * Atomically reads the value of @ptr and returns it.
332 */ 297 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr) 298static inline u64 atomic64_read(atomic64_t *ptr)
334{ 299{
335 unsigned long long curr_val; 300 u64 res;
336 301
337 do { 302 /*
338 curr_val = __atomic64_read(ptr); 303 * Note, we inline this atomic64_t primitive because
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); 304 * it only clobbers EAX/EDX and leaves the others
340 305 * untouched. We also (somewhat subtly) rely on the
341 return curr_val; 306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
342} 318}
343 319
320extern u64 atomic64_read(atomic64_t *ptr);
321
344/** 322/**
345 * atomic64_add_return - add and return 323 * atomic64_add_return - add and return
346 * @delta: integer value to add 324 * @delta: integer value to add
@@ -348,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr)
348 * 326 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr 327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */ 328 */
351static inline unsigned long long 329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = atomic_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369 330
370static inline long atomic64_inc_return(atomic64_t *ptr) 331/*
371{ 332 * Other variants with different arithmetic operators:
372 return atomic64_add_return(1, ptr); 333 */
373} 334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
374 335extern u64 atomic64_inc_return(atomic64_t *ptr);
375static inline long atomic64_dec_return(atomic64_t *ptr) 336extern u64 atomic64_dec_return(atomic64_t *ptr);
376{
377 return atomic64_sub_return(1, ptr);
378}
379 337
380/** 338/**
381 * atomic64_add - add integer to atomic64 variable 339 * atomic64_add - add integer to atomic64 variable
@@ -384,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr)
384 * 342 *
385 * Atomically adds @delta to @ptr. 343 * Atomically adds @delta to @ptr.
386 */ 344 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) 345extern void atomic64_add(u64 delta, atomic64_t *ptr);
388{
389 atomic64_add_return(delta, ptr);
390}
391 346
392/** 347/**
393 * atomic64_sub - subtract the atomic64 variable 348 * atomic64_sub - subtract the atomic64 variable
@@ -396,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
396 * 351 *
397 * Atomically subtracts @delta from @ptr. 352 * Atomically subtracts @delta from @ptr.
398 */ 353 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) 354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
400{
401 atomic64_add(-delta, ptr);
402}
403 355
404/** 356/**
405 * atomic64_sub_and_test - subtract value from variable and test result 357 * atomic64_sub_and_test - subtract value from variable and test result
@@ -410,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
410 * true if the result is zero, or false for all 362 * true if the result is zero, or false for all
411 * other cases. 363 * other cases.
412 */ 364 */
413static inline int 365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420 366
421/** 367/**
422 * atomic64_inc - increment atomic64 variable 368 * atomic64_inc - increment atomic64 variable
@@ -424,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
424 * 370 *
425 * Atomically increments @ptr by 1. 371 * Atomically increments @ptr by 1.
426 */ 372 */
427static inline void atomic64_inc(atomic64_t *ptr) 373extern void atomic64_inc(atomic64_t *ptr);
428{
429 atomic64_add(1, ptr);
430}
431 374
432/** 375/**
433 * atomic64_dec - decrement atomic64 variable 376 * atomic64_dec - decrement atomic64 variable
@@ -435,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr)
435 * 378 *
436 * Atomically decrements @ptr by 1. 379 * Atomically decrements @ptr by 1.
437 */ 380 */
438static inline void atomic64_dec(atomic64_t *ptr) 381extern void atomic64_dec(atomic64_t *ptr);
439{
440 atomic64_sub(1, ptr);
441}
442 382
443/** 383/**
444 * atomic64_dec_and_test - decrement and test 384 * atomic64_dec_and_test - decrement and test
@@ -448,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr)
448 * returns true if the result is 0, or false for all other 388 * returns true if the result is 0, or false for all other
449 * cases. 389 * cases.
450 */ 390 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr) 391extern int atomic64_dec_and_test(atomic64_t *ptr);
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455 392
456/** 393/**
457 * atomic64_inc_and_test - increment and test 394 * atomic64_inc_and_test - increment and test
@@ -461,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr)
461 * and returns true if the result is zero, or false for all 398 * and returns true if the result is zero, or false for all
462 * other cases. 399 * other cases.
463 */ 400 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr) 401extern int atomic64_inc_and_test(atomic64_t *ptr);
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468 402
469/** 403/**
470 * atomic64_add_negative - add and test if negative 404 * atomic64_add_negative - add and test if negative
@@ -475,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr)
475 * if the result is negative, or false when 409 * if the result is negative, or false when
476 * result is greater than or equal to zero. 410 * result is greater than or equal to zero.
477 */ 411 */
478static inline int 412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485 413
486#include <asm-generic/atomic-long.h> 414#include <asm-generic/atomic-long.h>
487#endif /* _ASM_X86_ATOMIC_32_H */ 415#endif /* _ASM_X86_ATOMIC_32_H */
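
Note: the 32-bit atomic64 helpers that go out of line here (into arch/x86/lib/atomic64_32.c, per the diffstat) follow the usual compare-and-swap retry loop: read the old value, compute the new one, retry until the 64-bit cmpxchg lands. A small user-space sketch of that loop using the GCC __atomic builtins instead of raw cmpxchg8b (illustration only, not the kernel's implementation):

#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t counter; } atomic64_sketch_t;

/* Mirror of the "old = read; new = old + delta; cmpxchg until it
 * sticks" loop used by the out-of-line atomic64_* helpers. */
static uint64_t atomic64_add_return_sketch(uint64_t delta, atomic64_sketch_t *ptr)
{
	uint64_t old_val, new_val;

	do {
		old_val = __atomic_load_n(&ptr->counter, __ATOMIC_RELAXED);
		new_val = old_val + delta;
	} while (!__atomic_compare_exchange_n(&ptr->counter, &old_val, new_val,
					      0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));

	return new_val;
}

int main(void)
{
	atomic64_sketch_t v = { 5 };

	printf("%llu\n", (unsigned long long)atomic64_add_return_sketch(10, &v));
	return 0;
}
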
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 0d6360220007..d605dc268e79 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -18,7 +18,10 @@
18 * 18 *
19 * Atomically reads the value of @v. 19 * Atomically reads the value of @v.
20 */ 20 */
21#define atomic_read(v) ((v)->counter) 21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
22 25
23/** 26/**
24 * atomic_set - set atomic variable 27 * atomic_set - set atomic variable
@@ -27,7 +30,10 @@
27 * 30 *
28 * Atomically sets the value of @v to @i. 31 * Atomically sets the value of @v to @i.
29 */ 32 */
30#define atomic_set(v, i) (((v)->counter) = (i)) 33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
31 37
32/** 38/**
33 * atomic_add - add integer to atomic variable 39 * atomic_add - add integer to atomic variable
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
192 * Atomically reads the value of @v. 198 * Atomically reads the value of @v.
193 * Doesn't imply a read memory barrier. 199 * Doesn't imply a read memory barrier.
194 */ 200 */
195#define atomic64_read(v) ((v)->counter) 201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
196 205
197/** 206/**
198 * atomic64_set - set atomic64 variable 207 * atomic64_set - set atomic64 variable
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
201 * 210 *
202 * Atomically sets the value of @v to @i. 211 * Atomically sets the value of @v to @i.
203 */ 212 */
204#define atomic64_set(v, i) (((v)->counter) = (i)) 213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
205 217
206/** 218/**
207 * atomic64_add - add integer to atomic64 variable 219 * atomic64_add - add integer to atomic64 variable
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
355#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) 367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
356#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) 368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
357 369
358#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
359#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) 371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
360 379
361#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
362#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
363 389
364/** 390/**
365 * atomic_add_unless - add unless the number is a given value 391 * atomic_add_unless - add unless the number is a given value
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 418e632d4a80..7a1065958ba9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,7 +8,7 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h> 11#include <asm/pgtable_types.h>
12 12
13/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
@@ -16,10 +16,10 @@
16 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
17 17
18/* Minimum kernel alignment, as a power of two */ 18/* Minimum kernel alignment, as a power of two */
19#ifdef CONFIG_x86_64 19#ifdef CONFIG_X86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT 20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else 21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) 22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER)
23#endif 23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) 24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25 25
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c45f415ce315..c993e9e0fed4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_DESC_H 1#ifndef _ASM_X86_DESC_H
2#define _ASM_X86_DESC_H 2#define _ASM_X86_DESC_H
3 3
4#ifndef __ASSEMBLY__
5#include <asm/desc_defs.h> 4#include <asm/desc_defs.h>
6#include <asm/ldt.h> 5#include <asm/ldt.h>
7#include <asm/mmu.h> 6#include <asm/mmu.h>
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
380 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); 379 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
381} 380}
382 381
383#else
384/*
385 * GET_DESC_BASE reads the descriptor base of the specified segment.
386 *
387 * Args:
388 * idx - descriptor index
389 * gdt - GDT pointer
390 * base - 32bit register to which the base will be written
391 * lo_w - lo word of the "base" register
392 * lo_b - lo byte of the "base" register
393 * hi_b - hi byte of the low word of the "base" register
394 *
395 * Example:
396 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
397 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
398 */
399#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
400 movb idx * 8 + 4(gdt), lo_b; \
401 movb idx * 8 + 7(gdt), hi_b; \
402 shll $16, base; \
403 movw idx * 8 + 2(gdt), lo_w;
404
405
406#endif /* __ASSEMBLY__ */
407
408#endif /* _ASM_X86_DESC_H */ 382#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..1c3f9435f1c9 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
6 * Documentation/DMA-API.txt for documentation. 6 * Documentation/DMA-API.txt for documentation.
7 */ 7 */
8 8
9#include <linux/kmemcheck.h>
9#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
10#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
@@ -32,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
32#endif 33#endif
33} 34}
34 35
36#include <asm-generic/dma-mapping-common.h>
37
35/* Make sure we keep the same behaviour */ 38/* Make sure we keep the same behaviour */
36static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 39static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
37{ 40{
@@ -52,171 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask);
52extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
53 dma_addr_t *dma_addr, gfp_t flag); 56 dma_addr_t *dma_addr, gfp_t flag);
54 57
55static inline dma_addr_t
56dma_map_single(struct device *hwdev, void *ptr, size_t size,
57 enum dma_data_direction dir)
58{
59 struct dma_map_ops *ops = get_dma_ops(hwdev);
60 dma_addr_t addr;
61
62 BUG_ON(!valid_dma_direction(dir));
63 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL);
66 debug_dma_map_page(hwdev, virt_to_page(ptr),
67 (unsigned long)ptr & ~PAGE_MASK, size,
68 dir, addr, true);
69 return addr;
70}
71
72static inline void
73dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
74 enum dma_data_direction dir)
75{
76 struct dma_map_ops *ops = get_dma_ops(dev);
77
78 BUG_ON(!valid_dma_direction(dir));
79 if (ops->unmap_page)
80 ops->unmap_page(dev, addr, size, dir, NULL);
81 debug_dma_unmap_page(dev, addr, size, dir, true);
82}
83
84static inline int
85dma_map_sg(struct device *hwdev, struct scatterlist *sg,
86 int nents, enum dma_data_direction dir)
87{
88 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents;
90
91 BUG_ON(!valid_dma_direction(dir));
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
94
95 return ents;
96}
97
98static inline void
99dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
100 enum dma_data_direction dir)
101{
102 struct dma_map_ops *ops = get_dma_ops(hwdev);
103
104 BUG_ON(!valid_dma_direction(dir));
105 debug_dma_unmap_sg(hwdev, sg, nents, dir);
106 if (ops->unmap_sg)
107 ops->unmap_sg(hwdev, sg, nents, dir, NULL);
108}
109
110static inline void
111dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
112 size_t size, enum dma_data_direction dir)
113{
114 struct dma_map_ops *ops = get_dma_ops(hwdev);
115
116 BUG_ON(!valid_dma_direction(dir));
117 if (ops->sync_single_for_cpu)
118 ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
119 debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
120 flush_write_buffers();
121}
122
123static inline void
124dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
125 size_t size, enum dma_data_direction dir)
126{
127 struct dma_map_ops *ops = get_dma_ops(hwdev);
128
129 BUG_ON(!valid_dma_direction(dir));
130 if (ops->sync_single_for_device)
131 ops->sync_single_for_device(hwdev, dma_handle, size, dir);
132 debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
133 flush_write_buffers();
134}
135
136static inline void
137dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
138 unsigned long offset, size_t size,
139 enum dma_data_direction dir)
140{
141 struct dma_map_ops *ops = get_dma_ops(hwdev);
142
143 BUG_ON(!valid_dma_direction(dir));
144 if (ops->sync_single_range_for_cpu)
145 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
146 size, dir);
147 debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
148 offset, size, dir);
149 flush_write_buffers();
150}
151
152static inline void
153dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
154 unsigned long offset, size_t size,
155 enum dma_data_direction dir)
156{
157 struct dma_map_ops *ops = get_dma_ops(hwdev);
158
159 BUG_ON(!valid_dma_direction(dir));
160 if (ops->sync_single_range_for_device)
161 ops->sync_single_range_for_device(hwdev, dma_handle,
162 offset, size, dir);
163 debug_dma_sync_single_range_for_device(hwdev, dma_handle,
164 offset, size, dir);
165 flush_write_buffers();
166}
167
168static inline void
169dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
170 int nelems, enum dma_data_direction dir)
171{
172 struct dma_map_ops *ops = get_dma_ops(hwdev);
173
174 BUG_ON(!valid_dma_direction(dir));
175 if (ops->sync_sg_for_cpu)
176 ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
177 debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
178 flush_write_buffers();
179}
180
181static inline void
182dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
183 int nelems, enum dma_data_direction dir)
184{
185 struct dma_map_ops *ops = get_dma_ops(hwdev);
186
187 BUG_ON(!valid_dma_direction(dir));
188 if (ops->sync_sg_for_device)
189 ops->sync_sg_for_device(hwdev, sg, nelems, dir);
190 debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
191
192 flush_write_buffers();
193}
194
195static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
196 size_t offset, size_t size,
197 enum dma_data_direction dir)
198{
199 struct dma_map_ops *ops = get_dma_ops(dev);
200 dma_addr_t addr;
201
202 BUG_ON(!valid_dma_direction(dir));
203 addr = ops->map_page(dev, page, offset, size, dir, NULL);
204 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205
206 return addr;
207}
208
209static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
210 size_t size, enum dma_data_direction dir)
211{
212 struct dma_map_ops *ops = get_dma_ops(dev);
213
214 BUG_ON(!valid_dma_direction(dir));
215 if (ops->unmap_page)
216 ops->unmap_page(dev, addr, size, dir, NULL);
217 debug_dma_unmap_page(dev, addr, size, dir, false);
218}
219
220static inline void 58static inline void
221dma_cache_sync(struct device *dev, void *vaddr, size_t size, 59dma_cache_sync(struct device *dev, void *vaddr, size_t size,
222 enum dma_data_direction dir) 60 enum dma_data_direction dir)
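
Note: the large deletion above is not a behaviour change. The same dma_map_single()/dma_map_sg()/dma_sync_*() wrappers are now pulled in from asm-generic/dma-mapping-common.h (the new include added right after get_dma_ops()), so callers keep using the API exactly as before. A hypothetical driver fragment for illustration; dev, buf and len are assumed to come from the surrounding driver:

#include <linux/dma-mapping.h>

/* Hypothetical helper: map "buf" for a device-to-memory transfer and
 * hand the bus address back to the caller. */
static int start_rx_dma(struct device *dev, void *buf, size_t len,
			dma_addr_t *handle)
{
	*handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, *handle))
		return -ENOMEM;

	/* ... program the device with *handle; once the transfer
	 * completes, the caller undoes the mapping with
	 * dma_unmap_single(dev, *handle, len, DMA_FROM_DEVICE). */
	return 0;
}
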
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e708..8406ed7f9926 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ 33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6) 34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35 35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size) 36#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) 85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 86
87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); 87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
88 u32 type);
88 89
89#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
90 91
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 2d81af3974a0..7b2d71df39a6 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -111,12 +111,9 @@ enum fixed_addresses {
111#ifdef CONFIG_PARAVIRT 111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP, 112 FIX_PARAVIRT_BOOTMAP,
113#endif 113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ 114 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1, 115 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
116 __end_of_permanent_fixed_addresses, 116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /* 117 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional. 119 * before ioremap() is functional.
@@ -129,6 +126,9 @@ enum fixed_addresses {
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255), 127 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE,
131#endif
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index daf866ed0612..330ee807f89e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr); 161 struct io_apic_irq_attr *irq_attr);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void);
164 165
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 166extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 167extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
@@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin,
180#define io_apic_assign_pci_irqs 0 181#define io_apic_assign_pci_irqs 0
181static const int timer_through_8259 = 0; 182static const int timer_through_8259 = 0;
182static inline void ioapic_init_mappings(void) { } 183static inline void ioapic_init_mappings(void) { }
184static inline void ioapic_insert_resources(void) { }
183 185
184static inline void probe_nr_irqs_gsi(void) { } 186static inline void probe_nr_irqs_gsi(void) { }
185#endif 187#endif
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index af326a2975b5..fd6d21bbee6c 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
6extern struct dma_map_ops nommu_dma_ops; 6extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9extern int iommu_pass_through;
9 10
10/* 10 seconds */ 11/* 10 seconds */
11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 12#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f0898..c6ccbe7e81ad 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
12{ 12{
13 unsigned long flags; 13 unsigned long flags;
14 14
15 /*
16 * Note: this needs to be "=r" not "=rm", because we have the
17 * stack offset from what gcc expects at the time the "pop" is
18 * executed, and so a memory reference with respect to the stack
19 * would end up using the wrong address.
20 */
15 asm volatile("# __raw_save_flags\n\t" 21 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0" 22 "pushf ; pop %0"
17 : "=g" (flags) 23 : "=r" (flags)
18 : /* no input */ 24 : /* no input */
19 : "memory"); 25 : "memory");
20 26
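
Note: the constraint change above is subtle. With "=g" the compiler may pick a stack-relative memory operand, but pushf has already moved the stack pointer by the time the pop stores its result, so the write can land at the wrong slot; forcing a register output with "=r" avoids that. A user-space sketch of the corrected pattern (illustrative; the kernel's version is the one in the hunk):

#include <stdio.h>

/* Read the x86 flags register the same way native_save_fl() does:
 * pushf puts the flags on the stack, pop moves them into a register.
 * The "=r" output keeps the compiler from choosing a stack-relative
 * memory operand that pushf would have shifted out from under us. */
static unsigned long save_flags_sketch(void)
{
	unsigned long flags;

	asm volatile("pushf ; pop %0"
		     : "=r" (flags)
		     : /* no input */
		     : "memory");
	return flags;
}

int main(void)
{
	printf("flags = %#lx\n", save_flags_sketch());
	return 0;
}
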
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5cf..9e00a731a7fb 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
2#define _ASM_X86_KMAP_TYPES_H 2#define _ASM_X86_KMAP_TYPES_H
3 3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) 4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n , 5#define __WITH_KM_FENCE
6#else
7# define D(n)
8#endif 6#endif
9 7
10enum km_type { 8#include <asm-generic/kmap_types.h>
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26 9
27#undef D 10#undef __WITH_KM_FENCE
28 11
29#endif /* _ASM_X86_KMAP_TYPES_H */ 12#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
1#ifndef ASM_X86_KMEMCHECK_H
2#define ASM_X86_KMEMCHECK_H
3
4#include <linux/types.h>
5#include <asm/ptrace.h>
6
7#ifdef CONFIG_KMEMCHECK
8bool kmemcheck_active(struct pt_regs *regs);
9
10void kmemcheck_show(struct pt_regs *regs);
11void kmemcheck_hide(struct pt_regs *regs);
12
13bool kmemcheck_fault(struct pt_regs *regs,
14 unsigned long address, unsigned long error_code);
15bool kmemcheck_trap(struct pt_regs *regs);
16#else
17static inline bool kmemcheck_active(struct pt_regs *regs)
18{
19 return false;
20}
21
22static inline void kmemcheck_show(struct pt_regs *regs)
23{
24}
25
26static inline void kmemcheck_hide(struct pt_regs *regs)
27{
28}
29
30static inline bool kmemcheck_fault(struct pt_regs *regs,
31 unsigned long address, unsigned long error_code)
32{
33 return false;
34}
35
36static inline bool kmemcheck_trap(struct pt_regs *regs)
37{
38 return false;
39}
40#endif /* CONFIG_KMEMCHECK */
41
42#endif
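
Note: the new header is written so that callers stay unconditional. With CONFIG_KMEMCHECK disabled, the static inline stubs compile away to nothing. A sketch of how a caller might look (hypothetical fragment, not the actual fault-handler code):

#include <asm/kmemcheck.h>
#include <asm/ptrace.h>

/* Hypothetical caller: give kmemcheck first refusal on a page fault.
 * When CONFIG_KMEMCHECK=n this reduces to "return false". */
static bool fault_handled_by_kmemcheck(struct pt_regs *regs,
				       unsigned long address,
				       unsigned long error_code)
{
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);

	return kmemcheck_fault(regs, address, error_code);
}
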
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M (-2M when PAE is activated) for ease of mapping 20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE 21#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000 22#define SWITCHER_ADDR 0xFFE00000
24#else 23#else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index d31c4a684078..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,27 +30,27 @@
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h> 31#include <asm/kvm_para.h>
32 32
33/*G:031 But first, how does our Guest contact the Host to ask for privileged 33/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged
34 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
35 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
36 * 37 *
37 * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 * We use the KVM hypercall mechanism, though completely different hypercall
38 * available: the hypercall number is put in the %eax register, and the 39 * numbers. Seventeen hypercalls are available: the hypercall number is put in
39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 * the %eax register, and the arguments (when required) are placed in %ebx,
40 * If a return value makes sense, it's returned in %eax. 41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
41 * 42 *
42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
43 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
44 * definition of a gentleman: "someone who is only rude intentionally". */ 45 * definition of a gentleman: "someone who is only rude intentionally".
45/*:*/ 46:*/
46 47
47/* Can't use our min() macro here: needs to be a constant */ 48/* Can't use our min() macro here: needs to be a constant */
48#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
49 50
50#define LHCALL_RING_SIZE 64 51#define LHCALL_RING_SIZE 64
51struct hcall_args { 52struct hcall_args {
52 /* These map directly onto eax, ebx, ecx, edx and esi 53 /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4; 54 unsigned long arg0, arg1, arg2, arg3, arg4;
55}; 55};
56 56
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 540a466e50f5..5cdd8d100ec9 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -102,15 +102,39 @@ struct mce_log {
102 102
103#ifdef __KERNEL__ 103#ifdef __KERNEL__
104 104
105#include <linux/percpu.h>
106#include <linux/init.h>
107#include <asm/atomic.h>
108
105extern int mce_disabled; 109extern int mce_disabled;
110extern int mce_p5_enabled;
106 111
107#include <asm/atomic.h> 112#ifdef CONFIG_X86_MCE
108#include <linux/percpu.h> 113void mcheck_init(struct cpuinfo_x86 *c);
114#else
115static inline void mcheck_init(struct cpuinfo_x86 *c) {}
116#endif
117
118#ifdef CONFIG_X86_OLD_MCE
119extern int nr_mce_banks;
120void amd_mcheck_init(struct cpuinfo_x86 *c);
121void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
122void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
123#endif
124
125#ifdef CONFIG_X86_ANCIENT_MCE
126void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
127void winchip_mcheck_init(struct cpuinfo_x86 *c);
128static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
129#else
130static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
131static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
132static inline void enable_p5_mce(void) {}
133#endif
109 134
110void mce_setup(struct mce *m); 135void mce_setup(struct mce *m);
111void mce_log(struct mce *m); 136void mce_log(struct mce *m);
112DECLARE_PER_CPU(struct sys_device, mce_dev); 137DECLARE_PER_CPU(struct sys_device, mce_dev);
113extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
114 138
115/* 139/*
116 * To support more than 128 would need to escape the predefined 140 * To support more than 128 would need to escape the predefined
@@ -145,12 +169,8 @@ int mce_available(struct cpuinfo_x86 *c);
145DECLARE_PER_CPU(unsigned, mce_exception_count); 169DECLARE_PER_CPU(unsigned, mce_exception_count);
146DECLARE_PER_CPU(unsigned, mce_poll_count); 170DECLARE_PER_CPU(unsigned, mce_poll_count);
147 171
148void mce_log_therm_throt_event(__u64 status);
149
150extern atomic_t mce_entry; 172extern atomic_t mce_entry;
151 173
152void do_machine_check(struct pt_regs *, long);
153
154typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); 174typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
155DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 175DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
156 176
@@ -167,13 +187,32 @@ void mce_notify_process(void);
167DECLARE_PER_CPU(struct mce, injectm); 187DECLARE_PER_CPU(struct mce, injectm);
168extern struct file_operations mce_chrdev_ops; 188extern struct file_operations mce_chrdev_ops;
169 189
170#ifdef CONFIG_X86_MCE 190/*
171void mcheck_init(struct cpuinfo_x86 *c); 191 * Exception handler
172#else 192 */
173#define mcheck_init(c) do { } while (0) 193
174#endif 194/* Call the installed machine check handler for this CPU setup. */
195extern void (*machine_check_vector)(struct pt_regs *, long error_code);
196void do_machine_check(struct pt_regs *, long);
197
198/*
199 * Threshold handler
200 */
175 201
176extern void (*mce_threshold_vector)(void); 202extern void (*mce_threshold_vector)(void);
203extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
204
205/*
206 * Thermal handler
207 */
208
209void intel_init_thermal(struct cpuinfo_x86 *c);
210
211#ifdef CONFIG_X86_NEW_MCE
212void mce_log_therm_throt_event(__u64 status);
213#else
214static inline void mce_log_therm_throt_event(__u64 status) {}
215#endif
177 216
178#endif /* __KERNEL__ */ 217#endif /* __KERNEL__ */
179#endif /* _ASM_X86_MCE_H */ 218#endif /* _ASM_X86_MCE_H */
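The block above replaces the old "#define mcheck_init(c) do { } while (0)" stub with per-config prototypes and empty static inlines, and groups the exception, threshold and thermal handlers. Unlike an empty macro, a static inline stub still type-checks its argument and reads like a normal call at the source level while compiling to nothing. The same pattern with hypothetical names, as a sketch:

#ifdef CONFIG_EXAMPLE_FEATURE
void example_feature_init(struct cpuinfo_x86 *c);      /* real version lives in a .c file */
#else
static inline void example_feature_init(struct cpuinfo_x86 *c) { }    /* checked, then optimized away */
#endif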
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1692fb5050e3..6be7fc254b59 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -246,10 +246,6 @@
246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) 246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) 247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
248 248
249/* Intel Model 6 */
250#define MSR_P6_EVNTSEL0 0x00000186
251#define MSR_P6_EVNTSEL1 0x00000187
252
253/* P4/Xeon+ specific */ 249/* P4/Xeon+ specific */
254#define MSR_IA32_MCG_EAX 0x00000180 250#define MSR_IA32_MCG_EAX 0x00000180
255#define MSR_IA32_MCG_EBX 0x00000181 251#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 22603764e7db..48ad9d29484a 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -3,13 +3,10 @@
3 3
4#include <asm/msr-index.h> 4#include <asm/msr-index.h>
5 5
6#ifndef __ASSEMBLY__
7# include <linux/types.h>
8#endif
9
10#ifdef __KERNEL__ 6#ifdef __KERNEL__
11#ifndef __ASSEMBLY__ 7#ifndef __ASSEMBLY__
12 8
9#include <linux/types.h>
13#include <asm/asm.h> 10#include <asm/asm.h>
14#include <asm/errno.h> 11#include <asm/errno.h>
15#include <asm/cpumask.h> 12#include <asm/cpumask.h>
@@ -264,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
264#endif /* CONFIG_SMP */ 261#endif /* CONFIG_SMP */
265#endif /* __ASSEMBLY__ */ 262#endif /* __ASSEMBLY__ */
266#endif /* __KERNEL__ */ 263#endif /* __KERNEL__ */
267
268
269#endif /* _ASM_X86_MSR_H */ 264#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c97264409934..c86e5ed4af51 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz); 72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz); 73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz); 74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void); 75void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void); 76void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void); 77void stop_nmi(void);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8d382d3abf38..7639dbf5d223 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -41,7 +41,7 @@
41 41
42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ 42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
43#define __PHYSICAL_MASK_SHIFT 46 43#define __PHYSICAL_MASK_SHIFT 46
44#define __VIRTUAL_MASK_SHIFT 48 44#define __VIRTUAL_MASK_SHIFT 47
45 45
46/* 46/*
47 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in 47 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b51a1e8b0baf..1ff685ca221c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void);
91 91
92#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 92#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
93 93
94#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) 94#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
95 95
96#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ 96#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
97 dma_addr_t ADDR_NAME; 97 dma_addr_t ADDR_NAME;
@@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void);
130 130
131/* generic pci stuff */ 131/* generic pci stuff */
132#include <asm-generic/pci.h> 132#include <asm-generic/pci.h>
133#define PCIBIOS_MAX_MEM_32 0xffffffff
133 134
134#ifdef CONFIG_NUMA 135#ifdef CONFIG_NUMA
135/* Returns the node based on pci bus */ 136/* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e60fd3e14bdf..b399988eee3a 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -121,6 +121,9 @@ extern int __init pcibios_init(void);
121extern int __init pci_mmcfg_arch_init(void); 121extern int __init pci_mmcfg_arch_init(void);
122extern void __init pci_mmcfg_arch_free(void); 122extern void __init pci_mmcfg_arch_free(void);
123 123
124extern struct acpi_mcfg_allocation *pci_mmcfg_config;
125extern int pci_mmcfg_config_num;
126
124/* 127/*
125 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space 128 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
126 * on their northbrige except through the * %eax register. As such, you MUST 129 * on their northbrige except through the * %eax register. As such, you MUST
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 02ecb30982a3..103f1ddb0d85 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -42,6 +42,7 @@
42 42
43#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
44 44
45#include <linux/kernel.h>
45#include <linux/stringify.h> 46#include <linux/stringify.h>
46 47
47#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
@@ -155,6 +156,15 @@ do { \
155/* We can use this directly for local CPU (faster). */ 156/* We can use this directly for local CPU (faster). */
156DECLARE_PER_CPU(unsigned long, this_cpu_off); 157DECLARE_PER_CPU(unsigned long, this_cpu_off);
157 158
159#ifdef CONFIG_NEED_MULTIPLE_NODES
160void *pcpu_lpage_remapped(void *kaddr);
161#else
162static inline void *pcpu_lpage_remapped(void *kaddr)
163{
164 return NULL;
165}
166#endif
167
158#endif /* !__ASSEMBLY__ */ 168#endif /* !__ASSEMBLY__ */
159 169
160#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 876ed97147b3..fa64e401589d 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,14 +84,12 @@ union cpuid10_edx {
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) 85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86 86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS 87#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void); 88extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(void); 89extern void perf_counters_lapic_init(void);
90
91#define PERF_COUNTER_INDEX_OFFSET 0
92
95#else 93#else
96static inline void init_hw_perf_counters(void) { } 94static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(void) { } 95static inline void perf_counters_lapic_init(void) { }
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac718..0e8c2a0fd922 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
46 __free_page(pte); 46 __free_page(pte);
47} 47}
48 48
49extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); 49extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
50
51static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
52 unsigned long address)
53{
54 ___pte_free_tlb(tlb, pte);
55}
50 56
51static inline void pmd_populate_kernel(struct mm_struct *mm, 57static inline void pmd_populate_kernel(struct mm_struct *mm,
52 pmd_t *pmd, pte_t *pte) 58 pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
78 free_page((unsigned long)pmd); 84 free_page((unsigned long)pmd);
79} 85}
80 86
81extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 87extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
88
89static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
90 unsigned long adddress)
91{
92 ___pmd_free_tlb(tlb, pmd);
93}
82 94
83#ifdef CONFIG_X86_PAE 95#ifdef CONFIG_X86_PAE
84extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); 96extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
108 free_page((unsigned long)pud); 120 free_page((unsigned long)pud);
109} 121}
110 122
111extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); 123extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
124
125static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
126 unsigned long address)
127{
128 ___pud_free_tlb(tlb, pud);
129}
130
112#endif /* PAGETABLE_LEVELS > 3 */ 131#endif /* PAGETABLE_LEVELS > 3 */
113#endif /* PAGETABLE_LEVELS > 2 */ 132#endif /* PAGETABLE_LEVELS > 2 */
114 133
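The pattern in this hunk keeps the existing out-of-line helpers and adds thin inline shims that accept the extra virtual-address argument the generic mmu_gather interface now passes; x86 has no use for it and simply drops it. A generic sketch of the same shim technique (hypothetical names):

/* Existing out-of-line worker keeps its old signature. */
extern void ___example_free(struct mmu_gather *tlb, struct page *page);

/* The interface gains an argument this architecture does not need. */
static inline void __example_free(struct mmu_gather *tlb, struct page *page,
                                  unsigned long address)
{
        ___example_free(tlb, page);             /* address intentionally unused */
}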
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18ef7ebf2631..3cc06e3fceb8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -317,6 +317,11 @@ static inline int pte_present(pte_t a)
317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
318} 318}
319 319
320static inline int pte_hidden(pte_t pte)
321{
322 return pte_flags(pte) & _PAGE_HIDDEN;
323}
324
320static inline int pmd_present(pmd_t pmd) 325static inline int pmd_present(pmd_t pmd)
321{ 326{
322 return pmd_flags(pmd) & _PAGE_PRESENT; 327 return pmd_flags(pmd) & _PAGE_PRESENT;
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 31bd120cf2a2..01fd9461d323 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
49#endif 49#endif
50 50
51#if defined(CONFIG_HIGHPTE) 51#if defined(CONFIG_HIGHPTE)
52#define __KM_PTE \
53 (in_nmi() ? KM_NMI_PTE : \
54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0)
52#define pte_offset_map(dir, address) \ 56#define pte_offset_map(dir, address) \
53 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ 57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
54 pte_index((address))) 58 pte_index((address)))
55#define pte_offset_map_nested(dir, address) \ 59#define pte_offset_map_nested(dir, address) \
56 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
57 pte_index((address))) 61 pte_index((address)))
58#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) 62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
59#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
60#else 64#else
61#define pte_offset_map(dir, address) \ 65#define pte_offset_map(dir, address) \
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index abde308fdb0f..c57a30117149 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -165,10 +165,7 @@ extern void cleanup_highmap(void);
165 165
166/* fs/proc/kcore.c */ 166/* fs/proc/kcore.c */
167#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) 167#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
168#define kc_offset_to_vaddr(o) \ 168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
170 ? ((o) | ~__VIRTUAL_MASK) \
171 : (o))
172 169
173#define __HAVE_ARCH_PTE_SAME 170#define __HAVE_ARCH_PTE_SAME
174#endif /* !__ASSEMBLY__ */ 171#endif /* !__ASSEMBLY__ */
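With __VIRTUAL_MASK_SHIFT dropping to 47 (see the page_64_types.h hunk earlier), __VIRTUAL_MASK covers bits 0-46 and every canonical kernel address has bits 47-63 set, so the conditional sign-extension in kc_offset_to_vaddr() collapses to an unconditional OR. A stand-alone round-trip check of that arithmetic (user-space sketch, 64-bit build assumed, address value illustrative):

#include <stdio.h>

#define VIRTUAL_MASK_SHIFT      47
#define VIRTUAL_MASK            ((1UL << VIRTUAL_MASK_SHIFT) - 1)

int main(void)
{
        unsigned long vaddr  = 0xffff880000100000UL;    /* a direct-mapping address */
        unsigned long offset = vaddr & VIRTUAL_MASK;    /* kc_vaddr_to_offset()     */
        unsigned long back   = offset | ~VIRTUAL_MASK;  /* kc_offset_to_vaddr()     */

        printf("offset=%#lx back=%#lx\n", offset, back);        /* back == vaddr */
        return 0;
}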
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11 21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 44#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL 48#define __HAVE_ARCH_PTE_SPECIAL
50 49
50#ifdef CONFIG_KMEMCHECK
51#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
52#else
53#define _PAGE_HIDDEN (_AT(pteval_t, 0))
54#endif
55
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 56#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 57#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else 58#else
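Note the trick behind the new bit: when kmemcheck is not configured, _PAGE_HIDDEN is defined as 0, so a test like the pte_hidden() helper added in the pgtable.h hunk above constant-folds to false and any code guarded by it is discarded by the compiler. A minimal sketch of the same zero-when-disabled flag pattern (hypothetical names):

#ifdef CONFIG_EXAMPLE_FEATURE
#define EXAMPLE_FLAG    (1UL << 11)
#else
#define EXAMPLE_FLAG    0UL             /* tests against it fold to false */
#endif

static inline int example_flag_set(unsigned long flags)
{
        return flags & EXAMPLE_FLAG;    /* becomes 'return 0' when disabled */
}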
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 49fb3ecf3bb3..621f56d73121 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -22,7 +22,14 @@ extern int reboot_force;
22 22
23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
24 24
25#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) 25/*
26#define round_down(x, y) ((x) & ~((y) - 1)) 26 * This looks more complex than it should be. But we need to
27 * get the type for the ~ right in round_down (it needs to be
28 * as wide as the result!), and we want to evaluate the macro
29 * arguments just once each.
30 */
31#define __round_mask(x,y) ((__typeof__(x))((y)-1))
32#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
33#define round_down(x,y) ((x) & ~__round_mask(x,y))
27 34
28#endif /* _ASM_X86_PROTO_H */ 35#endif /* _ASM_X86_PROTO_H */
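The comment gives the two requirements: the mask must take the type of x so the ~ in round_down is as wide as the result, and each macro argument is evaluated only once. A quick user-space check of the arithmetic, assuming y is a power of two as these macros require:

#include <stdio.h>

#define __round_mask(x, y)      ((__typeof__(x))((y) - 1))
#define round_up(x, y)          ((((x) - 1) | __round_mask(x, y)) + 1)
#define round_down(x, y)        ((x) & ~__round_mask(x, y))

int main(void)
{
        unsigned long long addr = 0x12345ULL;

        printf("%#llx\n", (unsigned long long)round_up(addr, 0x1000));    /* 0x13000 */
        printf("%#llx\n", (unsigned long long)round_down(addr, 0x1000));  /* 0x12000 */
        return 0;
}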
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db876399..4e77853321db 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
302#define _raw_read_relax(lock) cpu_relax() 302#define _raw_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define _raw_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { }
307#define ARCH_HAS_SMP_MB_AFTER_LOCK
308
305#endif /* _ASM_X86_SPINLOCK_H */ 309#endif /* _ASM_X86_SPINLOCK_H */
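The new helper lets generic code ask for a full memory barrier right after taking a lock; because the locked instructions behind {read|write|spin}_lock() are already full barriers on x86, the arch version emits nothing, while other architectures can map it to smp_mb(). A hedged, kernel-style usage sketch (names and surrounding logic are illustrative only):

#include <linux/spinlock.h>

/* Take the lock, rely on the full barrier the generic API promises, then
 * read state that another CPU may have written.  On x86 the call to
 * smp_mb__after_lock() adds no instructions. */
static int example_condition_check(spinlock_t *lock, int *flag)
{
        int ret;

        spin_lock(lock);
        smp_mb__after_lock();
        ret = *flag;
        spin_unlock(lock);
        return ret;
}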
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f517944b2b17..cf86a5e73815 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,6 +3,8 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
6/* Generic stack tracer with callbacks */ 8/* Generic stack tracer with callbacks */
7 9
8struct stacktrace_ops { 10struct stacktrace_ops {
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 * No 3D Now! 177 * No 3D Now!
178 */ 178 */
179 179
180#ifndef CONFIG_KMEMCHECK
180#define memcpy(t, f, n) \ 181#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 182 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 183 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 184 : __memcpy((t), (f), (n)))
185#else
186/*
187 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
188 * because it means that we know both memory operands in advance.
189 */
190#define memcpy(t, f, n) __memcpy((t), (f), (n))
191#endif
184 192
185#endif 193#endif
186 194
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
27 function. */ 27 function. */
28 28
29#define __HAVE_ARCH_MEMCPY 1 29#define __HAVE_ARCH_MEMCPY 1
30#ifndef CONFIG_KMEMCHECK
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 31#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len); 32extern void *memcpy(void *to, const void *from, size_t len);
32#else 33#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
42 __ret; \ 43 __ret; \
43}) 44})
44#endif 45#endif
46#else
47/*
48 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
49 * because it means that we know both memory operands in advance.
50 */
51#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
52#endif
45 53
46#define __HAVE_ARCH_MEMSET 54#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n); 55void *memset(void *s, int c, size_t n);
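Both string headers above fall back to a single, non-specialised copy routine when kmemcheck is enabled: a fixed REP-based memcpy means the checker always knows both memory operands. A user-space sketch of the same dispatch idea (stand-in names; the real __memcpy/__constant_memcpy are inline-asm helpers):

#include <stddef.h>
#include <string.h>

static void *copy_generic(void *t, const void *f, size_t n)  { return memcpy(t, f, n); }
static void *copy_constant(void *t, const void *f, size_t n) { return memcpy(t, f, n); }

#ifndef EXAMPLE_KMEMCHECK
/* Specialise only when the length is a compile-time constant. */
#define my_memcpy(t, f, n)                                      \
        (__builtin_constant_p(n) ? copy_constant((t), (f), (n)) \
                                 : copy_generic((t), (f), (n)))
#else
/* One well-known code path, so the checker sees both operands. */
#define my_memcpy(t, f, n)      copy_generic((t), (f), (n))
#endif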
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
deleted file mode 100644
index c62349ee7860..000000000000
--- a/arch/x86/include/asm/therm_throt.h
+++ /dev/null
@@ -1,9 +0,0 @@
1#ifndef _ASM_X86_THERM_THROT_H
2#define _ASM_X86_THERM_THROT_H
3
4#include <asm/atomic.h>
5
6extern atomic_t therm_throt_en;
7int therm_throt_process(int curr);
8
9#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 602c769fc98c..fad7d40b75f8 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 49 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 50 .flags = 0, \
51 .cpu = 0, \ 51 .cpu = 0, \
52 .preempt_count = 1, \ 52 .preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 53 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 54 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 55 .fn = do_no_restart_syscall, \
@@ -154,9 +154,9 @@ struct thread_info {
154 154
155/* thread information allocation */ 155/* thread information allocation */
156#ifdef CONFIG_DEBUG_STACK_USAGE 156#ifdef CONFIG_DEBUG_STACK_USAGE
157#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) 157#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
158#else 158#else
159#define THREAD_FLAGS GFP_KERNEL 159#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
160#endif 160#endif
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index bd37ed444a21..20ca9c4d4686 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -45,12 +45,16 @@ extern int no_timer_check;
45 */ 45 */
46 46
47DECLARE_PER_CPU(unsigned long, cyc2ns); 47DECLARE_PER_CPU(unsigned long, cyc2ns);
48DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
48 49
49#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 50#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
50 51
51static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 52static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
52{ 53{
53 return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; 54 int cpu = smp_processor_id();
55 unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
56 ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
57 return ns;
54} 58}
55 59
56static inline unsigned long long cycles_2_ns(unsigned long long cyc) 60static inline unsigned long long cycles_2_ns(unsigned long long cyc)
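The conversion is fixed-point: cyc2ns stores nanoseconds-per-cycle scaled by 2^CYC2NS_SCALE_FACTOR, and the new per-CPU cyc2ns_offset is added in so the result can stay continuous when the scale factor is recalibrated. A stand-alone sketch of the arithmetic (illustrative values; assumes a 3 GHz TSC):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR     10      /* 2^10, as above */

int main(void)
{
        /* ns per cycle at 3 GHz is 1/3, so the scaled factor is ~341. */
        unsigned long long cyc2ns        = (1000ULL << CYC2NS_SCALE_FACTOR) / 3000;
        unsigned long long cyc2ns_offset = 5000;        /* per-CPU correction, in ns */
        unsigned long long cyc           = 3000000ULL;  /* about 1 ms worth of cycles */

        unsigned long long ns = cyc2ns_offset +
                                ((cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR);

        printf("%llu ns\n", ns);        /* roughly 1004023 */
        return 0;
}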
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981f..1375cfc93960 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
4#include <asm/processor.h> 4#include <asm/processor.h>
5#include <asm/tsc.h> 5#include <asm/tsc.h>
6 6
7/* The PIT ticks at this frequency (in HZ): */ 7/* Assume we use the PIT time source for the clock tick */
8#define PIT_TICK_RATE 1193182
9
10#define CLOCK_TICK_RATE PIT_TICK_RATE 8#define CLOCK_TICK_RATE PIT_TICK_RATE
11 9
12#define ARCH_HAS_READ_CURRENT_TIMER 10#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b685ece89d5c..d2c6c930b491 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,7 @@
25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) 25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
26 26
27#define KERNEL_DS MAKE_MM_SEG(-1UL) 27#define KERNEL_DS MAKE_MM_SEG(-1UL)
28#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) 28#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
29 29
30#define get_ds() (KERNEL_DS) 30#define get_ds() (KERNEL_DS)
31#define get_fs() (current_thread_info()->addr_limit) 31#define get_fs() (current_thread_info()->addr_limit)
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
213#else 213#else
214#define __put_user_asm_u64(x, ptr, retval, errret) \ 214#define __put_user_asm_u64(x, ptr, retval, errret) \
215 __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) 215 __put_user_asm(x, ptr, retval, "q", "", "er", errret)
216#define __put_user_asm_ex_u64(x, addr) \ 216#define __put_user_asm_ex_u64(x, addr) \
217 __put_user_asm_ex(x, addr, "q", "", "Zr") 217 __put_user_asm_ex(x, addr, "q", "", "er")
218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) 218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
219#endif 219#endif
220 220
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb8..db24b215fc50 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
88 ret, "l", "k", "ir", 4); 88 ret, "l", "k", "ir", 4);
89 return ret; 89 return ret;
90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, 90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 8); 91 ret, "q", "", "er", 8);
92 return ret; 92 return ret;
93 case 10: 93 case 10:
94 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 94 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
95 ret, "q", "", "ir", 10); 95 ret, "q", "", "er", 10);
96 if (unlikely(ret)) 96 if (unlikely(ret))
97 return ret; 97 return ret;
98 asm("":::"memory"); 98 asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
101 return ret; 101 return ret;
102 case 16: 102 case 16:
103 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 103 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
104 ret, "q", "", "ir", 16); 104 ret, "q", "", "er", 16);
105 if (unlikely(ret)) 105 if (unlikely(ret))
106 return ret; 106 return ret;
107 asm("":::"memory"); 107 asm("":::"memory");
108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, 108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
109 ret, "q", "", "ir", 8); 109 ret, "q", "", "er", 8);
110 return ret; 110 return ret;
111 default: 111 default:
112 return copy_user_generic((__force void *)dst, src, size); 112 return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
157 ret, "q", "", "=r", 8); 157 ret, "q", "", "=r", 8);
158 if (likely(!ret)) 158 if (likely(!ret))
159 __put_user_asm(tmp, (u64 __user *)dst, 159 __put_user_asm(tmp, (u64 __user *)dst,
160 ret, "q", "", "ir", 8); 160 ret, "q", "", "er", 8);
161 return ret; 161 return ret;
162 } 162 }
163 default: 163 default:
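The constraint changes here and in the uaccess.h hunk above matter because 64-bit instruction forms encode at most a sign-extended 32-bit immediate: the "e" constraint restricts constants to that range, whereas the previous constraints could offer the compiler immediates the instruction cannot represent. A small stand-alone illustration of the constraint (hypothetical helper, not the kernel's __put_user_asm; x86-64 only):

/* "er" = general register or 32-bit sign-extended immediate, the only
 * immediate form a 64-bit addq/movq can encode. */
static inline void add_to(unsigned long *p, unsigned long v)
{
        asm("addq %1, %0" : "+m"(*p) : "er"(v));
}

int main(void)
{
        unsigned long x = 1;

        add_to(&x, 41);         /* fits in imm32, may be emitted as an immediate */
        return (int)x - 42;     /* exits with status 0 */
}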
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index bddd44f2f0ab..80e2984f521c 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,7 +133,7 @@ struct bau_msg_payload {
133 * see table 4.2.3.0.1 in broacast_assist spec. 133 * see table 4.2.3.0.1 in broacast_assist spec.
134 */ 134 */
135struct bau_msg_header { 135struct bau_msg_header {
136 unsigned int dest_subnodeid:6; /* must be zero */ 136 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
137 /* bits 5:0 */ 137 /* bits 5:0 */
138 unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ 138 unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
139 /* bits 20:6 */ /* first bit in node_map */ 139 /* bits 20:6 */ /* first bit in node_map */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 341070f7ad5c..77a68505419a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
176 176
177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
179 179
180#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
181 181
@@ -327,6 +327,7 @@ struct uv_blade_info {
327 unsigned short nr_possible_cpus; 327 unsigned short nr_possible_cpus;
328 unsigned short nr_online_cpus; 328 unsigned short nr_online_cpus;
329 unsigned short pnode; 329 unsigned short pnode;
330 short memory_nid;
330}; 331};
331extern struct uv_blade_info *uv_blade_info; 332extern struct uv_blade_info *uv_blade_info;
332extern short *uv_node_to_blade; 333extern short *uv_node_to_blade;
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
363 return uv_blade_info[bid].pnode; 364 return uv_blade_info[bid].pnode;
364} 365}
365 366
367/* Nid of memory node on blade. -1 if no blade-local memory */
368static inline int uv_blade_to_memory_nid(int bid)
369{
370 return uv_blade_info[bid].memory_nid;
371}
372
366/* Determine the number of possible cpus on a blade */ 373/* Determine the number of possible cpus on a blade */
367static inline int uv_blade_nr_possible_cpus(int bid) 374static inline int uv_blade_nr_possible_cpus(int bid)
368{ 375{
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#else
1#ifdef CONFIG_X86_32 5#ifdef CONFIG_X86_32
2# include "xor_32.h" 6# include "xor_32.h"
3#else 7#else
4# include "xor_64.h" 8# include "xor_64.h"
5#endif 9#endif
10#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f3477bb84566..430d5b24af7b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,6 +24,10 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
24CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
25CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp) 26CFLAGS_paravirt.o := $(nostackp)
27GCOV_PROFILE_vsyscall_64.o := n
28GCOV_PROFILE_hpet.o := n
29GCOV_PROFILE_tsc.o := n
30GCOV_PROFILE_paravirt.o := n
27 31
28obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 631086159c53..6b8ca3a0285d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -44,11 +44,7 @@
44 44
45static int __initdata acpi_force = 0; 45static int __initdata acpi_force = 0;
46u32 acpi_rsdt_forced; 46u32 acpi_rsdt_forced;
47#ifdef CONFIG_ACPI 47int acpi_disabled;
48int acpi_disabled = 0;
49#else
50int acpi_disabled = 1;
51#endif
52EXPORT_SYMBOL(acpi_disabled); 48EXPORT_SYMBOL(acpi_disabled);
53 49
54#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
@@ -122,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
122 early_iounmap(map, size); 118 early_iounmap(map, size);
123} 119}
124 120
125#ifdef CONFIG_PCI_MMCONFIG
126
127static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
128
129/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
130struct acpi_mcfg_allocation *pci_mmcfg_config;
131int pci_mmcfg_config_num;
132
133static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
134{
135 if (!strcmp(mcfg->header.oem_id, "SGI"))
136 acpi_mcfg_64bit_base_addr = TRUE;
137
138 return 0;
139}
140
141int __init acpi_parse_mcfg(struct acpi_table_header *header)
142{
143 struct acpi_table_mcfg *mcfg;
144 unsigned long i;
145 int config_size;
146
147 if (!header)
148 return -EINVAL;
149
150 mcfg = (struct acpi_table_mcfg *)header;
151
152 /* how many config structures do we have */
153 pci_mmcfg_config_num = 0;
154 i = header->length - sizeof(struct acpi_table_mcfg);
155 while (i >= sizeof(struct acpi_mcfg_allocation)) {
156 ++pci_mmcfg_config_num;
157 i -= sizeof(struct acpi_mcfg_allocation);
158 };
159 if (pci_mmcfg_config_num == 0) {
160 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
161 return -ENODEV;
162 }
163
164 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
165 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
166 if (!pci_mmcfg_config) {
167 printk(KERN_WARNING PREFIX
168 "No memory for MCFG config tables\n");
169 return -ENOMEM;
170 }
171
172 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
173
174 acpi_mcfg_oem_check(mcfg);
175
176 for (i = 0; i < pci_mmcfg_config_num; ++i) {
177 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
178 !acpi_mcfg_64bit_base_addr) {
179 printk(KERN_ERR PREFIX
180 "MMCONFIG not in low 4GB of memory\n");
181 kfree(pci_mmcfg_config);
182 pci_mmcfg_config_num = 0;
183 return -ENODEV;
184 }
185 }
186
187 return 0;
188}
189#endif /* CONFIG_PCI_MMCONFIG */
190
191#ifdef CONFIG_X86_LOCAL_APIC 121#ifdef CONFIG_X86_LOCAL_APIC
192static int __init acpi_parse_madt(struct acpi_table_header *table) 122static int __init acpi_parse_madt(struct acpi_table_header *table)
193{ 123{
@@ -1519,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1519 }, 1449 },
1520 { 1450 {
1521 .callback = force_acpi_ht, 1451 .callback = force_acpi_ht,
1522 .ident = "ASUS P4B266",
1523 .matches = {
1524 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1525 DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1526 },
1527 },
1528 {
1529 .callback = force_acpi_ht,
1530 .ident = "ASUS P2B-DS", 1452 .ident = "ASUS P2B-DS",
1531 .matches = { 1453 .matches = {
1532 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), 1454 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bbbe4bbb6f34..8c44c232efcb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
34 flags->bm_check = 1; 34 flags->bm_check = 1;
35 else if (c->x86_vendor == X86_VENDOR_INTEL) { 35 else if (c->x86_vendor == X86_VENDOR_INTEL) {
36 /* 36 /*
37 * Today all CPUs that support C3 share cache. 37 * Today all MP CPUs that support C3 share cache.
38 * TBD: This needs to look at cache shared map, once 38 * And caches should not be flushed by software while
39 * multi-core detection patch makes to the base. 39 * entering C3 type state.
40 */ 40 */
41 flags->bm_check = 1; 41 flags->bm_check = 1;
42 } 42 }
43
44 /*
45 * On all recent Intel platforms, ARB_DISABLE is a nop.
46 * So, set bm_control to zero to indicate that ARB_DISABLE
47 * is not required while entering C3 type state on
48 * P4, Core and beyond CPUs
49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
52 flags->bm_control = 0;
43} 53}
44EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
45 55
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index 7c074eec39fb..d296f4a195c9 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
72 return; 72 return;
73} 73}
74 74
75
75/* Initialize _PDC data based on the CPU vendor */ 76/* Initialize _PDC data based on the CPU vendor */
76void arch_acpi_processor_init_pdc(struct acpi_processor *pr) 77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
77{ 78{
@@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
85} 86}
86 87
87EXPORT_SYMBOL(arch_acpi_processor_init_pdc); 88EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
89
90void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
91{
92 if (pr->pdc) {
93 kfree(pr->pdc->pointer->buffer.pointer);
94 kfree(pr->pdc->pointer);
95 kfree(pr->pdc);
96 pr->pdc = NULL;
97 }
98}
99
100EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 167bc16ce0e5..6a564ac67ef5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
42 $(call cc-option, -mpreferred-stack-boundary=2) 42 $(call cc-option, -mpreferred-stack-boundary=2)
43KBUILD_CFLAGS += $(call cc-option, -m32) 43KBUILD_CFLAGS += $(call cc-option, -m32)
44KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 44KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
45GCOV_PROFILE := n
45 46
46WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) 47WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
47 48
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 1c60554537c3..6c99f5037801 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -434,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
434 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 434 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
435} 435}
436 436
437/* Flush the whole IO/TLB for a given protection domain - including PDE */
438static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
439{
440 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
441
442 INC_STATS_COUNTER(domain_flush_single);
443
444 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
445}
446
437/* 447/*
438 * This function is used to flush the IO/TLB for a given protection domain 448 * This function is used to flush the IO/TLB for a given protection domain
439 * on every IOMMU in the system 449 * on every IOMMU in the system
@@ -1078,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu,
1078 amd_iommu_pd_table[devid] = domain; 1088 amd_iommu_pd_table[devid] = domain;
1079 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1089 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1080 1090
1091 /*
1092 * We might boot into a crash-kernel here. The crashed kernel
1093 * left the caches in the IOMMU dirty. So we have to flush
1094 * here to evict all dirty stuff.
1095 */
1081 iommu_queue_inv_dev_entry(iommu, devid); 1096 iommu_queue_inv_dev_entry(iommu, devid);
1097 iommu_flush_tlb_pde(iommu, domain->id);
1082} 1098}
1083 1099
1084/* 1100/*
@@ -1176,7 +1192,7 @@ out:
1176 return 0; 1192 return 0;
1177} 1193}
1178 1194
1179struct notifier_block device_nb = { 1195static struct notifier_block device_nb = {
1180 .notifier_call = device_change_notifier, 1196 .notifier_call = device_change_notifier,
1181}; 1197};
1182 1198
@@ -1747,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1747 flag |= __GFP_ZERO; 1763 flag |= __GFP_ZERO;
1748 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1764 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1749 if (!virt_addr) 1765 if (!virt_addr)
1750 return 0; 1766 return NULL;
1751 1767
1752 paddr = virt_to_phys(virt_addr); 1768 paddr = virt_to_phys(virt_addr);
1753 1769
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 238989ec077d..c1b17e97252e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -260,6 +260,14 @@ static void iommu_enable(struct amd_iommu *iommu)
260 260
261static void iommu_disable(struct amd_iommu *iommu) 261static void iommu_disable(struct amd_iommu *iommu)
262{ 262{
263 /* Disable command buffer */
264 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
265
266 /* Disable event logging and event interrupts */
267 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
268 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
269
270 /* Disable IOMMU hardware itself */
263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN); 271 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
264} 272}
265 273
@@ -464,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
464 if (iommu->evt_buf == NULL) 472 if (iommu->evt_buf == NULL)
465 return NULL; 473 return NULL;
466 474
475 iommu->evt_buf_size = EVT_BUFFER_SIZE;
476
467 return iommu->evt_buf; 477 return iommu->evt_buf;
468} 478}
469 479
@@ -478,6 +488,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 488 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
479 &entry, sizeof(entry)); 489 &entry, sizeof(entry));
480 490
491 /* set head and tail to zero manually */
492 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
493 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
494
481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 495 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
482} 496}
483 497
@@ -679,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
679 693
680 devid = e->devid; 694 devid = e->devid;
681 devid_to = e->ext >> 8; 695 devid_to = e->ext >> 8;
696 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); 697 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
683 amd_iommu_alias_table[devid] = devid_to; 698 amd_iommu_alias_table[devid] = devid_to;
684 break; 699 break;
@@ -737,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
737 752
738 devid = e->devid; 753 devid = e->devid;
739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 754 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
740 if (alias) 755 if (alias) {
741 amd_iommu_alias_table[dev_i] = devid_to; 756 amd_iommu_alias_table[dev_i] = devid_to;
742 set_dev_entry_from_acpi(iommu, 757 set_dev_entry_from_acpi(iommu,
743 amd_iommu_alias_table[dev_i], 758 devid_to, flags, ext_flags);
744 flags, ext_flags); 759 }
760 set_dev_entry_from_acpi(iommu, dev_i,
761 flags, ext_flags);
745 } 762 }
746 break; 763 break;
747 default: 764 default:
@@ -1042,6 +1059,7 @@ static void enable_iommus(void)
1042 struct amd_iommu *iommu; 1059 struct amd_iommu *iommu;
1043 1060
1044 for_each_iommu(iommu) { 1061 for_each_iommu(iommu) {
1062 iommu_disable(iommu);
1045 iommu_set_device_table(iommu); 1063 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu); 1064 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu); 1065 iommu_enable_event_buffer(iommu);
@@ -1066,12 +1084,6 @@ static void disable_iommus(void)
1066 1084
1067static int amd_iommu_resume(struct sys_device *dev) 1085static int amd_iommu_resume(struct sys_device *dev)
1068{ 1086{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */ 1087 /* re-load the hardware */
1076 enable_iommus(); 1088 enable_iommus();
1077 1089
@@ -1079,8 +1091,8 @@ static int amd_iommu_resume(struct sys_device *dev)
1079 * we have to flush after the IOMMUs are enabled because a 1091 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send 1092 * disabled IOMMU will never execute the commands we send
1081 */ 1093 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices(); 1094 amd_iommu_flush_all_devices();
1095 amd_iommu_flush_all_domains();
1084 1096
1085 return 0; 1097 return 0;
1086} 1098}
@@ -1273,6 +1285,11 @@ free:
1273 goto out; 1285 goto out;
1274} 1286}
1275 1287
1288void amd_iommu_shutdown(void)
1289{
1290 disable_iommus();
1291}
1292
1276/**************************************************************************** 1293/****************************************************************************
1277 * 1294 *
1278 * Early detect code. This code runs at IOMMU detection time in the DMA 1295 * Early detect code. This code runs at IOMMU detection time in the DMA
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042ecad1..0a1c2830ec66 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -140,7 +140,6 @@ int x2apic_mode;
140#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
141/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
142static int x2apic_preenabled; 142static int x2apic_preenabled;
143static int disable_x2apic;
144static __init int setup_nox2apic(char *str) 143static __init int setup_nox2apic(char *str)
145{ 144{
146 if (x2apic_enabled()) { 145 if (x2apic_enabled()) {
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str)
149 return 0; 148 return 0;
150 } 149 }
151 150
152 disable_x2apic = 1;
153 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 151 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
154 return 0; 152 return 0;
155} 153}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 69328ac8de9c..8952a5890281 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
652 return ret && es7000_apic_is_cluster(); 652 return ret && es7000_apic_is_cluster();
653} 653}
654 654
655struct apic apic_es7000_cluster = { 655/* We've been warned by a false positive warning.Use __refdata to keep calm. */
656struct apic __refdata apic_es7000_cluster = {
656 657
657 .name = "es7000", 658 .name = "es7000",
658 .probe = probe_es7000, 659 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ef8d9290c7ea..d2ed6c5ddc80 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -462,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
462static void 462static void
463__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 463__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
464{ 464{
465 union entry_union eu; 465 union entry_union eu = {{0, 0}};
466
466 eu.entry = e; 467 eu.entry = e;
467 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 468 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
468 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 469 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -1413,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq,
1413 irte.vector = vector; 1414 irte.vector = vector;
1414 irte.dest_id = IRTE_DEST(destination); 1415 irte.dest_id = IRTE_DEST(destination);
1415 1416
1417 /* Set source-id of interrupt request */
1418 set_ioapic_sid(&irte, apic_id);
1419
1416 modify_irte(irq, &irte); 1420 modify_irte(irq, &irte);
1417 1421
1418 ir_entry->index2 = (index >> 15) & 0x1; 1422 ir_entry->index2 = (index >> 15) & 0x1;
@@ -1712,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void)
1712 return; 1716 return;
1713} 1717}
1714 1718
1715__apicdebuginit(void) print_APIC_bitfield(int base) 1719__apicdebuginit(void) print_APIC_field(int base)
1716{ 1720{
1717 unsigned int v; 1721 int i;
1718 int i, j;
1719 1722
1720 if (apic_verbosity == APIC_QUIET) 1723 if (apic_verbosity == APIC_QUIET)
1721 return; 1724 return;
1722 1725
1723 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); 1726 printk(KERN_DEBUG);
1724 for (i = 0; i < 8; i++) { 1727
1725 v = apic_read(base + i*0x10); 1728 for (i = 0; i < 8; i++)
1726 for (j = 0; j < 32; j++) { 1729 printk(KERN_CONT "%08x", apic_read(base + i*0x10));
1727 if (v & (1<<j)) 1730
1728 printk("1"); 1731 printk(KERN_CONT "\n");
1729 else
1730 printk("0");
1731 }
1732 printk("\n");
1733 }
1734} 1732}
1735 1733
1736__apicdebuginit(void) print_local_APIC(void *dummy) 1734__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1741,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1741 if (apic_verbosity == APIC_QUIET) 1739 if (apic_verbosity == APIC_QUIET)
1742 return; 1740 return;
1743 1741
1744 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1742 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1745 smp_processor_id(), hard_smp_processor_id()); 1743 smp_processor_id(), hard_smp_processor_id());
1746 v = apic_read(APIC_ID); 1744 v = apic_read(APIC_ID);
1747 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); 1745 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1782,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1782 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1780 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1783 1781
1784 printk(KERN_DEBUG "... APIC ISR field:\n"); 1782 printk(KERN_DEBUG "... APIC ISR field:\n");
1785 print_APIC_bitfield(APIC_ISR); 1783 print_APIC_field(APIC_ISR);
1786 printk(KERN_DEBUG "... APIC TMR field:\n"); 1784 printk(KERN_DEBUG "... APIC TMR field:\n");
1787 print_APIC_bitfield(APIC_TMR); 1785 print_APIC_field(APIC_TMR);
1788 printk(KERN_DEBUG "... APIC IRR field:\n"); 1786 printk(KERN_DEBUG "... APIC IRR field:\n");
1789 print_APIC_bitfield(APIC_IRR); 1787 print_APIC_field(APIC_IRR);
1790 1788
1791 if (APIC_INTEGRATED(ver)) { /* !82489DX */ 1789 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1792 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1790 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -2003,7 +2001,9 @@ void disable_IO_APIC(void)
2003 /* 2001 /*
2004 * Use virtual wire A mode when interrupt remapping is enabled. 2002 * Use virtual wire A mode when interrupt remapping is enabled.
2005 */ 2003 */
2006 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); 2004 if (cpu_has_apic)
2005 disconnect_bsp_APIC(!intr_remapping_enabled &&
2006 ioapic_i8259.pin != -1);
2007} 2007}
2008 2008
2009#ifdef CONFIG_X86_32 2009#ifdef CONFIG_X86_32
@@ -3287,6 +3287,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3287 irte.vector = cfg->vector; 3287 irte.vector = cfg->vector;
3288 irte.dest_id = IRTE_DEST(dest); 3288 irte.dest_id = IRTE_DEST(dest);
3289 3289
3290 /* Set source-id of interrupt request */
3291 set_msi_sid(&irte, pdev);
3292
3290 modify_irte(irq, &irte); 3293 modify_irte(irq, &irte);
3291 3294
3292 msg->address_hi = MSI_ADDR_BASE_HI; 3295 msg->address_hi = MSI_ADDR_BASE_HI;
@@ -3567,7 +3570,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3567 3570
3568#endif /* CONFIG_SMP */ 3571#endif /* CONFIG_SMP */
3569 3572
3570struct irq_chip dmar_msi_type = { 3573static struct irq_chip dmar_msi_type = {
3571 .name = "DMAR_MSI", 3574 .name = "DMAR_MSI",
3572 .unmask = dmar_msi_unmask, 3575 .unmask = dmar_msi_unmask,
3573 .mask = dmar_msi_mask, 3576 .mask = dmar_msi_mask,
@@ -3790,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3790 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3793 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3791 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3794 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3792 3795
3796 if (cfg->move_in_progress)
3797 send_cleanup_vector(cfg);
3798
3793 return irq; 3799 return irq;
3794} 3800}
3795 3801
@@ -4178,28 +4184,20 @@ fake_ioapic_page:
4178 } 4184 }
4179} 4185}
4180 4186
4181static int __init ioapic_insert_resources(void) 4187void __init ioapic_insert_resources(void)
4182{ 4188{
4183 int i; 4189 int i;
4184 struct resource *r = ioapic_resources; 4190 struct resource *r = ioapic_resources;
4185 4191
4186 if (!r) { 4192 if (!r) {
4187 if (nr_ioapics > 0) { 4193 if (nr_ioapics > 0)
4188 printk(KERN_ERR 4194 printk(KERN_ERR
4189 "IO APIC resources couldn't be allocated.\n"); 4195 "IO APIC resources couldn't be allocated.\n");
4190 return -1; 4196 return;
4191 }
4192 return 0;
4193 } 4197 }
4194 4198
4195 for (i = 0; i < nr_ioapics; i++) { 4199 for (i = 0; i < nr_ioapics; i++) {
4196 insert_resource(&iomem_resource, r); 4200 insert_resource(&iomem_resource, r);
4197 r++; 4201 r++;
4198 } 4202 }
4199
4200 return 0;
4201} 4203}
4202
4203/* Insert the IO APIC resources after PCI initialization has occured to handle
4204 * IO APICS that are mapped in on a BAR in PCI space. */
4205late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a9..6ef00ba4c886 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
106 unsigned long mask = cpumask_bits(cpumask)[0]; 106 unsigned long mask = cpumask_bits(cpumask)[0];
107 unsigned long flags; 107 unsigned long flags;
108 108
109 if (WARN_ONCE(!mask, "empty IPI mask"))
110 return;
111
109 local_irq_save(flags); 112 local_irq_save(flags);
110 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); 113 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
111 __default_send_IPI_dest_field(mask, vector, apic->dest_logical); 114 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
494} 494}
495 495
496struct apic apic_numaq = { 496/* Use __refdata to keep false positive warning calm. */
497struct apic __refdata apic_numaq = {
497 498
498 .name = "NUMAQ", 499 .name = "NUMAQ",
499 .probe = probe_numaq, 500 .probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 440a8bccd91a..0c0182cc947d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -20,23 +20,12 @@
20#include <asm/apic.h> 20#include <asm/apic.h>
21#include <asm/setup.h> 21#include <asm/setup.h>
22 22
23#include <linux/threads.h>
24#include <linux/cpumask.h>
25#include <asm/mpspec.h>
26#include <asm/fixmap.h>
27#include <asm/apicdef.h>
28#include <linux/kernel.h>
29#include <linux/string.h>
30#include <linux/smp.h> 23#include <linux/smp.h>
31#include <linux/init.h>
32#include <asm/ipi.h> 24#include <asm/ipi.h>
33 25
34#include <linux/smp.h>
35#include <linux/init.h>
36#include <linux/interrupt.h> 26#include <linux/interrupt.h>
37#include <asm/acpi.h> 27#include <asm/acpi.h>
38#include <asm/e820.h> 28#include <asm/e820.h>
39#include <asm/setup.h>
40 29
41#ifdef CONFIG_HOTPLUG_CPU 30#ifdef CONFIG_HOTPLUG_CPU
42#define DEFAULT_SEND_IPI (1) 31#define DEFAULT_SEND_IPI (1)
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 344eee4ac0a4..eafdfbd1ea95 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -44,7 +44,6 @@
44#include <asm/ipi.h> 44#include <asm/ipi.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/string.h> 46#include <linux/string.h>
47#include <linux/init.h>
48#include <linux/gfp.h> 47#include <linux/gfp.h>
49#include <linux/smp.h> 48#include <linux/smp.h>
50 49
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..a5371ec36776 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
17 return x2apic_enabled(); 17 return x2apic_enabled();
18} 18}
19 19
20/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 20/*
21 21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
22static const struct cpumask *x2apic_target_cpus(void) 24static const struct cpumask *x2apic_target_cpus(void)
23{ 25{
24 return cpumask_of(0); 26 return cpu_online_mask;
25} 27}
26 28
27/* 29/*
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id)
170 172
171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 173static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
172{ 174{
173 return current_cpu_data.initial_apicid >> index_msb; 175 return initial_apicid >> index_msb;
174} 176}
175 177
176static void x2apic_send_IPI_self(int vector) 178static void x2apic_send_IPI_self(int vector)
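Two ideas from this file, restated as a sketch with illustrative names: returning cpu_online_mask from the target_cpus() hook lets the vector allocator spread interrupts over all online CPUs instead of funnelling them through CPU 0, which matters when many MSI-X vectors are requested; and deriving the package ID from the initial_apicid argument rather than current_cpu_data stays correct when the helper is invoked on behalf of another CPU.

#include <linux/cpumask.h>

/* All online CPUs are eligible interrupt targets, not just CPU 0. */
static const struct cpumask *example_target_cpus(void)
{
	return cpu_online_mask;
}

/* Use the caller-supplied APIC ID; do not peek at the running CPU. */
static int example_phys_pkg_id(int initial_apicid, int index_msb)
{
	return initial_apicid >> index_msb;
}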
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..a8989aadc99a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 27 return 0;
28} 28}
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/*
31 31 * need to use more than cpu 0, because we need more vectors when
32 * MSI-X are used.
33 */
32static const struct cpumask *x2apic_target_cpus(void) 34static const struct cpumask *x2apic_target_cpus(void)
33{ 35{
34 return cpumask_of(0); 36 return cpu_online_mask;
35} 37}
36 38
37static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) 39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id)
162 164
163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) 165static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
164{ 166{
165 return current_cpu_data.initial_apicid >> index_msb; 167 return initial_apicid >> index_msb;
166} 168}
167 169
168static void x2apic_send_IPI_self(int vector) 170static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ef0ae207a7c8..601159374e87 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -46,7 +46,7 @@ static int early_get_nodeid(void)
46 return node_id.s.node_id; 46 return node_id.s.node_id;
47} 47}
48 48
49static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 50{
51 if (!strcmp(oem_id, "SGI")) { 51 if (!strcmp(oem_id, "SGI")) {
52 if (!strcmp(oem_table_id, "UVL")) 52 if (!strcmp(oem_table_id, "UVL"))
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector)
253 apic_write(APIC_SELF_IPI, vector); 253 apic_write(APIC_SELF_IPI, vector);
254} 254}
255 255
256struct apic apic_x2apic_uv_x = { 256struct apic __refdata apic_x2apic_uv_x = {
257 257
258 .name = "UV large system", 258 .name = "UV large system",
259 .probe = NULL, 259 .probe = NULL,
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
261 .apic_id_registered = uv_apic_id_registered, 261 .apic_id_registered = uv_apic_id_registered,
262 262
263 .irq_delivery_mode = dest_Fixed, 263 .irq_delivery_mode = dest_Fixed,
264 .irq_dest_mode = 1, /* logical */ 264 .irq_dest_mode = 0, /* physical */
265 265
266 .target_cpus = uv_target_cpus, 266 .target_cpus = uv_target_cpus,
267 .disable_esr = 0, 267 .disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
362 BUG(); 362 BUG();
363} 363}
364 364
365static __init void map_low_mmrs(void)
366{
367 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
368 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
369}
370
371enum map_type {map_wb, map_uc}; 365enum map_type {map_wb, map_uc};
372 366
373static __init void map_high(char *id, unsigned long base, int shift, 367static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
395 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
396} 390}
397 391
398static __init void map_config_high(int max_pnode)
399{
400 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
401 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
402
403 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
404 if (cfg.s.enable)
405 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
406}
407
408static __init void map_mmr_high(int max_pnode)
409{
410 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
411 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
412
413 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
414 if (mmr.s.enable)
415 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
416}
417
418static __init void map_mmioh_high(int max_pnode) 392static __init void map_mmioh_high(int max_pnode)
419{ 393{
420 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -463,7 +437,7 @@ static void uv_heartbeat(unsigned long ignored)
463 uv_set_scir_bits(bits); 437 uv_set_scir_bits(bits);
464 438
465 /* enable next timer period */ 439 /* enable next timer period */
466 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); 440 mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
467} 441}
468 442
469static void __cpuinit uv_heartbeat_enable(int cpu) 443static void __cpuinit uv_heartbeat_enable(int cpu)
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
566 unsigned long mmr_base, present, paddr; 540 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 541 unsigned short pnode_mask;
568 542
569 map_low_mmrs();
570
571 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 543 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
572 m_val = m_n_config.s.m_skt; 544 m_val = m_n_config.s.m_skt;
573 n_val = m_n_config.s.n_skt; 545 n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 563 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
592 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 564 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info); 565 BUG_ON(!uv_blade_info);
566 for (blade = 0; blade < uv_num_possible_blades(); blade++)
567 uv_blade_info[blade].memory_nid = -1;
594 568
595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 569 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
596 570
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
629 lcpu = uv_blade_info[blade].nr_possible_cpus; 603 lcpu = uv_blade_info[blade].nr_possible_cpus;
630 uv_blade_info[blade].nr_possible_cpus++; 604 uv_blade_info[blade].nr_possible_cpus++;
631 605
606 /* Any node on the blade, else will contain -1. */
607 uv_blade_info[blade].memory_nid = nid;
608
632 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 609 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
633 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 610 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
634 uv_cpu_hub_info(cpu)->m_val = m_val; 611 uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
662 pnode = (paddr >> m_val) & pnode_mask; 639 pnode = (paddr >> m_val) & pnode_mask;
663 blade = boot_pnode_to_blade(pnode); 640 blade = boot_pnode_to_blade(pnode);
664 uv_node_to_blade[nid] = blade; 641 uv_node_to_blade[nid] = blade;
642 max_pnode = max(pnode, max_pnode);
665 } 643 }
666 644
667 map_gru_high(max_pnode); 645 map_gru_high(max_pnode);
668 map_mmr_high(max_pnode);
669 map_config_high(max_pnode);
670 map_mmioh_high(max_pnode); 646 map_mmioh_high(max_pnode);
671 647
672 uv_cpu_init(); 648 uv_cpu_init();
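The memory_nid handling above follows a common sentinel pattern: every blade entry starts at -1, and only blades later found to own memory overwrite it, so readers can tell "no memory node" apart from node 0. A sketch with hypothetical structure and function names, assuming a kernel build:

#include <linux/slab.h>

struct blade_info {
	int memory_nid;		/* -1 means no node with memory */
};

static struct blade_info *alloc_blade_info(int num_blades)
{
	struct blade_info *info;
	int i;

	info = kmalloc(num_blades * sizeof(*info), GFP_KERNEL);
	if (!info)
		return NULL;

	/* Start from the sentinel; real nodes are filled in later. */
	for (i = 0; i < num_blades; i++)
		info[i].memory_nid = -1;

	return info;
}

The heartbeat change to mod_timer_pinned() keeps the timer on the CPU it was armed on, which is what the per-CPU SCIR bits require.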
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
811 u8 ret = 0; 811 u8 ret = 0;
812 int idled = 0; 812 int idled = 0;
813 int polling; 813 int polling;
814 int err; 814 int err = 0;
815 815
816 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
817 if (polling) { 817 if (polling) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e5b27d8f1b47..63fddcd082cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
258{ 258{
259#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
260 unsigned bits; 260 unsigned bits;
261 int cpu = smp_processor_id();
261 262
262 bits = c->x86_coreid_bits; 263 bits = c->x86_coreid_bits;
263
264 /* Low order bits define the core id (index of core in socket) */ 264 /* Low order bits define the core id (index of core in socket) */
265 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); 265 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
266 /* Convert the initial APIC ID into the socket ID */ 266 /* Convert the initial APIC ID into the socket ID */
267 c->phys_proc_id = c->initial_apicid >> bits; 267 c->phys_proc_id = c->initial_apicid >> bits;
268 /* use socket ID also for last level cache */
269 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
268#endif 270#endif
269} 271}
270 272
@@ -354,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
354#endif 356#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) 357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */ 358 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) { 359 if (cpu_has_apic && c->x86 >= 0xf) {
358 unsigned int val; 360 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68); 361 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) 362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
@@ -398,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
398 level = cpuid_eax(1); 400 level = cpuid_eax(1);
399 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) 401 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
400 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 402 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
403
404 /*
405 * Some BIOSes incorrectly force this feature, but only K8
406 * revision D (model = 0x14) and later actually support it.
407 */
408 if (c->x86_model < 0x14)
409 clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
401 } 410 }
402 if (c->x86 == 0x10 || c->x86 == 0x11) 411 if (c->x86 == 0x10 || c->x86 == 0x11)
403 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 412 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
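The LAHF_LM hunk above is a "do not trust the BIOS" fixup: when firmware advertises a capability the silicon cannot implement, the bit is dropped so nothing else relies on it. A sketch of that shape, using the model cut-off quoted in the hunk; the helper name is made up:

#include <asm/processor.h>
#include <asm/cpufeature.h>

/* Drop LAHF/SAHF-in-long-mode on K8 steppings that predate revision D. */
static void fixup_bogus_lahf(struct cpuinfo_x86 *c)
{
	if (c->x86 == 0xf && c->x86_model < 0x14)
		clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
}

The amd_detect_cmp() change additionally records the socket ID as the last-level-cache ID through the cpu_llc_id per-CPU variable.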
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3ffdcfa9abdf..5ce60a88027b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
59 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); 59 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
60} 60}
61 61
62static const struct cpu_dev *this_cpu __cpuinitdata; 62static void __cpuinit default_init(struct cpuinfo_x86 *c)
63{
64#ifdef CONFIG_X86_64
65 display_cacheinfo(c);
66#else
67 /* Not much we can do here... */
68 /* Check if at least it has cpuid */
69 if (c->cpuid_level == -1) {
70 /* No cpuid. It must be an ancient CPU */
71 if (c->x86 == 4)
72 strcpy(c->x86_model_id, "486");
73 else if (c->x86 == 3)
74 strcpy(c->x86_model_id, "386");
75 }
76#endif
77}
78
79static const struct cpu_dev __cpuinitconst default_cpu = {
80 .c_init = default_init,
81 .c_vendor = "Unknown",
82 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83};
84
85static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
63 86
64DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 87DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 88#ifdef CONFIG_X86_64
@@ -108,7 +131,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
108 /* data */ 131 /* data */
109 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 132 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
110 133
111 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 134 [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
112 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 135 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
113 GDT_STACK_CANARY_INIT 136 GDT_STACK_CANARY_INIT
114#endif 137#endif
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
332 355
333static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; 356static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
334 357
335static void __cpuinit default_init(struct cpuinfo_x86 *c)
336{
337#ifdef CONFIG_X86_64
338 display_cacheinfo(c);
339#else
340 /* Not much we can do here... */
341 /* Check if at least it has cpuid */
342 if (c->cpuid_level == -1) {
343 /* No cpuid. It must be an ancient CPU */
344 if (c->x86 == 4)
345 strcpy(c->x86_model_id, "486");
346 else if (c->x86 == 3)
347 strcpy(c->x86_model_id, "386");
348 }
349#endif
350}
351
352static const struct cpu_dev __cpuinitconst default_cpu = {
353 .c_init = default_init,
354 .c_vendor = "Unknown",
355 .c_x86_vendor = X86_VENDOR_UNKNOWN,
356};
357
358static void __cpuinit get_model_name(struct cpuinfo_x86 *c) 358static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
359{ 359{
360 unsigned int *v; 360 unsigned int *v;
@@ -487,7 +487,6 @@ out:
487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
488{ 488{
489 char *v = c->x86_vendor_id; 489 char *v = c->x86_vendor_id;
490 static int printed;
491 int i; 490 int i;
492 491
493 for (i = 0; i < X86_VENDOR_NUM; i++) { 492 for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
504 } 503 }
505 } 504 }
506 505
507 if (!printed) { 506 printk_once(KERN_ERR
508 printed++; 507 "CPU: vendor_id '%s' unknown, using generic init.\n" \
509 printk(KERN_ERR 508 "CPU: Your system may be unstable.\n", v);
510 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
511
512 printk(KERN_ERR "CPU: Your system may be unstable.\n");
513 }
514 509
515 c->x86_vendor = X86_VENDOR_UNKNOWN; 510 c->x86_vendor = X86_VENDOR_UNKNOWN;
516 this_cpu = &default_cpu; 511 this_cpu = &default_cpu;
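Two things happen in this file: this_cpu now starts out pointing at default_cpu instead of NULL, so vendor detection can never dereference a NULL cpu_dev before a vendor is identified, and the hand-rolled "static int printed" throttle becomes printk_once(). A sketch of the latter, wrapped in a made-up helper:

#include <linux/kernel.h>

/* Emit the warning at most once per boot, however often this runs. */
static void report_unknown_vendor(const char *vendor)
{
	printk_once(KERN_ERR
		    "CPU: vendor_id '%s' unknown, using generic init.\n"
		    "CPU: Your system may be unstable.\n", vendor);
}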
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index cf52215d9eb1..2a50ef891000 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,3 +1,4 @@
1
1/* 2/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc. 3 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 4 * Your use of this code is subject to the terms and conditions of the
@@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
117 u32 i = 0; 118 u32 i = 0;
118 119
119 if (cpu_family == CPU_HW_PSTATE) { 120 if (cpu_family == CPU_HW_PSTATE) {
120 if (data->currpstate == HW_PSTATE_INVALID) { 121 rdmsr(MSR_PSTATE_STATUS, lo, hi);
121 /* read (initial) hw pstate if not yet set */ 122 i = lo & HW_PSTATE_MASK;
122 rdmsr(MSR_PSTATE_STATUS, lo, hi); 123 data->currpstate = i;
123 i = lo & HW_PSTATE_MASK; 124
124 125 /*
125 /* 126 * a workaround for family 11h erratum 311 might cause
126 * a workaround for family 11h erratum 311 might cause 127 * an "out-of-range Pstate if the core is in Pstate-0
127 * an "out-of-range Pstate if the core is in Pstate-0 128 */
128 */ 129 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
129 if (i >= data->numps) 130 data->currpstate = HW_PSTATE_0;
130 data->currpstate = HW_PSTATE_0; 131
131 else
132 data->currpstate = i;
133 }
134 return 0; 132 return 0;
135 } 133 }
136 do { 134 do {
@@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
301static int transition_fid_vid(struct powernow_k8_data *data, 299static int transition_fid_vid(struct powernow_k8_data *data,
302 u32 reqfid, u32 reqvid) 300 u32 reqfid, u32 reqvid)
303{ 301{
304 if (core_voltage_pre_transition(data, reqvid)) 302 if (core_voltage_pre_transition(data, reqvid, reqfid))
305 return 1; 303 return 1;
306 304
307 if (core_frequency_transition(data, reqfid)) 305 if (core_frequency_transition(data, reqfid))
@@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
329 327
330/* Phase 1 - core voltage transition ... setup voltage */ 328/* Phase 1 - core voltage transition ... setup voltage */
331static int core_voltage_pre_transition(struct powernow_k8_data *data, 329static int core_voltage_pre_transition(struct powernow_k8_data *data,
332 u32 reqvid) 330 u32 reqvid, u32 reqfid)
333{ 331{
334 u32 rvosteps = data->rvo; 332 u32 rvosteps = data->rvo;
335 u32 savefid = data->currfid; 333 u32 savefid = data->currfid;
336 u32 maxvid, lo; 334 u32 maxvid, lo, rvomult = 1;
337 335
338 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " 336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
339 "reqvid 0x%x, rvo 0x%x\n", 337 "reqvid 0x%x, rvo 0x%x\n",
340 smp_processor_id(), 338 smp_processor_id(),
341 data->currfid, data->currvid, reqvid, data->rvo); 339 data->currfid, data->currvid, reqvid, data->rvo);
342 340
341 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
342 rvomult = 2;
343 rvosteps *= rvomult;
343 rdmsr(MSR_FIDVID_STATUS, lo, maxvid); 344 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
344 maxvid = 0x1f & (maxvid >> 16); 345 maxvid = 0x1f & (maxvid >> 16);
345 dprintk("ph1 maxvid=0x%x\n", maxvid); 346 dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
353 return 1; 354 return 1;
354 } 355 }
355 356
356 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { 357 while ((rvosteps > 0) &&
358 ((rvomult * data->rvo + data->currvid) > reqvid)) {
357 if (data->currvid == maxvid) { 359 if (data->currvid == maxvid) {
358 rvosteps = 0; 360 rvosteps = 0;
359 } else { 361 } else {
@@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
386 u32 vcoreqfid, vcocurrfid, vcofiddiff; 388 u32 vcoreqfid, vcocurrfid, vcofiddiff;
387 u32 fid_interval, savevid = data->currvid; 389 u32 fid_interval, savevid = data->currvid;
388 390
389 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
390 (data->currfid < HI_FID_TABLE_BOTTOM)) {
391 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
392 "0x%x 0x%x\n", reqfid, data->currfid);
393 return 1;
394 }
395
396 if (data->currfid == reqfid) { 391 if (data->currfid == reqfid) {
397 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", 392 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
398 data->currfid); 393 data->currfid);
@@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
409 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 404 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
410 : vcoreqfid - vcocurrfid; 405 : vcoreqfid - vcocurrfid;
411 406
407 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
408 vcofiddiff = 0;
409
412 while (vcofiddiff > 2) { 410 while (vcofiddiff > 2) {
413 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); 411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
414 412
@@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,
510 return 0; 508 return 0;
511} 509}
512 510
513static int check_supported_cpu(unsigned int cpu) 511static void check_supported_cpu(void *_rc)
514{ 512{
515 cpumask_t oldmask;
516 u32 eax, ebx, ecx, edx; 513 u32 eax, ebx, ecx, edx;
517 unsigned int rc = 0; 514 int *rc = _rc;
518 515
519 oldmask = current->cpus_allowed; 516 *rc = -ENODEV;
520 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
521
522 if (smp_processor_id() != cpu) {
523 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
524 goto out;
525 }
526 517
527 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) 518 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
528 goto out; 519 return;
529 520
530 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 521 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
531 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && 522 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
532 ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) 523 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
533 goto out; 524 return;
534 525
535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { 526 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || 527 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { 528 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
538 printk(KERN_INFO PFX 529 printk(KERN_INFO PFX
539 "Processor cpuid %x not supported\n", eax); 530 "Processor cpuid %x not supported\n", eax);
540 goto out; 531 return;
541 } 532 }
542 533
543 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); 534 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
544 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { 535 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
545 printk(KERN_INFO PFX 536 printk(KERN_INFO PFX
546 "No frequency change capabilities detected\n"); 537 "No frequency change capabilities detected\n");
547 goto out; 538 return;
548 } 539 }
549 540
550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 541 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
@@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu)
552 != P_STATE_TRANSITION_CAPABLE) { 543 != P_STATE_TRANSITION_CAPABLE) {
553 printk(KERN_INFO PFX 544 printk(KERN_INFO PFX
554 "Power state transitions not supported\n"); 545 "Power state transitions not supported\n");
555 goto out; 546 return;
556 } 547 }
557 } else { /* must be a HW Pstate capable processor */ 548 } else { /* must be a HW Pstate capable processor */
558 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 549 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
559 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) 550 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
560 cpu_family = CPU_HW_PSTATE; 551 cpu_family = CPU_HW_PSTATE;
561 else 552 else
562 goto out; 553 return;
563 } 554 }
564 555
565 rc = 1; 556 *rc = 0;
566
567out:
568 set_cpus_allowed_ptr(current, &oldmask);
569 return rc;
570} 557}
571 558
572static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, 559static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
@@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
823 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 810 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
824 return; 811 return;
825 812
826 control = data->acpi_data.states[index].control; data->irt = (control 813 control = data->acpi_data.states[index].control;
827 >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> 814 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
828 RVO_SHIFT) & RVO_MASK; data->exttype = (control 815 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
829 >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 816 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
830 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 817 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
831 << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = 818 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
832 (control >> VST_SHIFT) & VST_MASK; } 819 data->vstable = (control >> VST_SHIFT) & VST_MASK;
820}
833 821
834static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 822static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
835{ 823{
@@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)
1046 if (cur_latency > max_latency) 1034 if (cur_latency > max_latency)
1047 max_latency = cur_latency; 1035 max_latency = cur_latency;
1048 } 1036 }
1037 if (max_latency == 0) {
1038 /*
1039 * Fam 11h always returns 0 as transition latency.
1040 * This is intended and means "very fast". While cpufreq core
1041 * and governors currently can handle that gracefully, better
1042 * set it to 1 to avoid problems in the future.
1043 * For all others it's a BIOS bug.
1044 */
1045 if (boot_cpu_data.x86 != 0x11)
1046 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1047 "latency\n");
1048 max_latency = 1;
1049 }
1049 /* value in usecs, needs to be in nanoseconds */ 1050 /* value in usecs, needs to be in nanoseconds */
1050 return 1000 * max_latency; 1051 return 1000 * max_latency;
1051} 1052}
@@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1080 return 0; 1081 return 0;
1081 } 1082 }
1082 1083
1083 if ((fid < HI_FID_TABLE_BOTTOM) &&
1084 (data->currfid < HI_FID_TABLE_BOTTOM)) {
1085 printk(KERN_ERR PFX
1086 "ignoring illegal change in lo freq table-%x to 0x%x\n",
1087 data->currfid, fid);
1088 return 1;
1089 }
1090
1091 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", 1084 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1092 smp_processor_id(), fid, vid); 1085 smp_processor_id(), fid, vid);
1093 freqs.old = find_khz_freq_from_fid(data->currfid); 1086 freqs.old = find_khz_freq_from_fid(data->currfid);
1094 freqs.new = find_khz_freq_from_fid(fid); 1087 freqs.new = find_khz_freq_from_fid(fid);
1095 1088
1096 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1089 for_each_cpu(i, data->available_cores) {
1097 freqs.cpu = i; 1090 freqs.cpu = i;
1098 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1091 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1099 } 1092 }
@@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1101 res = transition_fid_vid(data, fid, vid); 1094 res = transition_fid_vid(data, fid, vid);
1102 freqs.new = find_khz_freq_from_fid(data->currfid); 1095 freqs.new = find_khz_freq_from_fid(data->currfid);
1103 1096
1104 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1097 for_each_cpu(i, data->available_cores) {
1105 freqs.cpu = i; 1098 freqs.cpu = i;
1106 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1099 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1107 } 1100 }
@@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1126 data->currpstate); 1119 data->currpstate);
1127 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1120 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1128 1121
1129 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1122 for_each_cpu(i, data->available_cores) {
1130 freqs.cpu = i; 1123 freqs.cpu = i;
1131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1124 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1132 } 1125 }
@@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1134 res = transition_pstate(data, pstate); 1127 res = transition_pstate(data, pstate);
1135 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1128 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1136 1129
1137 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1130 for_each_cpu(i, data->available_cores) {
1138 freqs.cpu = i; 1131 freqs.cpu = i;
1139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1132 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1140 } 1133 }
@@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1235 return cpufreq_frequency_table_verify(pol, data->powernow_table); 1228 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1236} 1229}
1237 1230
1238static const char ACPI_PSS_BIOS_BUG_MSG[] = 1231struct init_on_cpu {
1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1232 struct powernow_k8_data *data;
1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1233 int rc;
1234};
1235
1236static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1237{
1238 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1239
1240 if (pending_bit_stuck()) {
1241 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1242 init_on_cpu->rc = -ENODEV;
1243 return;
1244 }
1245
1246 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1247 init_on_cpu->rc = -ENODEV;
1248 return;
1249 }
1250
1251 if (cpu_family == CPU_OPTERON)
1252 fidvid_msr_init();
1253
1254 init_on_cpu->rc = 0;
1255}
1241 1256
1242/* per CPU init entry point to the driver */ 1257/* per CPU init entry point to the driver */
1243static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1258static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1244{ 1259{
1260 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1261 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1262 FW_BUG PFX "Try again with latest BIOS.\n";
1245 struct powernow_k8_data *data; 1263 struct powernow_k8_data *data;
1246 cpumask_t oldmask; 1264 struct init_on_cpu init_on_cpu;
1247 int rc; 1265 int rc;
1248 1266
1249 if (!cpu_online(pol->cpu)) 1267 if (!cpu_online(pol->cpu))
1250 return -ENODEV; 1268 return -ENODEV;
1251 1269
1252 if (!check_supported_cpu(pol->cpu)) 1270 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1271 if (rc)
1253 return -ENODEV; 1272 return -ENODEV;
1254 1273
1255 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); 1274 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
@@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1289 pol->cpuinfo.transition_latency = get_transition_latency(data); 1308 pol->cpuinfo.transition_latency = get_transition_latency(data);
1290 1309
1291 /* only run on specific CPU from here on */ 1310 /* only run on specific CPU from here on */
1292 oldmask = current->cpus_allowed; 1311 init_on_cpu.data = data;
1293 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1312 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1294 1313 &init_on_cpu, 1);
1295 if (smp_processor_id() != pol->cpu) { 1314 rc = init_on_cpu.rc;
1296 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1315 if (rc != 0)
1297 goto err_out_unmask; 1316 goto err_out_exit_acpi;
1298 }
1299
1300 if (pending_bit_stuck()) {
1301 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1302 goto err_out_unmask;
1303 }
1304
1305 if (query_current_values_with_pending_wait(data))
1306 goto err_out_unmask;
1307
1308 if (cpu_family == CPU_OPTERON)
1309 fidvid_msr_init();
1310
1311 /* run on any CPU again */
1312 set_cpus_allowed_ptr(current, &oldmask);
1313 1317
1314 if (cpu_family == CPU_HW_PSTATE) 1318 if (cpu_family == CPU_HW_PSTATE)
1315 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1319 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
@@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1346 1350
1347 return 0; 1351 return 0;
1348 1352
1349err_out_unmask: 1353err_out_exit_acpi:
1350 set_cpus_allowed_ptr(current, &oldmask);
1351 powernow_k8_cpu_exit_acpi(data); 1354 powernow_k8_cpu_exit_acpi(data);
1352 1355
1353err_out: 1356err_out:
@@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1372 return 0; 1375 return 0;
1373} 1376}
1374 1377
1378static void query_values_on_cpu(void *_err)
1379{
1380 int *err = _err;
1381 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1382
1383 *err = query_current_values_with_pending_wait(data);
1384}
1385
1375static unsigned int powernowk8_get(unsigned int cpu) 1386static unsigned int powernowk8_get(unsigned int cpu)
1376{ 1387{
1377 struct powernow_k8_data *data; 1388 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1378 cpumask_t oldmask = current->cpus_allowed;
1379 unsigned int khz = 0; 1389 unsigned int khz = 0;
1380 unsigned int first; 1390 int err;
1381
1382 first = cpumask_first(cpu_core_mask(cpu));
1383 data = per_cpu(powernow_data, first);
1384 1391
1385 if (!data) 1392 if (!data)
1386 return -EINVAL; 1393 return -EINVAL;
1387 1394
1388 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1395 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1389 if (smp_processor_id() != cpu) { 1396 if (err)
1390 printk(KERN_ERR PFX
1391 "limiting to CPU %d failed in powernowk8_get\n", cpu);
1392 set_cpus_allowed_ptr(current, &oldmask);
1393 return 0;
1394 }
1395
1396 if (query_current_values_with_pending_wait(data))
1397 goto out; 1397 goto out;
1398 1398
1399 if (cpu_family == CPU_HW_PSTATE) 1399 if (cpu_family == CPU_HW_PSTATE)
@@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu)
1404 1404
1405 1405
1406out: 1406out:
1407 set_cpus_allowed_ptr(current, &oldmask);
1408 return khz; 1407 return khz;
1409} 1408}
1410 1409
@@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void)
1430 unsigned int i, supported_cpus = 0; 1429 unsigned int i, supported_cpus = 0;
1431 1430
1432 for_each_online_cpu(i) { 1431 for_each_online_cpu(i) {
1433 if (check_supported_cpu(i)) 1432 int rc;
1433 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1434 if (rc == 0)
1434 supported_cpus++; 1435 supported_cpus++;
1435 } 1436 }
1436 1437
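The recurring conversion in this driver replaces "rebind current with set_cpus_allowed_ptr() and hope it lands on the right CPU" with smp_call_function_single(), which runs a short callback directly on the target CPU and reports back through a caller-owned structure. A sketch of the shape, with illustrative names, assuming a kernel build:

#include <linux/smp.h>
#include <linux/errno.h>

struct probe_request {
	int rc;			/* filled in on the target CPU */
};

static void probe_on_cpu(void *_req)
{
	struct probe_request *req = _req;

	/*
	 * Runs on the target CPU, usually with IRQs off: keep it short,
	 * never sleep.  Real code would read CPUID/MSRs here.
	 */
	req->rc = 0;
}

static int probe_cpu(int cpu)
{
	struct probe_request req;

	/* wait=1 blocks until the callback has completed on 'cpu'. */
	if (smp_call_function_single(cpu, probe_on_cpu, &req, 1))
		return -ENXIO;		/* CPU was not online */

	return req.rc;
}

Besides being simpler, this avoids touching the calling task's CPU affinity, which could race with userspace changing it at the same time.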
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 6c6698feade1..02ce824073cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
215 215
216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) 216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
217 217
218static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); 218static int core_voltage_pre_transition(struct powernow_k8_data *data,
219 u32 reqvid, u32 reqfid);
219static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); 220static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
220static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); 221static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
221 222
@@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
223 224
224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 225static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 226static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
226
227#ifdef CONFIG_SMP
228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
229{
230}
231#else
232static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
233{
234 cpu_set(0, cpu_sharedcore_mask[0]);
235}
236#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 55c831ed71ce..8d672ef162ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)
323{ 323{
324 unsigned l, h; 324 unsigned l, h;
325 unsigned clock_freq; 325 unsigned clock_freq;
326 cpumask_t saved_mask;
327 326
328 saved_mask = current->cpus_allowed; 327 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
330 if (smp_processor_id() != cpu)
331 return 0;
332
333 rdmsr(MSR_IA32_PERF_STATUS, l, h);
334 clock_freq = extract_clock(l, cpu, 0); 328 clock_freq = extract_clock(l, cpu, 0);
335 329
336 if (unlikely(clock_freq == 0)) { 330 if (unlikely(clock_freq == 0)) {
@@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
340 * P-state transition (like TM2). Get the last freq set 334 * P-state transition (like TM2). Get the last freq set
341 * in PERF_CTL. 335 * in PERF_CTL.
342 */ 336 */
343 rdmsr(MSR_IA32_PERF_CTL, l, h); 337 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
344 clock_freq = extract_clock(l, cpu, 1); 338 clock_freq = extract_clock(l, cpu, 1);
345 } 339 }
346
347 set_cpus_allowed_ptr(current, &saved_mask);
348 return clock_freq; 340 return clock_freq;
349} 341}
350 342
@@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,
467 struct cpufreq_freqs freqs; 459 struct cpufreq_freqs freqs;
468 int retval = 0; 460 int retval = 0;
469 unsigned int j, k, first_cpu, tmp; 461 unsigned int j, k, first_cpu, tmp;
470 cpumask_var_t saved_mask, covered_cpus; 462 cpumask_var_t covered_cpus;
471 463
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 464 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
473 return -ENOMEM;
474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 465 return -ENOMEM;
477 }
478 cpumask_copy(saved_mask, &current->cpus_allowed);
479 466
480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { 467 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
481 retval = -ENODEV; 468 retval = -ENODEV;
@@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,
493 480
494 first_cpu = 1; 481 first_cpu = 1;
495 for_each_cpu(j, policy->cpus) { 482 for_each_cpu(j, policy->cpus) {
496 const struct cpumask *mask; 483 int good_cpu;
497 484
498 /* cpufreq holds the hotplug lock, so we are safe here */ 485 /* cpufreq holds the hotplug lock, so we are safe here */
499 if (!cpu_online(j)) 486 if (!cpu_online(j))
@@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,
504 * Make sure we are running on CPU that wants to change freq 491 * Make sure we are running on CPU that wants to change freq
505 */ 492 */
506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 493 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
507 mask = policy->cpus; 494 good_cpu = cpumask_any_and(policy->cpus,
495 cpu_online_mask);
508 else 496 else
509 mask = cpumask_of(j); 497 good_cpu = j;
510 498
511 set_cpus_allowed_ptr(current, mask); 499 if (good_cpu >= nr_cpu_ids) {
512 preempt_disable();
513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
514 dprintk("couldn't limit to CPUs in this domain\n"); 500 dprintk("couldn't limit to CPUs in this domain\n");
515 retval = -EAGAIN; 501 retval = -EAGAIN;
516 if (first_cpu) { 502 if (first_cpu) {
517 /* We haven't started the transition yet. */ 503 /* We haven't started the transition yet. */
518 goto migrate_end; 504 goto out;
519 } 505 }
520 preempt_enable();
521 break; 506 break;
522 } 507 }
523 508
524 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; 509 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
525 510
526 if (first_cpu) { 511 if (first_cpu) {
527 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 512 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
528 if (msr == (oldmsr & 0xffff)) { 513 if (msr == (oldmsr & 0xffff)) {
529 dprintk("no change needed - msr was and needs " 514 dprintk("no change needed - msr was and needs "
530 "to be %x\n", oldmsr); 515 "to be %x\n", oldmsr);
531 retval = 0; 516 retval = 0;
532 goto migrate_end; 517 goto out;
533 } 518 }
534 519
535 freqs.old = extract_clock(oldmsr, cpu, 0); 520 freqs.old = extract_clock(oldmsr, cpu, 0);
@@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,
553 oldmsr |= msr; 538 oldmsr |= msr;
554 } 539 }
555 540
556 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 541 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
557 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 542 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
558 preempt_enable();
559 break; 543 break;
560 }
561 544
562 cpu_set(j, *covered_cpus); 545 cpumask_set_cpu(j, covered_cpus);
563 preempt_enable();
564 } 546 }
565 547
566 for_each_cpu(k, policy->cpus) { 548 for_each_cpu(k, policy->cpus) {
@@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,
578 * Best effort undo.. 560 * Best effort undo..
579 */ 561 */
580 562
581 for_each_cpu_mask_nr(j, *covered_cpus) { 563 for_each_cpu(j, covered_cpus)
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); 564 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
584 }
585 565
586 tmp = freqs.new; 566 tmp = freqs.new;
587 freqs.new = freqs.old; 567 freqs.new = freqs.old;
@@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,
593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 573 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
594 } 574 }
595 } 575 }
596 set_cpus_allowed_ptr(current, saved_mask);
597 retval = 0; 576 retval = 0;
598 goto out;
599 577
600migrate_end:
601 preempt_enable();
602 set_cpus_allowed_ptr(current, saved_mask);
603out: 578out:
604 free_cpumask_var(saved_mask);
605 free_cpumask_var(covered_cpus); 579 free_cpumask_var(covered_cpus);
606 return retval; 580 return retval;
607} 581}
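Here the migration dance disappears entirely because rdmsr_on_cpu()/wrmsr_on_cpu() already execute the MSR access on the named CPU. A sketch of a read-modify-write of the low 16 bits of IA32_PERF_CTL in that style; the helper name is made up:

#include <linux/types.h>
#include <asm/msr.h>

static int set_perf_bits(unsigned int cpu, u32 bits)
{
	u32 lo, hi;
	int err;

	err = rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &lo, &hi);
	if (err)
		return err;

	lo = (lo & ~0xffffU) | (bits & 0xffffU);
	return wrmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, lo, hi);
}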
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 016c1a4fa3fc..6911e91fb4f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -89,7 +89,8 @@ static int speedstep_find_register(void)
89 * speedstep_set_state - set the SpeedStep state 89 * speedstep_set_state - set the SpeedStep state
90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
91 * 91 *
92 * Tries to change the SpeedStep state. 92 * Tries to change the SpeedStep state. Can be called from
93 * smp_call_function_single.
93 */ 94 */
94static void speedstep_set_state(unsigned int state) 95static void speedstep_set_state(unsigned int state)
95{ 96{
@@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)
143 return; 144 return;
144} 145}
145 146
147/* Wrapper for smp_call_function_single. */
148static void _speedstep_set_state(void *_state)
149{
150 speedstep_set_state(*(unsigned int *)_state);
151}
146 152
147/** 153/**
148 * speedstep_activate - activate SpeedStep control in the chipset 154 * speedstep_activate - activate SpeedStep control in the chipset
@@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)
226 return 0; 232 return 0;
227} 233}
228 234
229static unsigned int _speedstep_get(const struct cpumask *cpus) 235struct get_freq_data {
230{
231 unsigned int speed; 236 unsigned int speed;
232 cpumask_t cpus_allowed; 237 unsigned int processor;
233 238};
234 cpus_allowed = current->cpus_allowed; 239
235 set_cpus_allowed_ptr(current, cpus); 240static void get_freq_data(void *_data)
236 speed = speedstep_get_frequency(speedstep_processor); 241{
237 set_cpus_allowed_ptr(current, &cpus_allowed); 242 struct get_freq_data *data = _data;
238 dprintk("detected %u kHz as current frequency\n", speed); 243
239 return speed; 244 data->speed = speedstep_get_frequency(data->processor);
240} 245}
241 246
242static unsigned int speedstep_get(unsigned int cpu) 247static unsigned int speedstep_get(unsigned int cpu)
243{ 248{
244 return _speedstep_get(cpumask_of(cpu)); 249 struct get_freq_data data = { .processor = cpu };
250
251 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
253 BUG();
254
255 dprintk("detected %u kHz as current frequency\n", data.speed);
256 return data.speed;
245} 257}
246 258
247/** 259/**
@@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,
257 unsigned int target_freq, 269 unsigned int target_freq,
258 unsigned int relation) 270 unsigned int relation)
259{ 271{
260 unsigned int newstate = 0; 272 unsigned int newstate = 0, policy_cpu;
261 struct cpufreq_freqs freqs; 273 struct cpufreq_freqs freqs;
262 cpumask_t cpus_allowed;
263 int i; 274 int i;
264 275
265 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], 276 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
266 target_freq, relation, &newstate)) 277 target_freq, relation, &newstate))
267 return -EINVAL; 278 return -EINVAL;
268 279
269 freqs.old = _speedstep_get(policy->cpus); 280 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
281 freqs.old = speedstep_get(policy_cpu);
270 freqs.new = speedstep_freqs[newstate].frequency; 282 freqs.new = speedstep_freqs[newstate].frequency;
271 freqs.cpu = policy->cpu; 283 freqs.cpu = policy->cpu;
272 284
@@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,
276 if (freqs.old == freqs.new) 288 if (freqs.old == freqs.new)
277 return 0; 289 return 0;
278 290
279 cpus_allowed = current->cpus_allowed;
280
281 for_each_cpu(i, policy->cpus) { 291 for_each_cpu(i, policy->cpus) {
282 freqs.cpu = i; 292 freqs.cpu = i;
283 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 293 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
284 } 294 }
285 295
286 /* switch to physical CPU where state is to be changed */ 296 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
287 set_cpus_allowed_ptr(current, policy->cpus); 297 true);
288
289 speedstep_set_state(newstate);
290
291 /* allow to be run on all CPUs */
292 set_cpus_allowed_ptr(current, &cpus_allowed);
293 298
294 for_each_cpu(i, policy->cpus) { 299 for_each_cpu(i, policy->cpus) {
295 freqs.cpu = i; 300 freqs.cpu = i;
@@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)
312 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 317 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
313} 318}
314 319
320struct get_freqs {
321 struct cpufreq_policy *policy;
322 int ret;
323};
324
325static void get_freqs_on_cpu(void *_get_freqs)
326{
327 struct get_freqs *get_freqs = _get_freqs;
328
329 get_freqs->ret =
330 speedstep_get_freqs(speedstep_processor,
331 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
332 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
333 &get_freqs->policy->cpuinfo.transition_latency,
334 &speedstep_set_state);
335}
315 336
316static int speedstep_cpu_init(struct cpufreq_policy *policy) 337static int speedstep_cpu_init(struct cpufreq_policy *policy)
317{ 338{
318 int result = 0; 339 int result;
319 unsigned int speed; 340 unsigned int policy_cpu, speed;
320 cpumask_t cpus_allowed; 341 struct get_freqs gf;
321 342
322 /* only run on CPU to be set, or on its sibling */ 343 /* only run on CPU to be set, or on its sibling */
323#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
324 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 345 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
325#endif 346#endif
326 347 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
327 cpus_allowed = current->cpus_allowed;
328 set_cpus_allowed_ptr(current, policy->cpus);
329 348
330 /* detect low and high frequency and transition latency */ 349 /* detect low and high frequency and transition latency */
331 result = speedstep_get_freqs(speedstep_processor, 350 gf.policy = policy;
332 &speedstep_freqs[SPEEDSTEP_LOW].frequency, 351 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
333 &speedstep_freqs[SPEEDSTEP_HIGH].frequency, 352 if (gf.ret)
334 &policy->cpuinfo.transition_latency, 353 return gf.ret;
335 &speedstep_set_state);
336 set_cpus_allowed_ptr(current, &cpus_allowed);
337 if (result)
338 return result;
339 354
340 /* get current speed setting */ 355 /* get current speed setting */
341 speed = _speedstep_get(policy->cpus); 356 speed = speedstep_get(policy_cpu);
342 if (!speed) 357 if (!speed)
343 return -EIO; 358 return -EIO;
344 359
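When a policy spans several CPUs, the code above first picks one online representative with cpumask_any_and() before firing the cross-call. A sketch of that selection, wrapped in a hypothetical helper:

#include <linux/cpumask.h>

/* Return any online CPU from the policy mask, or -1 if none is online. */
static int pick_online_cpu(const struct cpumask *policy_cpus)
{
	unsigned int cpu = cpumask_any_and(policy_cpus, cpu_online_mask);

	return cpu < nr_cpu_ids ? (int)cpu : -1;
}

The get_freqs and get_freq_data structures above show the usual way to pass more than one input or output through smp_call_function_single()'s single void * argument.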
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 2e3c6862657b..f4c290b8482f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)
226} 226}
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(unsigned int processor)
230{ 231{
231 switch (processor) { 232 switch (processor) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index daed39ba2614..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
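The kmemcheck hunk above is a plain MSR read-modify-write: read IA32_MISC_ENABLE, clear the fast-string enable bit if set, write it back on the current CPU. Restated as a sketch; it is only meaningful on CPUs that implement the bit, which is why the hunk gates it on family 15:

#include <linux/types.h>
#include <asm/msr.h>

static void disable_fast_strings(void)
{
	u64 misc_enable;

	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);

	if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
		misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
	}
}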
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 45004faf67ea..188a1ca5ad2b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,12 @@
1obj-y = mce.o therm_throt.o 1obj-y = mce.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o 6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o 7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
9obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
10obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
11obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11
12obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index 89e510424152..b945d5dbc609 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -10,10 +10,9 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* Machine Check Handler For AMD Athlon/Duron: */ 16/* Machine Check Handler For AMD Athlon/Duron: */
18static void k7_machine_check(struct pt_regs *regs, long error_code) 17static void k7_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index fabba15e4558..01213048f62f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -44,7 +44,6 @@
44#include <asm/msr.h> 44#include <asm/msr.h>
45 45
46#include "mce-internal.h" 46#include "mce-internal.h"
47#include "mce.h"
48 47
49/* Handle unconfigured int18 (should never happen) */ 48/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code) 49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
@@ -57,7 +56,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
57void (*machine_check_vector)(struct pt_regs *, long error_code) = 56void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check; 57 unexpected_machine_check;
59 58
60int mce_disabled; 59int mce_disabled __read_mostly;
61 60
62#ifdef CONFIG_X86_NEW_MCE 61#ifdef CONFIG_X86_NEW_MCE
63 62
@@ -76,21 +75,22 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 75 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only) 76 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */ 77 */
79static int tolerant = 1; 78static int tolerant __read_mostly = 1;
80static int banks; 79static int banks __read_mostly;
81static u64 *bank; 80static u64 *bank __read_mostly;
82static unsigned long notify_user; 81static int rip_msr __read_mostly;
83static int rip_msr; 82static int mce_bootlog __read_mostly = -1;
84static int mce_bootlog = -1; 83static int monarch_timeout __read_mostly = -1;
85static int monarch_timeout = -1; 84static int mce_panic_timeout __read_mostly;
86static int mce_panic_timeout; 85static int mce_dont_log_ce __read_mostly;
87static int mce_dont_log_ce; 86int mce_cmci_disabled __read_mostly;
88int mce_cmci_disabled; 87int mce_ignore_ce __read_mostly;
89int mce_ignore_ce; 88int mce_ser __read_mostly;
90int mce_ser; 89
91 90/* User mode helper program triggered by machine check event */
92static char trigger[128]; 91static unsigned long mce_need_notify;
93static char *trigger_argv[2] = { trigger, NULL }; 92static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL };
94 94
95static unsigned long dont_init_banks; 95static unsigned long dont_init_banks;
96 96
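Most of the hunk above is the __read_mostly annotation: variables that are written once at boot (tunables, the bank table) move into a dedicated read-mostly data section so they do not share cache lines with frequently written data. Illustrative declarations only, with made-up names:

#include <linux/cache.h>
#include <linux/types.h>

static int tolerance_level __read_mostly = 1;
static u64 *bank_ctl __read_mostly;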
@@ -180,7 +180,7 @@ void mce_log(struct mce *mce)
180 wmb(); 180 wmb();
181 181
182 mce->finished = 1; 182 mce->finished = 1;
183 set_bit(0, &notify_user); 183 set_bit(0, &mce_need_notify);
184} 184}
185 185
186static void print_mce(struct mce *m) 186static void print_mce(struct mce *m)
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m)
194 m->cs, m->ip); 194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS) 195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip); 196 print_symbol("{%s}", m->ip);
197 printk("\n"); 197 printk(KERN_CONT "\n");
198 } 198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc); 199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr) 200 if (m->addr)
201 printk("ADDR %llx ", m->addr); 201 printk(KERN_CONT "ADDR %llx ", m->addr);
202 if (m->misc) 202 if (m->misc)
203 printk("MISC %llx ", m->misc); 203 printk(KERN_CONT "MISC %llx ", m->misc);
204 printk("\n"); 204 printk(KERN_CONT "\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 207 m->apicid);
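The printk changes here tag continuation pieces with KERN_CONT; without it each printk() call starts a new log record and the assembled line gets broken apart. A sketch with made-up fields:

#include <linux/kernel.h>
#include <linux/types.h>

static void print_record(u64 addr, u64 misc)
{
	printk(KERN_EMERG "RECORD ");
	if (addr)
		printk(KERN_CONT "ADDR %llx ", addr);
	if (misc)
		printk(KERN_CONT "MISC %llx ", misc);
	printk(KERN_CONT "\n");
}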
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m)
209 209
210static void print_mce_head(void) 210static void print_mce_head(void)
211{ 211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 212 printk(KERN_EMERG "\nHARDWARE ERROR\n");
213} 213}
214 214
215static void print_mce_tail(void) 215static void print_mce_tail(void)
216{ 216{
217 printk(KERN_EMERG "This is not a software problem!\n" 217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 218 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219} 219}
220 220
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 221#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -691,18 +691,21 @@ static atomic_t global_nwo;
691 * in the entry order. 691 * in the entry order.
692 * TBD double check parallel CPU hotunplug 692 * TBD double check parallel CPU hotunplug
693 */ 693 */
694static int mce_start(int no_way_out, int *order) 694static int mce_start(int *no_way_out)
695{ 695{
696 int nwo; 696 int order;
697 int cpus = num_online_cpus(); 697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699 699
700 if (!timeout) { 700 if (!timeout)
701 *order = -1; 701 return -1;
702 return no_way_out;
703 }
704 702
705 atomic_add(no_way_out, &global_nwo); 703 atomic_add(*no_way_out, &global_nwo);
704 /*
705 * global_nwo should be updated before mce_callin
706 */
707 smp_wmb();
708 order = atomic_add_return(1, &mce_callin);
706 709
707 /* 710 /*
708 * Wait for everyone. 711 * Wait for everyone.
@@ -710,40 +713,43 @@ static int mce_start(int no_way_out, int *order)
710 while (atomic_read(&mce_callin) != cpus) { 713 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) { 714 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0); 715 atomic_set(&global_nwo, 0);
713 *order = -1; 716 return -1;
714 return no_way_out;
715 } 717 }
716 ndelay(SPINUNIT); 718 ndelay(SPINUNIT);
717 } 719 }
718 720
719 /* 721 /*
720 * Cache the global no_way_out state. 722 * mce_callin should be read before global_nwo
721 */ 723 */
722 nwo = atomic_read(&global_nwo); 724 smp_rmb();
723 725
724 /* 726 if (order == 1) {
725 * Monarch starts executing now, the others wait. 727 /*
726 */ 728 * Monarch: Starts executing now, the others wait.
727 if (*order == 1) { 729 */
728 atomic_set(&mce_executing, 1); 730 atomic_set(&mce_executing, 1);
729 return nwo; 731 } else {
732 /*
733 * Subject: Now start the scanning loop one by one in
734 * the original callin order.
735 * This way when there are any shared banks it will be
736 * only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 return -1;
742 }
743 ndelay(SPINUNIT);
744 }
730 } 745 }
731 746
732 /* 747 /*
733 * Now start the scanning loop one by one 748 * Cache the global no_way_out state.
734 * in the original callin order.
735 * This way when there are any shared banks it will
736 * be only seen by one CPU before cleared, avoiding duplicates.
737 */ 749 */
738 while (atomic_read(&mce_executing) < *order) { 750 *no_way_out = atomic_read(&global_nwo);
739 if (mce_timed_out(&timeout)) { 751
740 atomic_set(&global_nwo, 0); 752 return order;
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747} 753}
748 754
749/* 755/*
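
The rewritten mce_start() folds the callin ticket into the rendezvous itself: every CPU publishes its no_way_out vote, takes a ticket from mce_callin, and ticket 1 becomes the monarch while the rest scan in callin order; the smp_wmb()/smp_rmb() pair makes sure all votes are visible before the tickets are compared. A minimal user-space analogue of that rendezvous, using C11 atomics and threads (sequentially consistent operations stand in for the explicit barriers, and the vote values are made up for illustration):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int global_nwo;   /* accumulated "no way out" votes    */
static atomic_int callin;       /* rendezvous ticket counter         */
static atomic_int executing;    /* how far the ordered scan has got  */

static void *cpu_thread(void *arg)
{
    int vote = (int)(long)arg & 1;              /* pretend severity vote */
    int order;

    atomic_fetch_add(&global_nwo, vote);        /* publish the vote...   */
    order = atomic_fetch_add(&callin, 1) + 1;   /* ...then take a ticket */

    while (atomic_load(&callin) != NCPUS)
        ;                                       /* wait for everyone     */

    if (order == 1)
        atomic_store(&executing, 1);            /* monarch scans first   */
    else
        while (atomic_load(&executing) < order)
            ;                                   /* subjects wait in turn */

    printf("order %d sees no_way_out=%d\n", order, atomic_load(&global_nwo));
    atomic_fetch_add(&executing, 1);            /* hand over to the next */
    return NULL;
}

int main(void)
{
    pthread_t t[NCPUS];

    for (long i = 0; i < NCPUS; i++)
        pthread_create(&t[i], NULL, cpu_thread, (void *)i);
    for (int i = 0; i < NCPUS; i++)
        pthread_join(t[i], NULL);
    return 0;
}
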
@@ -863,7 +869,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
863 * check handler. 869 * check handler.
864 */ 870 */
865 int order; 871 int order;
866
867 /* 872 /*
868 * If no_way_out gets set, there is no safe way to recover from this 873 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway. 874 * MCE. If tolerant is cranked up, we'll try anyway.
@@ -887,7 +892,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
887 if (!banks) 892 if (!banks)
888 goto out; 893 goto out;
889 894
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m); 895 mce_setup(&m);
892 896
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -909,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
909 * This way we don't report duplicated events on shared banks 913 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it. 914 * because the first one to see it will clear it.
911 */ 915 */
912 no_way_out = mce_start(no_way_out, &order); 916 order = mce_start(&no_way_out);
913 for (i = 0; i < banks; i++) { 917 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear); 918 __clear_bit(i, toclear);
915 if (!bank[i]) 919 if (!bank[i])
@@ -1113,12 +1117,12 @@ static void mcheck_timer(unsigned long data)
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114 1118
1115 t->expires = jiffies + *n; 1119 t->expires = jiffies + *n;
1116 add_timer(t); 1120 add_timer_on(t, smp_processor_id());
1117} 1121}
1118 1122
1119static void mce_do_trigger(struct work_struct *work) 1123static void mce_do_trigger(struct work_struct *work)
1120{ 1124{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 1125 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1122} 1126}
1123 1127
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1128static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
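
Two behavioural points in this hunk: the silent-poll interval keeps doubling until it hits check_interval, and the timer is re-armed with add_timer_on() so it stays pinned to the CPU whose banks it polls instead of possibly migrating as plain add_timer() allowed. A toy sketch of the back-off arithmetic, with seconds standing in for jiffies and the usual 5 minute default assumed for the cap:

#include <stdio.h>

int main(void)
{
    const int check_interval = 5 * 60;  /* assumed default cap: 5 minutes */
    int n = 1;                          /* current silent-poll interval   */

    for (int round = 0; round < 12; round++) {
        printf("next poll in %d s\n", n);
        n = (n * 2 < check_interval) ? n * 2 : check_interval;
    }
    return 0;
}
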
@@ -1135,7 +1139,7 @@ int mce_notify_irq(void)
1135 1139
1136 clear_thread_flag(TIF_MCE_NOTIFY); 1140 clear_thread_flag(TIF_MCE_NOTIFY);
1137 1141
1138 if (test_and_clear_bit(0, &notify_user)) { 1142 if (test_and_clear_bit(0, &mce_need_notify)) {
1139 wake_up_interruptible(&mce_wait); 1143 wake_up_interruptible(&mce_wait);
1140 1144
1141 /* 1145 /*
@@ -1143,7 +1147,7 @@ int mce_notify_irq(void)
1143 * work_pending is always cleared before the function is 1147 * work_pending is always cleared before the function is
1144 * executed. 1148 * executed.
1145 */ 1149 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work)) 1150 if (mce_helper[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work); 1151 schedule_work(&mce_trigger_work);
1148 1152
1149 if (__ratelimit(&ratelimit)) 1153 if (__ratelimit(&ratelimit))
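
mce_notify_irq() keeps its latch-and-notify shape; only the names change (notify_user becomes mce_need_notify, trigger becomes mce_helper). The pattern, set a pending bit from the logging path and let the notifier consume it atomically so readers are woken and the user mode helper is queued at most once per latched event, can be sketched in plain C11 (the names below are illustrative, not the kernel symbols):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool need_notify;

static void event_seen(void)
{
    atomic_store(&need_notify, true);           /* like set_bit() in mce_log() */
}

static bool notify(void)
{
    if (atomic_exchange(&need_notify, false)) { /* like test_and_clear_bit()   */
        printf("wake readers, schedule trigger helper\n");
        return true;
    }
    return false;
}

int main(void)
{
    event_seen();
    printf("first notify: %d\n", notify());     /* 1: event consumed  */
    printf("second notify: %d\n", notify());    /* 0: nothing pending */
    return 0;
}
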
@@ -1222,8 +1226,13 @@ static void mce_init(void)
1222} 1226}
1223 1227
1224/* Add per CPU specific workarounds here */ 1228/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1229static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{ 1230{
1231 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1232 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1233 return -EOPNOTSUPP;
1234 }
1235
1227 /* This should be disabled by the BIOS, but isn't always */ 1236 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) { 1237 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) { 1238 if (c->x86 == 15 && banks > 4) {
@@ -1245,7 +1254,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1245 * Various K7s with broken bank 0 around. Always disable 1254 * Various K7s with broken bank 0 around. Always disable
1246 * by default. 1255 * by default.
1247 */ 1256 */
1248 if (c->x86 == 6) 1257 if (c->x86 == 6 && banks > 0)
1249 bank[0] = 0; 1258 bank[0] = 0;
1250 } 1259 }
1251 1260
@@ -1269,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1278 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0) 1279 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC; 1280 monarch_timeout = USEC_PER_SEC;
1281
1282 /*
1283 * There are also broken BIOSes on some Pentium M and
1284 * earlier systems:
1285 */
1286 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1287 mce_bootlog = 0;
1272 } 1288 }
1273 if (monarch_timeout < 0) 1289 if (monarch_timeout < 0)
1274 monarch_timeout = 0; 1290 monarch_timeout = 0;
1275 if (mce_bootlog != 0) 1291 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30; 1292 mce_panic_timeout = 30;
1293
1294 return 0;
1277} 1295}
1278 1296
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1297static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1282,8 +1300,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1282 return; 1300 return;
1283 switch (c->x86_vendor) { 1301 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL: 1302 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled()) 1303 intel_p5_mcheck_init(c);
1286 intel_p5_mcheck_init(c);
1287 break; 1304 break;
1288 case X86_VENDOR_CENTAUR: 1305 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c); 1306 winchip_mcheck_init(c);
@@ -1318,7 +1335,7 @@ static void mce_init_timer(void)
1318 return; 1335 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id()); 1336 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n); 1337 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t); 1338 add_timer_on(t, smp_processor_id());
1322} 1339}
1323 1340
1324/* 1341/*
@@ -1335,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1335 if (!mce_available(c)) 1352 if (!mce_available(c))
1336 return; 1353 return;
1337 1354
1338 if (mce_cap_init() < 0) { 1355 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
1339 mce_disabled = 1; 1356 mce_disabled = 1;
1340 return; 1357 return;
1341 } 1358 }
1342 mce_cpu_quirks(c);
1343 1359
1344 machine_check_vector = do_machine_check; 1360 machine_check_vector = do_machine_check;
1345 1361
@@ -1609,8 +1625,9 @@ static int mce_resume(struct sys_device *dev)
1609static void mce_cpu_restart(void *data) 1625static void mce_cpu_restart(void *data)
1610{ 1626{
1611 del_timer_sync(&__get_cpu_var(mce_timer)); 1627 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data)) 1628 if (!mce_available(&current_cpu_data))
1613 mce_init(); 1629 return;
1630 mce_init();
1614 mce_init_timer(); 1631 mce_init_timer();
1615} 1632}
1616 1633
@@ -1620,6 +1637,26 @@ static void mce_restart(void)
1620 on_each_cpu(mce_cpu_restart, NULL, 1); 1637 on_each_cpu(mce_cpu_restart, NULL, 1);
1621} 1638}
1622 1639
1640/* Toggle features for corrected errors */
1641static void mce_disable_ce(void *all)
1642{
1643 if (!mce_available(&current_cpu_data))
1644 return;
1645 if (all)
1646 del_timer_sync(&__get_cpu_var(mce_timer));
1647 cmci_clear();
1648}
1649
1650static void mce_enable_ce(void *all)
1651{
1652 if (!mce_available(&current_cpu_data))
1653 return;
1654 cmci_reenable();
1655 cmci_recheck();
1656 if (all)
1657 mce_init_timer();
1658}
1659
1623static struct sysdev_class mce_sysclass = { 1660static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend, 1661 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown, 1662 .shutdown = mce_shutdown,
@@ -1659,26 +1696,70 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1659static ssize_t 1696static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1697show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{ 1698{
1662 strcpy(buf, trigger); 1699 strcpy(buf, mce_helper);
1663 strcat(buf, "\n"); 1700 strcat(buf, "\n");
1664 return strlen(trigger) + 1; 1701 return strlen(mce_helper) + 1;
1665} 1702}
1666 1703
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1704static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz) 1705 const char *buf, size_t siz)
1669{ 1706{
1670 char *p; 1707 char *p;
1671 int len;
1672 1708
1673 strncpy(trigger, buf, sizeof(trigger)); 1709 strncpy(mce_helper, buf, sizeof(mce_helper));
1674 trigger[sizeof(trigger)-1] = 0; 1710 mce_helper[sizeof(mce_helper)-1] = 0;
1675 len = strlen(trigger); 1711 p = strchr(mce_helper, '\n');
1676 p = strchr(trigger, '\n');
1677 1712
1678 if (*p) 1713 if (p)
1679 *p = 0; 1714 *p = 0;
1680 1715
1681 return len; 1716 return strlen(mce_helper) + !!p;
1717}
1718
1719static ssize_t set_ignore_ce(struct sys_device *s,
1720 struct sysdev_attribute *attr,
1721 const char *buf, size_t size)
1722{
1723 u64 new;
1724
1725 if (strict_strtoull(buf, 0, &new) < 0)
1726 return -EINVAL;
1727
1728 if (mce_ignore_ce ^ !!new) {
1729 if (new) {
1730 /* disable ce features */
1731 on_each_cpu(mce_disable_ce, (void *)1, 1);
1732 mce_ignore_ce = 1;
1733 } else {
1734 /* enable ce features */
1735 mce_ignore_ce = 0;
1736 on_each_cpu(mce_enable_ce, (void *)1, 1);
1737 }
1738 }
1739 return size;
1740}
1741
1742static ssize_t set_cmci_disabled(struct sys_device *s,
1743 struct sysdev_attribute *attr,
1744 const char *buf, size_t size)
1745{
1746 u64 new;
1747
1748 if (strict_strtoull(buf, 0, &new) < 0)
1749 return -EINVAL;
1750
1751 if (mce_cmci_disabled ^ !!new) {
1752 if (new) {
1753 /* disable cmci */
1754 on_each_cpu(mce_disable_ce, NULL, 1);
1755 mce_cmci_disabled = 1;
1756 } else {
1757 /* enable cmci */
1758 mce_cmci_disabled = 0;
1759 on_each_cpu(mce_enable_ce, NULL, 1);
1760 }
1761 }
1762 return size;
1682} 1763}
1683 1764
1684static ssize_t store_int_with_restart(struct sys_device *s, 1765static ssize_t store_int_with_restart(struct sys_device *s,
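
The new ignore_ce and cmci_disabled attributes act only when the written value really flips the state (the old ^ !!new test), and the flag update is ordered around the cross calls on purpose: cmci_supported() consults these flags, so CE features are torn down before the flag is set and the flag is cleared before they are re-enabled. A stripped-down sketch of that edge-triggered toggle, with printf standing in for the on_each_cpu() cross calls:

#include <stdio.h>

static int ignore_ce;

static void store_ignore_ce(unsigned long long new)
{
    if (ignore_ce ^ !!new) {            /* act only on an actual change  */
        if (new) {
            printf("disable CE features on all CPUs\n");
            ignore_ce = 1;              /* flag set after teardown       */
        } else {
            ignore_ce = 0;              /* flag cleared before re-enable */
            printf("re-enable CE features on all CPUs\n");
        }
    }
}

int main(void)
{
    store_ignore_ce(1);   /* disables                */
    store_ignore_ce(7);   /* no-op: already disabled */
    store_ignore_ce(0);   /* re-enables              */
    return 0;
}
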
@@ -1693,6 +1774,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1774static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1775static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1776static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1777static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1696 1778
1697static struct sysdev_ext_attribute attr_check_interval = { 1779static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1780 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1700,9 +1782,24 @@ static struct sysdev_ext_attribute attr_check_interval = {
1700 &check_interval 1782 &check_interval
1701}; 1783};
1702 1784
1785static struct sysdev_ext_attribute attr_ignore_ce = {
1786 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1787 &mce_ignore_ce
1788};
1789
1790static struct sysdev_ext_attribute attr_cmci_disabled = {
1791 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1792 &mce_cmci_disabled
1793};
1794
1703static struct sysdev_attribute *mce_attrs[] = { 1795static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1796 &attr_tolerant.attr,
1797 &attr_check_interval.attr,
1798 &attr_trigger,
1705 &attr_monarch_timeout.attr, 1799 &attr_monarch_timeout.attr,
1800 &attr_dont_log_ce.attr,
1801 &attr_ignore_ce.attr,
1802 &attr_cmci_disabled.attr,
1706 NULL 1803 NULL
1707}; 1804};
1708 1805
@@ -1712,7 +1809,7 @@ static cpumask_var_t mce_dev_initialized;
1712static __cpuinit int mce_create_device(unsigned int cpu) 1809static __cpuinit int mce_create_device(unsigned int cpu)
1713{ 1810{
1714 int err; 1811 int err;
1715 int i; 1812 int i, j;
1716 1813
1717 if (!mce_available(&boot_cpu_data)) 1814 if (!mce_available(&boot_cpu_data))
1718 return -EIO; 1815 return -EIO;
@@ -1730,9 +1827,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1730 if (err) 1827 if (err)
1731 goto error; 1828 goto error;
1732 } 1829 }
1733 for (i = 0; i < banks; i++) { 1830 for (j = 0; j < banks; j++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1831 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]); 1832 &bank_attrs[j]);
1736 if (err) 1833 if (err)
1737 goto error2; 1834 goto error2;
1738 } 1835 }
@@ -1740,8 +1837,8 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1740 1837
1741 return 0; 1838 return 0;
1742error2: 1839error2:
1743 while (--i >= 0) 1840 while (--j >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1841 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
1745error: 1842error:
1746 while (--i >= 0) 1843 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1844 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
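
The mce_create_device() change is purely an error-unwinding fix: the bank files used the same index as the per-CPU attributes, so a failure in the bank loop rewound the wrong objects. With separate indices, error2 removes only the bank files created so far and then falls through to remove all of the attributes. The shape of the fix in a self-contained sketch, where create_attr()/create_bank() are stand-ins for the sysdev_create_file() calls:

#include <stdio.h>

#define NATTRS 3
#define NBANKS 4

/* stand-ins for sysdev_create_file(); bank creation fails at j == 2 */
static int create_attr(int i) { (void)i; return 0; }
static int create_bank(int j) { return j == 2 ? -1 : 0; }

int main(void)
{
    int i, j;

    for (i = 0; i < NATTRS; i++)
        if (create_attr(i))
            goto err_attrs;
    for (j = 0; j < NBANKS; j++)
        if (create_bank(j))
            goto err_banks;
    printf("all files created\n");
    return 0;

err_banks:
    while (--j >= 0)
        printf("removing bank file %d\n", j);
err_attrs:
    while (--i >= 0)
        printf("removing attribute %d\n", i);
    return 1;
}
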
@@ -1883,7 +1980,7 @@ static __init int mce_init_device(void)
1883 if (!mce_available(&boot_cpu_data)) 1980 if (!mce_available(&boot_cpu_data))
1884 return -EIO; 1981 return -EIO;
1885 1982
1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1983 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887 1984
1888 err = mce_init_banks(); 1985 err = mce_init_banks();
1889 if (err) 1986 if (err)
@@ -1915,7 +2012,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1915/* This has to be run for each processor */ 2012/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c) 2013void mcheck_init(struct cpuinfo_x86 *c)
1917{ 2014{
1918 if (mce_disabled == 1) 2015 if (mce_disabled)
1919 return; 2016 return;
1920 2017
1921 switch (c->x86_vendor) { 2018 switch (c->x86_vendor) {
@@ -1945,10 +2042,9 @@ void mcheck_init(struct cpuinfo_x86 *c)
1945 2042
1946static int __init mcheck_enable(char *str) 2043static int __init mcheck_enable(char *str)
1947{ 2044{
1948 mce_disabled = -1; 2045 mce_p5_enabled = 1;
1949 return 1; 2046 return 1;
1950} 2047}
1951
1952__setup("mce", mcheck_enable); 2048__setup("mce", mcheck_enable);
1953 2049
1954#endif /* CONFIG_X86_OLD_MCE */ 2050#endif /* CONFIG_X86_OLD_MCE */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
deleted file mode 100644
index 84a552b458c8..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4#ifdef CONFIG_X86_OLD_MCE
5void amd_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8#endif
9
10#ifdef CONFIG_X86_ANCIENT_MCE
11void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
12void winchip_mcheck_init(struct cpuinfo_x86 *c);
13extern int mce_p5_enable;
14static inline int mce_p5_enabled(void) { return mce_p5_enable; }
15static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
16#else
17static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
18static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
19static inline int mce_p5_enabled(void) { return 0; }
20static inline void enable_p5_mce(void) { }
21#endif
22
23/* Call the installed machine check handler for this CPU setup. */
24extern void (*machine_check_vector)(struct pt_regs *, long error_code);
25
26#ifdef CONFIG_X86_OLD_MCE
27
28extern int nr_mce_banks;
29
30void intel_set_thermal_handler(void);
31
32#else
33
34static inline void intel_set_thermal_handler(void) { }
35
36#endif
37
38void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 2b011d2d8579..e1acec0f7a32 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -1,74 +1,226 @@
1/* 1/*
2 * Common code for Intel machine checks 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
3 */ 6 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9 7
10#include <asm/therm_throt.h> 8#include <linux/init.h>
11#include <asm/processor.h> 9#include <linux/interrupt.h>
12#include <asm/system.h> 10#include <linux/percpu.h>
13#include <asm/apic.h> 11#include <asm/apic.h>
12#include <asm/processor.h>
14#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/mce.h>
15
16/*
17 * Support for Intel Correct Machine Check Interrupts. This allows
18 * the CPU to raise an interrupt when a corrected machine check happened.
19 * Normally we pick those up using a regular polling timer.
20 * Also supports reliable discovery of shared banks.
21 */
15 22
16#include "mce.h" 23static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
17 24
18void intel_init_thermal(struct cpuinfo_x86 *c) 25/*
26 * cmci_discover_lock protects against parallel discovery attempts
27 * which could race against each other.
28 */
29static DEFINE_SPINLOCK(cmci_discover_lock);
30
31#define CMCI_THRESHOLD 1
32
33static int cmci_supported(int *banks)
19{ 34{
20 unsigned int cpu = smp_processor_id(); 35 u64 cap;
21 int tm2 = 0;
22 u32 l, h;
23 36
24 /* Thermal monitoring depends on ACPI and clock modulation*/ 37 if (mce_cmci_disabled || mce_ignore_ce)
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) 38 return 0;
26 return;
27 39
28 /* 40 /*
29 * First check if its enabled already, in which case there might 41 * Vendor check is not strictly needed, but the initial
30 * be some SMM goo which handles it, so we can't even put a handler 42 * initialization is vendor keyed and this
31 * since it might be delivered via SMI already: 43 * makes sure none of the backdoors are entered otherwise.
32 */ 44 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 45 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
34 h = apic_read(APIC_LVTTHMR); 46 return 0;
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 47 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
36 printk(KERN_DEBUG 48 return 0;
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 49 rdmsrl(MSR_IA32_MCG_CAP, cap);
38 return; 50 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
51 return !!(cap & MCG_CMCI_P);
52}
53
54/*
55 * The interrupt handler. This is called on every event.
56 * Just call the poller directly to log any events.
57 * This could in theory increase the threshold under high load,
58 * but doesn't for now.
59 */
60static void intel_threshold_interrupt(void)
61{
62 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
63 mce_notify_irq();
64}
65
66static void print_update(char *type, int *hdr, int num)
67{
68 if (*hdr == 0)
69 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
70 *hdr = 1;
71 printk(KERN_CONT " %s:%d", type, num);
72}
73
74/*
75 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
76 * on this CPU. Use the algorithm recommended in the SDM to discover shared
77 * banks.
78 */
79static void cmci_discover(int banks, int boot)
80{
81 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
82 unsigned long flags;
83 int hdr = 0;
84 int i;
85
86 spin_lock_irqsave(&cmci_discover_lock, flags);
87 for (i = 0; i < banks; i++) {
88 u64 val;
89
90 if (test_bit(i, owned))
91 continue;
92
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
94
95 /* Already owned by someone else? */
96 if (val & CMCI_EN) {
97 if (test_and_clear_bit(i, owned) || boot)
98 print_update("SHD", &hdr, i);
99 __clear_bit(i, __get_cpu_var(mce_poll_banks));
100 continue;
101 }
102
103 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
106
107 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) {
109 if (!test_and_set_bit(i, owned) || boot)
110 print_update("CMCI", &hdr, i);
111 __clear_bit(i, __get_cpu_var(mce_poll_banks));
112 } else {
113 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
114 }
39 } 115 }
116 spin_unlock_irqrestore(&cmci_discover_lock, flags);
117 if (hdr)
118 printk(KERN_CONT "\n");
119}
120
121/*
122 * Just in case we missed an event during initialization check
123 * all the CMCI owned banks.
124 */
125void cmci_recheck(void)
126{
127 unsigned long flags;
128 int banks;
129
130 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
131 return;
132 local_irq_save(flags);
133 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
134 local_irq_restore(flags);
135}
40 136
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) 137/*
42 tm2 = 1; 138 * Disable CMCI on this CPU for all banks it owns when it goes down.
139 * This allows other CPUs to claim the banks on rediscovery.
140 */
141void cmci_clear(void)
142{
143 unsigned long flags;
144 int i;
145 int banks;
146 u64 val;
43 147
44 /* Check whether a vector already exists */ 148 if (!cmci_supported(&banks))
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return; 149 return;
150 spin_lock_irqsave(&cmci_discover_lock, flags);
151 for (i = 0; i < banks; i++) {
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue;
154 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned));
50 } 159 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags);
161}
162
163/*
164 * After a CPU went down cycle through all the others and rediscover
165 * Must run in process context.
166 */
167void cmci_rediscover(int dying)
168{
169 int banks;
170 int cpu;
171 cpumask_var_t old;
172
173 if (!cmci_supported(&banks))
174 return;
175 if (!alloc_cpumask_var(&old, GFP_KERNEL))
176 return;
177 cpumask_copy(old, &current->cpus_allowed);
51 178
52 /* We'll mask the thermal vector in the lapic till we're ready: */ 179 for_each_online_cpu(cpu) {
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 180 if (cpu == dying)
54 apic_write(APIC_LVTTHMR, h); 181 continue;
182 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
183 continue;
184 /* Recheck banks in case CPUs don't all have the same */
185 if (cmci_supported(&banks))
186 cmci_discover(banks, 0);
187 }
55 188
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 189 set_cpus_allowed_ptr(current, old);
57 wrmsr(MSR_IA32_THERM_INTERRUPT, 190 free_cpumask_var(old);
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 191}
59 192
60 intel_set_thermal_handler(); 193/*
194 * Reenable CMCI on this CPU in case a CPU down failed.
195 */
196void cmci_reenable(void)
197{
198 int banks;
199 if (cmci_supported(&banks))
200 cmci_discover(banks, 0);
201}
61 202
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 203static void intel_init_cmci(void)
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); 204{
205 int banks;
64 206
65 /* Unmask the thermal vector: */ 207 if (!cmci_supported(&banks))
66 l = apic_read(APIC_LVTTHMR); 208 return;
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68 209
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 210 mce_threshold_vector = intel_threshold_interrupt;
70 cpu, tm2 ? "TM2" : "TM1"); 211 cmci_discover(banks, 1);
212 /*
213 * For CPU #0 this runs with still disabled APIC, but that's
214 * ok because only the vector is set up. We still do another
215 * check for the banks later for CPU #0 just to make sure
216 * to not miss any events.
217 */
218 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
219 cmci_recheck();
220}
71 221
72 /* enable thermal throttle processing */ 222void mce_intel_feature_init(struct cpuinfo_x86 *c)
73 atomic_set(&therm_throt_en, 1); 223{
224 intel_init_thermal(c);
225 intel_init_cmci();
74} 226}
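
cmci_discover() claims banks with a write-and-read-back handshake on IA32_MCx_CTL2: if CMCI_EN is already set, another CPU owns the (shared) bank; if the bit sticks after writing it, the bank is ours and drops out of the polling set; if it does not stick, the bank has no CMCI support and stays polled. The user-space model below walks the same decision tree with the MSRs replaced by an array; the CMCI_EN bit position and the way sharing is simulated are simplifying assumptions, not taken from this diff:

#include <stdio.h>
#include <stdint.h>

#define CMCI_EN        (1ULL << 30)     /* assumed bit position, illustrative  */
#define CMCI_THRESHOLD 1
#define NBANKS         4

static uint64_t mc_ctl2[NBANKS];        /* stand-in for MSR_IA32_MC0_CTL2 + i  */
static int bank_supports_cmci[NBANKS] = { 1, 1, 0, 1 };

static void wrmsr_model(int i, uint64_t v)
{
    /* a bank without CMCI support silently drops the enable bit */
    mc_ctl2[i] = bank_supports_cmci[i] ? v : (v & ~CMCI_EN);
}

static void discover(const char *cpu)
{
    for (int i = 0; i < NBANKS; i++) {
        uint64_t val = mc_ctl2[i];

        if (val & CMCI_EN) {            /* already owned elsewhere: shared */
            printf("%s: bank %d shared\n", cpu, i);
            continue;
        }
        wrmsr_model(i, val | CMCI_EN | CMCI_THRESHOLD);
        if (mc_ctl2[i] & CMCI_EN)       /* did the enable bit stick?       */
            printf("%s: bank %d claimed via CMCI\n", cpu, i);
        else
            printf("%s: bank %d stays on polling\n", cpu, i);
    }
}

int main(void)
{
    discover("cpu0");
    discover("cpu1");   /* sees cpu0's banks as shared */
    return 0;
}
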
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
deleted file mode 100644
index f2ef6952c400..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ /dev/null
@@ -1,248 +0,0 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
6 */
7
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/percpu.h>
11#include <asm/processor.h>
12#include <asm/apic.h>
13#include <asm/msr.h>
14#include <asm/mce.h>
15#include <asm/hw_irq.h>
16#include <asm/idle.h>
17#include <asm/therm_throt.h>
18
19#include "mce.h"
20
21asmlinkage void smp_thermal_interrupt(void)
22{
23 __u64 msr_val;
24
25 ack_APIC_irq();
26
27 exit_idle();
28 irq_enter();
29
30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
32 mce_log_therm_throt_event(msr_val);
33
34 inc_irq_stat(irq_thermal_count);
35 irq_exit();
36}
37
38/*
39 * Support for Intel Correct Machine Check Interrupts. This allows
40 * the CPU to raise an interrupt when a corrected machine check happened.
41 * Normally we pick those up using a regular polling timer.
42 * Also supports reliable discovery of shared banks.
43 */
44
45static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
46
47/*
48 * cmci_discover_lock protects against parallel discovery attempts
49 * which could race against each other.
50 */
51static DEFINE_SPINLOCK(cmci_discover_lock);
52
53#define CMCI_THRESHOLD 1
54
55static int cmci_supported(int *banks)
56{
57 u64 cap;
58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
62 /*
63 * Vendor check is not strictly needed, but the initial
64 * initialization is vendor keyed and this
65 * makes sure none of the backdoors are entered otherwise.
66 */
67 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
68 return 0;
69 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
70 return 0;
71 rdmsrl(MSR_IA32_MCG_CAP, cap);
72 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
73 return !!(cap & MCG_CMCI_P);
74}
75
76/*
77 * The interrupt handler. This is called on every event.
78 * Just call the poller directly to log any events.
79 * This could in theory increase the threshold under high load,
80 * but doesn't for now.
81 */
82static void intel_threshold_interrupt(void)
83{
84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
85 mce_notify_irq();
86}
87
88static void print_update(char *type, int *hdr, int num)
89{
90 if (*hdr == 0)
91 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
92 *hdr = 1;
93 printk(KERN_CONT " %s:%d", type, num);
94}
95
96/*
97 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
98 * on this CPU. Use the algorithm recommended in the SDM to discover shared
99 * banks.
100 */
101static void cmci_discover(int banks, int boot)
102{
103 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
104 unsigned long flags;
105 int hdr = 0;
106 int i;
107
108 spin_lock_irqsave(&cmci_discover_lock, flags);
109 for (i = 0; i < banks; i++) {
110 u64 val;
111
112 if (test_bit(i, owned))
113 continue;
114
115 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
116
117 /* Already owned by someone else? */
118 if (val & CMCI_EN) {
119 if (test_and_clear_bit(i, owned) || boot)
120 print_update("SHD", &hdr, i);
121 __clear_bit(i, __get_cpu_var(mce_poll_banks));
122 continue;
123 }
124
125 val |= CMCI_EN | CMCI_THRESHOLD;
126 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
127 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
128
129 /* Did the enable bit stick? -- the bank supports CMCI */
130 if (val & CMCI_EN) {
131 if (!test_and_set_bit(i, owned) || boot)
132 print_update("CMCI", &hdr, i);
133 __clear_bit(i, __get_cpu_var(mce_poll_banks));
134 } else {
135 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
136 }
137 }
138 spin_unlock_irqrestore(&cmci_discover_lock, flags);
139 if (hdr)
140 printk(KERN_CONT "\n");
141}
142
143/*
144 * Just in case we missed an event during initialization check
145 * all the CMCI owned banks.
146 */
147void cmci_recheck(void)
148{
149 unsigned long flags;
150 int banks;
151
152 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
153 return;
154 local_irq_save(flags);
155 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
156 local_irq_restore(flags);
157}
158
159/*
160 * Disable CMCI on this CPU for all banks it owns when it goes down.
161 * This allows other CPUs to claim the banks on rediscovery.
162 */
163void cmci_clear(void)
164{
165 unsigned long flags;
166 int i;
167 int banks;
168 u64 val;
169
170 if (!cmci_supported(&banks))
171 return;
172 spin_lock_irqsave(&cmci_discover_lock, flags);
173 for (i = 0; i < banks; i++) {
174 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
175 continue;
176 /* Disable CMCI */
177 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
178 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
179 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
180 __clear_bit(i, __get_cpu_var(mce_banks_owned));
181 }
182 spin_unlock_irqrestore(&cmci_discover_lock, flags);
183}
184
185/*
186 * After a CPU went down cycle through all the others and rediscover
187 * Must run in process context.
188 */
189void cmci_rediscover(int dying)
190{
191 int banks;
192 int cpu;
193 cpumask_var_t old;
194
195 if (!cmci_supported(&banks))
196 return;
197 if (!alloc_cpumask_var(&old, GFP_KERNEL))
198 return;
199 cpumask_copy(old, &current->cpus_allowed);
200
201 for_each_online_cpu(cpu) {
202 if (cpu == dying)
203 continue;
204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
205 continue;
206 /* Recheck banks in case CPUs don't all have the same */
207 if (cmci_supported(&banks))
208 cmci_discover(banks, 0);
209 }
210
211 set_cpus_allowed_ptr(current, old);
212 free_cpumask_var(old);
213}
214
215/*
216 * Reenable CMCI on this CPU in case a CPU down failed.
217 */
218void cmci_reenable(void)
219{
220 int banks;
221 if (cmci_supported(&banks))
222 cmci_discover(banks, 0);
223}
224
225static void intel_init_cmci(void)
226{
227 int banks;
228
229 if (!cmci_supported(&banks))
230 return;
231
232 mce_threshold_vector = intel_threshold_interrupt;
233 cmci_discover(banks, 1);
234 /*
235 * For CPU #0 this runs with still disabled APIC, but that's
236 * ok because only the vector is set up. We still do another
237 * check for the banks later for CPU #0 just to make sure
238 * to not miss any events.
239 */
240 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
241 cmci_recheck();
242}
243
244void mce_intel_feature_init(struct cpuinfo_x86 *c)
245{
246 intel_init_thermal(c);
247 intel_init_cmci();
248}
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 70b710420f74..f5f2d6f71fb6 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -17,10 +17,9 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/mce.h>
20#include <asm/msr.h> 21#include <asm/msr.h>
21 22
22#include "mce.h"
23
24static int firstbank; 23static int firstbank;
25 24
26#define MCE_RATE (15*HZ) /* timer rate is 15s */ 25#define MCE_RATE (15*HZ) /* timer rate is 15s */
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index 82cee108a2d3..4482aea9aa2e 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -1,21 +1,15 @@
1/* 1/*
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4
5#include <linux/interrupt.h>
6#include <linux/kernel.h> 4#include <linux/kernel.h>
7#include <linux/types.h> 5#include <linux/types.h>
8#include <linux/init.h> 6#include <linux/init.h>
9#include <linux/smp.h> 7#include <linux/smp.h>
10 8
11#include <asm/therm_throt.h>
12#include <asm/processor.h> 9#include <asm/processor.h>
13#include <asm/system.h> 10#include <asm/mce.h>
14#include <asm/apic.h>
15#include <asm/msr.h> 11#include <asm/msr.h>
16 12
17#include "mce.h"
18
19/* as supported by the P4/Xeon family */ 13/* as supported by the P4/Xeon family */
20struct intel_mce_extended_msrs { 14struct intel_mce_extended_msrs {
21 u32 eax; 15 u32 eax;
@@ -33,46 +27,6 @@ struct intel_mce_extended_msrs {
33 27
34static int mce_num_extended_msrs; 28static int mce_num_extended_msrs;
35 29
36
37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55}
56
57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
60
61void smp_thermal_interrupt(struct pt_regs *regs)
62{
63 irq_enter();
64 vendor_thermal_interrupt(regs);
65 __get_cpu_var(irq_stat).irq_thermal_count++;
66 irq_exit();
67}
68
69void intel_set_thermal_handler(void)
70{
71 vendor_thermal_interrupt = intel_thermal_interrupt;
72}
73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
75
76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
78{ 32{
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 015f481ab1b0..5c0e6533d9bc 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -10,12 +10,11 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* By default disabled */ 16/* By default disabled */
18int mce_p5_enable; 17int mce_p5_enabled __read_mostly;
19 18
20/* Machine check handler for Pentium class Intel CPUs: */ 19/* Machine check handler for Pentium class Intel CPUs: */
21static void pentium_machine_check(struct pt_regs *regs, long error_code) 20static void pentium_machine_check(struct pt_regs *regs, long error_code)
@@ -43,15 +42,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
43{ 42{
44 u32 l, h; 43 u32 l, h;
45 44
46 /* Check for MCE support: */ 45 /* Default P5 to off as its often misconnected: */
47 if (!cpu_has(c, X86_FEATURE_MCE)) 46 if (!mce_p5_enabled)
48 return; 47 return;
49 48
50#ifdef CONFIG_X86_OLD_MCE 49 /* Check for MCE support: */
51 /* Default P5 to off as its often misconnected: */ 50 if (!cpu_has(c, X86_FEATURE_MCE))
52 if (mce_disabled != -1)
53 return; 51 return;
54#endif
55 52
56 machine_check_vector = pentium_machine_check; 53 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */ 54 /* Make sure the vector pointer is visible before we enable MCEs: */
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 43c24e667457..01e4f8178183 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -10,10 +10,9 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* Machine Check Handler For PII/PIII */ 16/* Machine Check Handler For PII/PIII */
18static void intel_machine_check(struct pt_regs *regs, long error_code) 17static void intel_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 7b1ae2e20ba5..5957a93e5173 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -13,21 +13,32 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16#include <linux/interrupt.h>
16#include <linux/notifier.h> 17#include <linux/notifier.h>
17#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/kernel.h>
18#include <linux/percpu.h> 20#include <linux/percpu.h>
19#include <linux/sysdev.h> 21#include <linux/sysdev.h>
22#include <linux/types.h>
23#include <linux/init.h>
24#include <linux/smp.h>
20#include <linux/cpu.h> 25#include <linux/cpu.h>
21 26
22#include <asm/therm_throt.h> 27#include <asm/processor.h>
28#include <asm/system.h>
29#include <asm/apic.h>
30#include <asm/idle.h>
31#include <asm/mce.h>
32#include <asm/msr.h>
23 33
24/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
25#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
26 36
27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
39static DEFINE_PER_CPU(bool, thermal_throttle_active);
29 40
30atomic_t therm_throt_en = ATOMIC_INIT(0); 41static atomic_t therm_throt_en = ATOMIC_INIT(0);
31 42
32#ifdef CONFIG_SYSFS 43#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 44#define define_therm_throt_sysdev_one_ro(_name) \
@@ -82,31 +93,37 @@ static struct attribute_group thermal_throttle_attr_group = {
82 * 1 : Event should be logged further, and a message has been 93 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog. 94 * printed to the syslog.
84 */ 95 */
85int therm_throt_process(int curr) 96static int therm_throt_process(int curr)
86{ 97{
87 unsigned int cpu = smp_processor_id(); 98 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64(); 99 __u64 tmp_jiffs = get_jiffies_64();
100 bool was_throttled = __get_cpu_var(thermal_throttle_active);
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
89 102
90 if (curr) 103 if (is_throttled)
91 __get_cpu_var(thermal_throttle_count)++; 104 __get_cpu_var(thermal_throttle_count)++;
92 105
93 if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) 106 if (!(was_throttled ^ is_throttled) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check)))
94 return 0; 108 return 0;
95 109
96 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
97 111
98 /* if we just entered the thermal event */ 112 /* if we just entered the thermal event */
99 if (curr) { 113 if (is_throttled) {
100 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 114 printk(KERN_CRIT "CPU%d: Temperature above threshold, "
101 "cpu clock throttled (total events = %lu)\n", cpu, 115 "cpu clock throttled (total events = %lu)\n",
102 __get_cpu_var(thermal_throttle_count)); 116 cpu, __get_cpu_var(thermal_throttle_count));
103 117
104 add_taint(TAINT_MACHINE_CHECK); 118 add_taint(TAINT_MACHINE_CHECK);
105 } else { 119 return 1;
106 printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); 120 }
121 if (was_throttled) {
122 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
123 return 1;
107 } 124 }
108 125
109 return 1; 126 return 0;
110} 127}
111 128
112#ifdef CONFIG_SYSFS 129#ifdef CONFIG_SYSFS
@@ -186,6 +203,94 @@ static __init int thermal_throttle_init_device(void)
186 203
187 return 0; 204 return 0;
188} 205}
189
190device_initcall(thermal_throttle_init_device); 206device_initcall(thermal_throttle_init_device);
207
191#endif /* CONFIG_SYSFS */ 208#endif /* CONFIG_SYSFS */
209
210/* Thermal transition interrupt handler */
211static void intel_thermal_interrupt(void)
212{
213 __u64 msr_val;
214
215 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
216 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
217 mce_log_therm_throt_event(msr_val);
218}
219
220static void unexpected_thermal_interrupt(void)
221{
222 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
223 smp_processor_id());
224 add_taint(TAINT_MACHINE_CHECK);
225}
226
227static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
228
229asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
230{
231 exit_idle();
232 irq_enter();
233 inc_irq_stat(irq_thermal_count);
234 smp_thermal_vector();
235 irq_exit();
236 /* Ack only at the end to avoid potential reentry */
237 ack_APIC_irq();
238}
239
240void intel_init_thermal(struct cpuinfo_x86 *c)
241{
242 unsigned int cpu = smp_processor_id();
243 int tm2 = 0;
244 u32 l, h;
245
246 /* Thermal monitoring depends on ACPI and clock modulation*/
247 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
248 return;
249
250 /*
251 * First check if its enabled already, in which case there might
252 * be some SMM goo which handles it, so we can't even put a handler
253 * since it might be delivered via SMI already:
254 */
255 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
256 h = apic_read(APIC_LVTTHMR);
257 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
258 printk(KERN_DEBUG
259 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
260 return;
261 }
262
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG
269 "CPU%d: Thermal LVT vector (%#x) already installed\n",
270 cpu, (h & APIC_VECTOR_MASK));
271 return;
272 }
273
274 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h);
277
278 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
279 wrmsr(MSR_IA32_THERM_INTERRUPT,
280 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
281
282 smp_thermal_vector = intel_thermal_interrupt;
283
284 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
285 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
286
287 /* Unmask the thermal vector: */
288 l = apic_read(APIC_LVTTHMR);
289 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
290
291 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
292 cpu, tm2 ? "TM2" : "TM1");
293
294 /* enable thermal throttle processing */
295 atomic_set(&therm_throt_en, 1);
296}
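
The thermal interrupt plumbing that was previously split across mce_intel_64.c and p4.c is consolidated here: smp_thermal_interrupt() always dispatches through smp_thermal_vector, which starts out as unexpected_thermal_interrupt() and is only switched to intel_thermal_interrupt() once intel_init_thermal() has programmed and unmasked the LVT entry. A small sketch of that default-then-override handler pointer pattern:

#include <stdio.h>

static void unexpected_thermal(void)
{
    printf("unexpected thermal interrupt\n");
}

static void intel_thermal(void)
{
    printf("reading THERM_STATUS, logging throttle event\n");
}

static void (*smp_thermal_vector)(void) = unexpected_thermal;

static void init_thermal(void)
{
    /* ...program the LVT thermal entry, unmask it... */
    smp_thermal_vector = intel_thermal;
}

int main(void)
{
    smp_thermal_vector();   /* before init: the stub fires  */
    init_thermal();
    smp_thermal_vector();   /* after init: the real handler */
    return 0;
}
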
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 81b02487090b..54060f565974 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -9,10 +9,9 @@
9 9
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/mce.h>
12#include <asm/msr.h> 13#include <asm/msr.h>
13 14
14#include "mce.h"
15
16/* Machine check handler for WinChip C6: */ 15/* Machine check handler for WinChip C6: */
17static void winchip_machine_check(struct pt_regs *regs, long error_code) 16static void winchip_machine_check(struct pt_regs *regs, long error_code)
18{ 17{
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 275bc142cd5d..900332b800f8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -19,6 +19,7 @@
19#include <linux/kdebug.h> 19#include <linux/kdebug.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/highmem.h>
22 23
23#include <asm/apic.h> 24#include <asm/apic.h>
24#include <asm/stacktrace.h> 25#include <asm/stacktrace.h>
@@ -54,6 +55,7 @@ struct x86_pmu {
54 int num_counters_fixed; 55 int num_counters_fixed;
55 int counter_bits; 56 int counter_bits;
56 u64 counter_mask; 57 u64 counter_mask;
58 int apic;
57 u64 max_period; 59 u64 max_period;
58 u64 intel_ctrl; 60 u64 intel_ctrl;
59}; 61};
@@ -65,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
65}; 67};
66 68
67/* 69/*
70 * Not sure about some of these
71 */
72static const u64 p6_perfmon_event_map[] =
73{
74 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
75 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
76 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
77 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
78 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
79 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
80 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
81};
82
83static u64 p6_pmu_event_map(int event)
84{
85 return p6_perfmon_event_map[event];
86}
87
88/*
89 * Counter setting that is specified not to count anything.
90 * We use this to effectively disable a counter.
91 *
92 * L2_RQSTS with 0 MESI unit mask.
93 */
94#define P6_NOP_COUNTER 0x0000002EULL
95
96static u64 p6_pmu_raw_event(u64 event)
97{
98#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
99#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
100#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
101#define P6_EVNTSEL_INV_MASK 0x00800000ULL
102#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
103
104#define P6_EVNTSEL_MASK \
105 (P6_EVNTSEL_EVENT_MASK | \
106 P6_EVNTSEL_UNIT_MASK | \
107 P6_EVNTSEL_EDGE_MASK | \
108 P6_EVNTSEL_INV_MASK | \
109 P6_EVNTSEL_COUNTER_MASK)
110
111 return event & P6_EVNTSEL_MASK;
112}
113
114
115/*
68 * Intel PerfMon v3. Used on Core2 and later. 116 * Intel PerfMon v3. Used on Core2 and later.
69 */ 117 */
70static const u64 intel_perfmon_event_map[] = 118static const u64 intel_perfmon_event_map[] =
@@ -389,23 +437,23 @@ static u64 intel_pmu_raw_event(u64 event)
389 return event & CORE_EVNTSEL_MASK; 437 return event & CORE_EVNTSEL_MASK;
390} 438}
391 439
392static const u64 amd_0f_hw_cache_event_ids 440static const u64 amd_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX] 441 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX] 442 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 443 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{ 444{
397 [ C(L1D) ] = { 445 [ C(L1D) ] = {
398 [ C(OP_READ) ] = { 446 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0, 447 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
400 [ C(RESULT_MISS) ] = 0, 448 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
401 }, 449 },
402 [ C(OP_WRITE) ] = { 450 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0, 451 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
404 [ C(RESULT_MISS) ] = 0, 452 [ C(RESULT_MISS) ] = 0,
405 }, 453 },
406 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0, 455 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
408 [ C(RESULT_MISS) ] = 0, 456 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
409 }, 457 },
410 }, 458 },
411 [ C(L1I ) ] = { 459 [ C(L1I ) ] = {
@@ -418,17 +466,17 @@ static const u64 amd_0f_hw_cache_event_ids
418 [ C(RESULT_MISS) ] = -1, 466 [ C(RESULT_MISS) ] = -1,
419 }, 467 },
420 [ C(OP_PREFETCH) ] = { 468 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0, 469 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
422 [ C(RESULT_MISS) ] = 0, 470 [ C(RESULT_MISS) ] = 0,
423 }, 471 },
424 }, 472 },
425 [ C(LL ) ] = { 473 [ C(LL ) ] = {
426 [ C(OP_READ) ] = { 474 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0, 475 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
428 [ C(RESULT_MISS) ] = 0, 476 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
429 }, 477 },
430 [ C(OP_WRITE) ] = { 478 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0, 479 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
432 [ C(RESULT_MISS) ] = 0, 480 [ C(RESULT_MISS) ] = 0,
433 }, 481 },
434 [ C(OP_PREFETCH) ] = { 482 [ C(OP_PREFETCH) ] = {
@@ -438,8 +486,8 @@ static const u64 amd_0f_hw_cache_event_ids
438 }, 486 },
439 [ C(DTLB) ] = { 487 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = { 488 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0, 489 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
442 [ C(RESULT_MISS) ] = 0, 490 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
443 }, 491 },
444 [ C(OP_WRITE) ] = { 492 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0, 493 [ C(RESULT_ACCESS) ] = 0,
@@ -566,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
566 614
567static bool reserve_pmc_hardware(void) 615static bool reserve_pmc_hardware(void)
568{ 616{
617#ifdef CONFIG_X86_LOCAL_APIC
569 int i; 618 int i;
570 619
571 if (nmi_watchdog == NMI_LOCAL_APIC) 620 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -580,9 +629,11 @@ static bool reserve_pmc_hardware(void)
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 629 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail; 630 goto eventsel_fail;
582 } 631 }
632#endif
583 633
584 return true; 634 return true;
585 635
636#ifdef CONFIG_X86_LOCAL_APIC
586eventsel_fail: 637eventsel_fail:
587 for (i--; i >= 0; i--) 638 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i); 639 release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -597,10 +648,12 @@ perfctr_fail:
597 enable_lapic_nmi_watchdog(); 648 enable_lapic_nmi_watchdog();
598 649
599 return false; 650 return false;
651#endif
600} 652}
601 653
602static void release_pmc_hardware(void) 654static void release_pmc_hardware(void)
603{ 655{
656#ifdef CONFIG_X86_LOCAL_APIC
604 int i; 657 int i;
605 658
606 for (i = 0; i < x86_pmu.num_counters; i++) { 659 for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -610,6 +663,7 @@ static void release_pmc_hardware(void)
610 663
611 if (nmi_watchdog == NMI_LOCAL_APIC) 664 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog(); 665 enable_lapic_nmi_watchdog();
666#endif
613} 667}
614 668
615static void hw_perf_counter_destroy(struct perf_counter *counter) 669static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -665,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
665{ 719{
666 struct perf_counter_attr *attr = &counter->attr; 720 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw; 721 struct hw_perf_counter *hwc = &counter->hw;
722 u64 config;
668 int err; 723 int err;
669 724
670 if (!x86_pmu_initialized()) 725 if (!x86_pmu_initialized())
@@ -700,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
700 hwc->sample_period = x86_pmu.max_period; 755 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period; 756 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period); 757 atomic64_set(&hwc->period_left, hwc->sample_period);
758 } else {
759 /*
760 * If we have a PMU initialized but no APIC
761 * interrupts, we cannot sample hardware
762 * counters (user-space has to fall back and
763 * sample via a hrtimer based software counter):
764 */
765 if (!x86_pmu.apic)
766 return -EOPNOTSUPP;
703 } 767 }
704 768
705 counter->destroy = hw_perf_counter_destroy; 769 counter->destroy = hw_perf_counter_destroy;
@@ -717,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
717 781
718 if (attr->config >= x86_pmu.max_events) 782 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL; 783 return -EINVAL;
784
720 /* 785 /*
721 * The generic map: 786 * The generic map:
722 */ 787 */
723 hwc->config |= x86_pmu.event_map(attr->config); 788 config = x86_pmu.event_map(attr->config);
789
790 if (config == 0)
791 return -ENOENT;
792
793 if (config == -1LL)
794 return -EINVAL;
795
796 hwc->config |= config;
724 797
725 return 0; 798 return 0;
726} 799}
727 800
801static void p6_pmu_disable_all(void)
802{
803 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
804 u64 val;
805
806 if (!cpuc->enabled)
807 return;
808
809 cpuc->enabled = 0;
810 barrier();
811
812 /* p6 only has one enable register */
813 rdmsrl(MSR_P6_EVNTSEL0, val);
814 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
815 wrmsrl(MSR_P6_EVNTSEL0, val);
816}
817
728static void intel_pmu_disable_all(void) 818static void intel_pmu_disable_all(void)
729{ 819{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -766,6 +856,23 @@ void hw_perf_disable(void)
766 return x86_pmu.disable_all(); 856 return x86_pmu.disable_all();
767} 857}
768 858
859static void p6_pmu_enable_all(void)
860{
861 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
862 unsigned long val;
863
864 if (cpuc->enabled)
865 return;
866
867 cpuc->enabled = 1;
868 barrier();
869
870 /* p6 only has one enable register */
871 rdmsrl(MSR_P6_EVNTSEL0, val);
872 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
873 wrmsrl(MSR_P6_EVNTSEL0, val);
874}
875
769static void intel_pmu_enable_all(void) 876static void intel_pmu_enable_all(void)
770{ 877{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -783,13 +890,13 @@ static void amd_pmu_enable_all(void)
783 barrier(); 890 barrier();
784 891
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 892 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
893 struct perf_counter *counter = cpuc->counters[idx];
786 u64 val; 894 u64 val;
787 895
788 if (!test_bit(idx, cpuc->active_mask)) 896 if (!test_bit(idx, cpuc->active_mask))
789 continue; 897 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 898
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) 899 val = counter->hw.config;
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 900 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 901 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 } 902 }
@@ -818,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack)
818 925
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 926static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{ 927{
821 int err; 928 (void)checking_wrmsrl(hwc->config_base + idx,
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 929 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824} 930}
825 931
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 932static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{ 933{
828 int err; 934 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831} 935}
832 936
833static inline void 937static inline void
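
checking_wrmsrl() still returns an error code, but these callers never acted on it; the dead err locals are dropped and the return value is discarded with an explicit (void) cast. A trivial sketch of the idiom, with a stand-in for the MSR write:

#include <stdio.h>

/* stand-in for checking_wrmsrl(): pretend the write may fail */
static int checking_write(unsigned int reg, unsigned long long val)
{
    printf("write %#llx to reg %#x\n", val, reg);
    return 0;
}

int main(void)
{
    /* result intentionally ignored, made explicit by the cast */
    (void)checking_write(0x186, 0x79ULL);
    return 0;
}
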
@@ -835,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{ 939{
836 int idx = __idx - X86_PMC_IDX_FIXED; 940 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask; 941 u64 ctrl_val, mask;
838 int err;
839 942
840 mask = 0xfULL << (idx * 4); 943 mask = 0xfULL << (idx * 4);
841 944
842 rdmsrl(hwc->config_base, ctrl_val); 945 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask; 946 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val); 947 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
948}
949
950static inline void
951p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
952{
953 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
954 u64 val = P6_NOP_COUNTER;
955
956 if (cpuc->enabled)
957 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
958
959 (void)checking_wrmsrl(hwc->config_base + idx, val);
845} 960}
846 961
847static inline void 962static inline void
@@ -911,6 +1026,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
911 err = checking_wrmsrl(hwc->counter_base + idx, 1026 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask); 1027 (u64)(-left) & x86_pmu.counter_mask);
913 1028
1029 perf_counter_update_userpage(counter);
1030
914 return ret; 1031 return ret;
915} 1032}
916 1033
@@ -940,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
940 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1057 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941} 1058}
942 1059
1060static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1061{
1062 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1063 u64 val;
1064
1065 val = hwc->config;
1066 if (cpuc->enabled)
1067 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1068
1069 (void)checking_wrmsrl(hwc->config_base + idx, val);
1070}
1071
1072
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{ 1074{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -956,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
956 1086
957 if (cpuc->enabled) 1087 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx); 1088 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961} 1089}
962 1090
963static int 1091static int
@@ -968,13 +1096,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
968 if (!x86_pmu.num_counters_fixed) 1096 if (!x86_pmu.num_counters_fixed)
969 return -1; 1097 return -1;
970 1098
971 /*
972 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
973 */
974 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
975 boot_cpu_data.x86_model == 28)
976 return -1;
977
978 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1099 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
979 1100
980 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1101 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
@@ -1040,6 +1161,8 @@ try_generic:
1040 x86_perf_counter_set_period(counter, hwc, idx); 1161 x86_perf_counter_set_period(counter, hwc, idx);
1041 x86_pmu.enable(hwc, idx); 1162 x86_pmu.enable(hwc, idx);
1042 1163
1164 perf_counter_update_userpage(counter);
1165
1043 return 0; 1166 return 0;
1044} 1167}
1045 1168
@@ -1132,6 +1255,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
1132 x86_perf_counter_update(counter, hwc, idx); 1255 x86_perf_counter_update(counter, hwc, idx);
1133 cpuc->counters[idx] = NULL; 1256 cpuc->counters[idx] = NULL;
1134 clear_bit(idx, cpuc->used_mask); 1257 clear_bit(idx, cpuc->used_mask);
1258
1259 perf_counter_update_userpage(counter);
1135} 1260}
1136 1261
1137/* 1262/*
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void)
1176 local_irq_restore(flags); 1301 local_irq_restore(flags);
1177} 1302}
1178 1303
1304static int p6_pmu_handle_irq(struct pt_regs *regs)
1305{
1306 struct perf_sample_data data;
1307 struct cpu_hw_counters *cpuc;
1308 struct perf_counter *counter;
1309 struct hw_perf_counter *hwc;
1310 int idx, handled = 0;
1311 u64 val;
1312
1313 data.regs = regs;
1314 data.addr = 0;
1315
1316 cpuc = &__get_cpu_var(cpu_hw_counters);
1317
1318 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1319 if (!test_bit(idx, cpuc->active_mask))
1320 continue;
1321
1322 counter = cpuc->counters[idx];
1323 hwc = &counter->hw;
1324
1325 val = x86_perf_counter_update(counter, hwc, idx);
1326 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1327 continue;
1328
1329 /*
1330 * counter overflow
1331 */
1332 handled = 1;
1333 data.period = counter->hw.last_period;
1334
1335 if (!x86_perf_counter_set_period(counter, hwc, idx))
1336 continue;
1337
1338 if (perf_counter_overflow(counter, 1, &data))
1339 p6_pmu_disable_counter(hwc, idx);
1340 }
1341
1342 if (handled)
1343 inc_irq_stat(apic_perf_irqs);
1344
1345 return handled;
1346}
1179 1347
1180/* 1348/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling 1349 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{ 1353{
1186 struct perf_sample_data data; 1354 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc; 1355 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops; 1356 int bit, loops;
1189 u64 ack, status; 1357 u64 ack, status;
1190 1358
1191 data.regs = regs; 1359 data.regs = regs;
1192 data.addr = 0; 1360 data.addr = 0;
1193 1361
1194 cpu = smp_processor_id(); 1362 cpuc = &__get_cpu_var(cpu_hw_counters);
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196 1363
1197 perf_disable(); 1364 perf_disable();
1198 status = intel_pmu_get_status(); 1365 status = intel_pmu_get_status();
@@ -1223,6 +1390,8 @@ again:
1223 if (!intel_pmu_save_and_restart(counter)) 1390 if (!intel_pmu_save_and_restart(counter))
1224 continue; 1391 continue;
1225 1392
1393 data.period = counter->hw.last_period;
1394
1226 if (perf_counter_overflow(counter, 1, &data)) 1395 if (perf_counter_overflow(counter, 1, &data))
1227 intel_pmu_disable_counter(&counter->hw, bit); 1396 intel_pmu_disable_counter(&counter->hw, bit);
1228 } 1397 }
@@ -1247,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1247 struct cpu_hw_counters *cpuc; 1416 struct cpu_hw_counters *cpuc;
1248 struct perf_counter *counter; 1417 struct perf_counter *counter;
1249 struct hw_perf_counter *hwc; 1418 struct hw_perf_counter *hwc;
1250 int cpu, idx, handled = 0; 1419 int idx, handled = 0;
1251 u64 val; 1420 u64 val;
1252 1421
1253 data.regs = regs; 1422 data.regs = regs;
1254 data.addr = 0; 1423 data.addr = 0;
1255 1424
1256 cpu = smp_processor_id(); 1425 cpuc = &__get_cpu_var(cpu_hw_counters);
1257 cpuc = &per_cpu(cpu_hw_counters, cpu);
1258 1426
1259 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1427 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1260 if (!test_bit(idx, cpuc->active_mask)) 1428 if (!test_bit(idx, cpuc->active_mask))
@@ -1297,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1297 1465
1298void set_perf_counter_pending(void) 1466void set_perf_counter_pending(void)
1299{ 1467{
1468#ifdef CONFIG_X86_LOCAL_APIC
1300 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1469 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1470#endif
1301} 1471}
1302 1472
1303void perf_counters_lapic_init(void) 1473void perf_counters_lapic_init(void)
1304{ 1474{
1305 if (!x86_pmu_initialized()) 1475#ifdef CONFIG_X86_LOCAL_APIC
1476 if (!x86_pmu.apic || !x86_pmu_initialized())
1306 return; 1477 return;
1307 1478
1308 /* 1479 /*
1309 * Always use NMI for PMU 1480 * Always use NMI for PMU
1310 */ 1481 */
1311 apic_write(APIC_LVTPC, APIC_DM_NMI); 1482 apic_write(APIC_LVTPC, APIC_DM_NMI);
1483#endif
1312} 1484}
1313 1485
1314static int __kprobes 1486static int __kprobes
@@ -1332,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
1332 1504
1333 regs = args->regs; 1505 regs = args->regs;
1334 1506
1507#ifdef CONFIG_X86_LOCAL_APIC
1335 apic_write(APIC_LVTPC, APIC_DM_NMI); 1508 apic_write(APIC_LVTPC, APIC_DM_NMI);
1509#endif
1336 /* 1510 /*
1337 * Can't rely on the handled return value to say it was our NMI, two 1511 * Can't rely on the handled return value to say it was our NMI, two
1338 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1512 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1351,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1351 .priority = 1 1525 .priority = 1
1352}; 1526};
1353 1527
1528static struct x86_pmu p6_pmu = {
1529 .name = "p6",
1530 .handle_irq = p6_pmu_handle_irq,
1531 .disable_all = p6_pmu_disable_all,
1532 .enable_all = p6_pmu_enable_all,
1533 .enable = p6_pmu_enable_counter,
1534 .disable = p6_pmu_disable_counter,
1535 .eventsel = MSR_P6_EVNTSEL0,
1536 .perfctr = MSR_P6_PERFCTR0,
1537 .event_map = p6_pmu_event_map,
1538 .raw_event = p6_pmu_raw_event,
1539 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1540 .apic = 1,
1541 .max_period = (1ULL << 31) - 1,
1542 .version = 0,
1543 .num_counters = 2,
1544 /*
1545 * Counters have 40 bits implemented. However they are designed such
1546 * that bits [32-39] are sign extensions of bit 31. As such the
1547 * effective width of a counter for P6-like PMU is 32 bits only.
1548 *
1549 * See IA-32 Intel Architecture Software developer manual Vol 3B
1550 */
1551 .counter_bits = 32,
1552 .counter_mask = (1ULL << 32) - 1,
1553};
1554
1354static struct x86_pmu intel_pmu = { 1555static struct x86_pmu intel_pmu = {
1355 .name = "Intel", 1556 .name = "Intel",
1356 .handle_irq = intel_pmu_handle_irq, 1557 .handle_irq = intel_pmu_handle_irq,
@@ -1363,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
1363 .event_map = intel_pmu_event_map, 1564 .event_map = intel_pmu_event_map,
1364 .raw_event = intel_pmu_raw_event, 1565 .raw_event = intel_pmu_raw_event,
1365 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 1566 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1567 .apic = 1,
1366 /* 1568 /*
1367 * Intel PMCs cannot be accessed sanely above 32 bit width, 1569 * Intel PMCs cannot be accessed sanely above 32 bit width,
1368 * so we install an artificial 1<<31 period regardless of 1570 * so we install an artificial 1<<31 period regardless of
@@ -1386,10 +1588,43 @@ static struct x86_pmu amd_pmu = {
1386 .num_counters = 4, 1588 .num_counters = 4,
1387 .counter_bits = 48, 1589 .counter_bits = 48,
1388 .counter_mask = (1ULL << 48) - 1, 1590 .counter_mask = (1ULL << 48) - 1,
1591 .apic = 1,
1389 /* use highest bit to detect overflow */ 1592 /* use highest bit to detect overflow */
1390 .max_period = (1ULL << 47) - 1, 1593 .max_period = (1ULL << 47) - 1,
1391}; 1594};
1392 1595
1596static int p6_pmu_init(void)
1597{
1598 switch (boot_cpu_data.x86_model) {
1599 case 1:
1600 case 3: /* Pentium Pro */
1601 case 5:
1602 case 6: /* Pentium II */
1603 case 7:
1604 case 8:
1605 case 11: /* Pentium III */
1606 break;
1607 case 9:
1608 case 13:
1609 /* Pentium M */
1610 break;
1611 default:
1612 pr_cont("unsupported p6 CPU model %d ",
1613 boot_cpu_data.x86_model);
1614 return -ENODEV;
1615 }
1616
1617 x86_pmu = p6_pmu;
1618
1619 if (!cpu_has_apic) {
1620 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1621 pr_info("no hardware sampling interrupt available.\n");
1622 x86_pmu.apic = 0;
1623 }
1624
1625 return 0;
1626}
1627
1393static int intel_pmu_init(void) 1628static int intel_pmu_init(void)
1394{ 1629{
1395 union cpuid10_edx edx; 1630 union cpuid10_edx edx;
@@ -1398,8 +1633,14 @@ static int intel_pmu_init(void)
1398 unsigned int ebx; 1633 unsigned int ebx;
1399 int version; 1634 int version;
1400 1635
1401 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 1636 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1637 /* check for P6 processor family */
1638 if (boot_cpu_data.x86 == 6) {
1639 return p6_pmu_init();
1640 } else {
1402 return -ENODEV; 1641 return -ENODEV;
1642 }
1643 }
1403 1644
1404 /* 1645 /*
1405 * Check whether the Architectural PerfMon supports 1646 * Check whether the Architectural PerfMon supports
@@ -1425,8 +1666,6 @@ static int intel_pmu_init(void)
1425 */ 1666 */
1426 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 1667 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1427 1668
1428 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1429
1430 /* 1669 /*
1431 * Install the hw-cache-events table: 1670 * Install the hw-cache-events table:
1432 */ 1671 */
@@ -1459,18 +1698,16 @@ static int intel_pmu_init(void)
1459 1698
1460static int amd_pmu_init(void) 1699static int amd_pmu_init(void)
1461{ 1700{
1701 /* Performance-monitoring supported from K7 and later: */
1702 if (boot_cpu_data.x86 < 6)
1703 return -ENODEV;
1704
1462 x86_pmu = amd_pmu; 1705 x86_pmu = amd_pmu;
1463 1706
1464 switch (boot_cpu_data.x86) { 1707 /* Events are common for all AMDs */
1465 case 0x0f: 1708 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
1466 case 0x10: 1709 sizeof(hw_cache_event_ids));
1467 case 0x11:
1468 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1469 sizeof(hw_cache_event_ids));
1470 1710
1471 pr_cont("AMD Family 0f/10/11 events, ");
1472 break;
1473 }
1474 return 0; 1711 return 0;
1475} 1712}
1476 1713
@@ -1498,21 +1735,22 @@ void __init init_hw_perf_counters(void)
1498 pr_cont("%s PMU driver.\n", x86_pmu.name); 1735 pr_cont("%s PMU driver.\n", x86_pmu.name);
1499 1736
1500 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1737 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1501 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1502 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 1738 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1503 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 1739 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1740 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1504 } 1741 }
1505 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 1742 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1506 perf_max_counters = x86_pmu.num_counters; 1743 perf_max_counters = x86_pmu.num_counters;
1507 1744
1508 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1745 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1509 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1510 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 1746 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1511 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 1747 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1748 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1512 } 1749 }
1513 1750
1514 perf_counter_mask |= 1751 perf_counter_mask |=
1515 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 1752 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1753 x86_pmu.intel_ctrl = perf_counter_mask;
1516 1754
1517 perf_counters_lapic_init(); 1755 perf_counters_lapic_init();
1518 register_die_notifier(&perf_counter_nmi_notifier); 1756 register_die_notifier(&perf_counter_nmi_notifier);
@@ -1554,14 +1792,15 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1554 */ 1792 */
1555 1793
1556static inline 1794static inline
1557void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) 1795void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1558{ 1796{
1559 if (entry->nr < MAX_STACK_DEPTH) 1797 if (entry->nr < PERF_MAX_STACK_DEPTH)
1560 entry->ip[entry->nr++] = ip; 1798 entry->ip[entry->nr++] = ip;
1561} 1799}
1562 1800
1563static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1564static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame);
1565 1804
1566 1805
1567static void 1806static void
@@ -1577,14 +1816,19 @@ static void backtrace_warning(void *data, char *msg)
1577 1816
1578static int backtrace_stack(void *data, char *name) 1817static int backtrace_stack(void *data, char *name)
1579{ 1818{
1580 /* Don't bother with IRQ stacks for now */ 1819 per_cpu(in_nmi_frame, smp_processor_id()) =
1581 return -1; 1820 x86_is_stack_id(NMI_STACK, name);
1821
1822 return 0;
1582} 1823}
1583 1824
1584static void backtrace_address(void *data, unsigned long addr, int reliable) 1825static void backtrace_address(void *data, unsigned long addr, int reliable)
1585{ 1826{
1586 struct perf_callchain_entry *entry = data; 1827 struct perf_callchain_entry *entry = data;
1587 1828
1829 if (per_cpu(in_nmi_frame, smp_processor_id()))
1830 return;
1831
1588 if (reliable) 1832 if (reliable)
1589 callchain_store(entry, addr); 1833 callchain_store(entry, addr);
1590} 1834}
@@ -1596,47 +1840,59 @@ static const struct stacktrace_ops backtrace_ops = {
1596 .address = backtrace_address, 1840 .address = backtrace_address,
1597}; 1841};
1598 1842
1843#include "../dumpstack.h"
1844
1599static void 1845static void
1600perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1846perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1601{ 1847{
1602 unsigned long bp; 1848 callchain_store(entry, PERF_CONTEXT_KERNEL);
1603 char *stack; 1849 callchain_store(entry, regs->ip);
1604 int nr = entry->nr;
1605 1850
1606 callchain_store(entry, instruction_pointer(regs)); 1851 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1852}
1607 1853
1608 stack = ((char *)regs + sizeof(struct pt_regs)); 1854/*
1609#ifdef CONFIG_FRAME_POINTER 1855 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1610 bp = frame_pointer(regs); 1856 */
1611#else 1857static unsigned long
1612 bp = 0; 1858copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1613#endif 1859{
1860 unsigned long offset, addr = (unsigned long)from;
1861 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1862 unsigned long size, len = 0;
1863 struct page *page;
1864 void *map;
1865 int ret;
1614 1866
1615 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); 1867 do {
1868 ret = __get_user_pages_fast(addr, 1, 0, &page);
1869 if (!ret)
1870 break;
1616 1871
1617 entry->kernel = entry->nr - nr; 1872 offset = addr & (PAGE_SIZE - 1);
1618} 1873 size = min(PAGE_SIZE - offset, n - len);
1619 1874
1875 map = kmap_atomic(page, type);
1876 memcpy(to, map+offset, size);
1877 kunmap_atomic(map, type);
1878 put_page(page);
1620 1879
1621struct stack_frame { 1880 len += size;
1622 const void __user *next_fp; 1881 to += size;
1623 unsigned long return_address; 1882 addr += size;
1624}; 1883
1884 } while (len < n);
1885
1886 return len;
1887}
1625 1888
1626static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1889static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1627{ 1890{
1628 int ret; 1891 unsigned long bytes;
1629
1630 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1631 return 0;
1632 1892
1633 ret = 1; 1893 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
1634 pagefault_disable();
1635 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1636 ret = 0;
1637 pagefault_enable();
1638 1894
1639 return ret; 1895 return bytes == sizeof(*frame);
1640} 1896}
1641 1897
1642static void 1898static void
@@ -1644,28 +1900,28 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1644{ 1900{
1645 struct stack_frame frame; 1901 struct stack_frame frame;
1646 const void __user *fp; 1902 const void __user *fp;
1647 int nr = entry->nr;
1648 1903
1649 regs = (struct pt_regs *)current->thread.sp0 - 1; 1904 if (!user_mode(regs))
1650 fp = (void __user *)regs->bp; 1905 regs = task_pt_regs(current);
1906
1907 fp = (void __user *)regs->bp;
1651 1908
1909 callchain_store(entry, PERF_CONTEXT_USER);
1652 callchain_store(entry, regs->ip); 1910 callchain_store(entry, regs->ip);
1653 1911
1654 while (entry->nr < MAX_STACK_DEPTH) { 1912 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1655 frame.next_fp = NULL; 1913 frame.next_frame = NULL;
1656 frame.return_address = 0; 1914 frame.return_address = 0;
1657 1915
1658 if (!copy_stack_frame(fp, &frame)) 1916 if (!copy_stack_frame(fp, &frame))
1659 break; 1917 break;
1660 1918
1661 if ((unsigned long)fp < user_stack_pointer(regs)) 1919 if ((unsigned long)fp < regs->sp)
1662 break; 1920 break;
1663 1921
1664 callchain_store(entry, frame.return_address); 1922 callchain_store(entry, frame.return_address);
1665 fp = frame.next_fp; 1923 fp = frame.next_frame;
1666 } 1924 }
1667
1668 entry->user = entry->nr - nr;
1669} 1925}
1670 1926
1671static void 1927static void
@@ -1701,9 +1957,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1701 entry = &__get_cpu_var(irq_entry); 1957 entry = &__get_cpu_var(irq_entry);
1702 1958
1703 entry->nr = 0; 1959 entry->nr = 0;
1704 entry->hv = 0;
1705 entry->kernel = 0;
1706 entry->user = 0;
1707 1960
1708 perf_do_callchain(regs, entry); 1961 perf_do_callchain(regs, entry);
1709 1962
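
The copy_from_user_nmi() helper added above has to split the copy at page boundaries, because in NMI context it can only map one user page at a time with kmap_atomic(). The chunking arithmetic is the subtle part; below is a minimal stand-alone C sketch of just that loop. chunk_copy() and the fixed 4 KiB PAGE_SIZE are illustrative assumptions, and a plain memcpy() stands in for the kmap/copy/kunmap sequence.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the illustration */

/*
 * Copy 'n' bytes starting at 'from' into 'to', never letting a single
 * memcpy() cross a PAGE_SIZE boundary -- the same splitting rule the
 * NMI-safe copy in perf_counter.c applies around kmap_atomic().
 */
static unsigned long chunk_copy(void *to, const void *from, unsigned long n)
{
	unsigned long addr = (unsigned long)from;
	unsigned long len = 0;

	while (len < n) {
		unsigned long offset = addr & (PAGE_SIZE - 1);
		unsigned long size = PAGE_SIZE - offset;

		if (size > n - len)
			size = n - len;

		/* in the kernel this memcpy() runs on a kmap_atomic() mapping */
		memcpy((char *)to + len, (const char *)addr, size);

		len  += size;
		addr += size;
	}
	return len;
}

int main(void)
{
	static char src[3 * PAGE_SIZE], dst[3 * PAGE_SIZE];

	memset(src, 0x5a, sizeof(src));

	/* copy a range that will usually straddle a page boundary */
	unsigned long copied = chunk_copy(dst, src + PAGE_SIZE - 100, 300);

	printf("copied %lu bytes, first byte %#x\n", copied, dst[0]);
	return 0;
}
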
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d6f5b9fbde32..e60ed740d2b3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)
716 wd_ops = &k7_wd_ops; 716 wd_ops = &k7_wd_ops;
717 break; 717 break;
718 case X86_VENDOR_INTEL: 718 case X86_VENDOR_INTEL:
719 /* 719 /* Work around where perfctr1 doesn't have a working enable
720 * Work around Core Duo (Yonah) errata AE49 where perfctr1 720 * bit as described in the following errata:
721 * doesn't have a working enable bit. 721 * AE49 Core Duo and Intel Core Solo 65 nm
722 * AN49 Intel Pentium Dual-Core
723 * AF49 Dual-Core Intel Xeon Processor LV
722 */ 724 */
723 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { 725 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
726 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
727 boot_cpu_data.x86_mask == 4))) {
724 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; 728 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
725 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; 729 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
726 } 730 }
@@ -799,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
799 wd_ops->rearm(wd, nmi_hz); 803 wd_ops->rearm(wd, nmi_hz);
800 return 1; 804 return 1;
801} 805}
802
803int lapic_watchdog_ok(void)
804{
805 return wd_ops != NULL;
806}
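
The widened errata check above reduces to a family/model/stepping predicate. A minimal restatement of that condition, with an invented perfctr1_enable_broken() helper and a local struct standing in for boot_cpu_data, might look like this:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical container for the three CPUID fields the quirk looks at */
struct cpu_id {
	unsigned family;	/* boot_cpu_data.x86       */
	unsigned model;		/* boot_cpu_data.x86_model */
	unsigned stepping;	/* boot_cpu_data.x86_mask  */
};

/*
 * Matches the errata list in probe_nmi_watchdog(): Yonah (6/14, any
 * stepping) plus the 6/15 stepping-4 parts added by this hunk.
 */
static bool perfctr1_enable_broken(const struct cpu_id *c)
{
	if (c->family != 6)
		return false;
	if (c->model == 14)
		return true;
	return c->model == 15 && c->stepping == 4;
}

int main(void)
{
	struct cpu_id yonah = { 6, 14, 8 }, merom = { 6, 15, 4 }, other = { 6, 15, 6 };

	printf("%d %d %d\n", perfctr1_enable_broken(&yonah),
	       perfctr1_enable_broken(&merom), perfctr1_enable_broken(&other));
	return 0;
}
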
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb3..b07af8861244 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev)
186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188}
189
185static int __init cpuid_init(void) 190static int __init cpuid_init(void)
186{ 191{
187 int i, err = 0; 192 int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
198 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
199 goto out_chrdev; 204 goto out_chrdev;
200 } 205 }
206 cpuid_class->nodename = cpuid_nodename;
201 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
202 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
203 if (err != 0) 209 if (err != 0)
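
The new cpuid_nodename() callback only formats the devtmpfs path for a given minor number. A userspace sketch of the same formatting, using asprintf() in place of kasprintf(), for illustration only:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

/* builds the "cpu/<minor>/cpuid" name the driver now reports */
static char *cpuid_node_name(unsigned int minor)
{
	char *name = NULL;

	if (asprintf(&name, "cpu/%u/cpuid", minor) < 0)
		return NULL;
	return name;
}

int main(void)
{
	char *name = cpuid_node_name(3);

	if (name) {
		puts(name);		/* prints: cpu/3/cpuid */
		free(name);
	}
	return 0;
}
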
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff958248e61d..5e409dc298a4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,6 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h>
30 31
31 32
32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
103#ifdef CONFIG_HPET_TIMER 104#ifdef CONFIG_HPET_TIMER
104 hpet_disable(); 105 hpet_disable();
105#endif 106#endif
107
108#ifdef CONFIG_X86_64
109 pci_iommu_shutdown();
110#endif
111
106 crash_save_cpu(regs, safe_smp_processor_id()); 112 crash_save_cpu(regs, safe_smp_processor_id());
107} 113}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 95ea5fa7d444..c8405718a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,6 +22,7 @@
22#include "dumpstack.h" 22#include "dumpstack.h"
23 23
24int panic_on_unrecovered_nmi; 24int panic_on_unrecovered_nmi;
25int panic_on_io_nmi;
25unsigned int code_bytes = 64; 26unsigned int code_bytes = 64;
26int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; 27int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
27static int die_counter; 28static int die_counter;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58dc..bca5fba91c9e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22/* Just a stub for now */
23int x86_is_stack_id(int id, char *name)
24{
25 return 0;
26}
27
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 28void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 29 unsigned long *stack, unsigned long bp,
24 const struct stacktrace_ops *ops, void *data) 30 const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd6..54b0a3276766 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22
23 unsigned *usedp, char **idp) 23static char x86_stack_ids[][8] = {
24{
25 static char ids[][8] = {
26 [DEBUG_STACK - 1] = "#DB", 24 [DEBUG_STACK - 1] = "#DB",
27 [NMI_STACK - 1] = "NMI", 25 [NMI_STACK - 1] = "NMI",
28 [DOUBLEFAULT_STACK - 1] = "#DF", 26 [DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
33 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 31 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
34#endif 32#endif
35 }; 33 };
34
35int x86_is_stack_id(int id, char *name)
36{
37 return x86_stack_ids[id - 1] == name;
38}
39
40static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
41 unsigned *usedp, char **idp)
42{
36 unsigned k; 43 unsigned k;
37 44
38 /* 45 /*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
61 if (*usedp & (1U << k)) 68 if (*usedp & (1U << k))
62 break; 69 break;
63 *usedp |= 1U << k; 70 *usedp |= 1U << k;
64 *idp = ids[k]; 71 *idp = x86_stack_ids[k];
65 return (unsigned long *)end; 72 return (unsigned long *)end;
66 } 73 }
67 /* 74 /*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
81 do { 88 do {
82 ++j; 89 ++j;
83 end -= EXCEPTION_STKSZ; 90 end -= EXCEPTION_STKSZ;
84 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 91 x86_stack_ids[j][4] = '1' +
92 (j - N_EXCEPTION_STACKS);
85 } while (stack < end - EXCEPTION_STKSZ); 93 } while (stack < end - EXCEPTION_STKSZ);
86 if (*usedp & (1U << j)) 94 if (*usedp & (1U << j))
87 break; 95 break;
88 *usedp |= 1U << j; 96 *usedp |= 1U << j;
89 *idp = ids[j]; 97 *idp = x86_stack_ids[j];
90 return (unsigned long *)end; 98 return (unsigned long *)end;
91 } 99 }
92#endif 100#endif
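
The new x86_is_stack_id() helper answers "is this named stack the NMI stack?" by comparing pointers, which works here because dump_trace() hands callbacks the very strings stored in x86_stack_ids[]. A toy stand-alone version follows; the table contents and stack index values are simplified placeholders, not the kernel's real ones.

#include <stdio.h>

/* simplified stand-in for the per-stack names in dumpstack_64.c */
static const char *stack_ids[] = { "#DB", "NMI", "#DF", "#SS", "#MC" };

enum { DEBUG_STACK = 1, NMI_STACK, DOUBLEFAULT_STACK, STACKFAULT_STACK, MCE_STACK };

/*
 * Like the new x86_is_stack_id(): the callback is handed back the very
 * string object from the table, so comparing pointers is enough -- no
 * strcmp() needed.
 */
static int is_stack_id(int id, const char *name)
{
	return stack_ids[id - 1] == name;
}

int main(void)
{
	const char *current = stack_ids[NMI_STACK - 1]; /* what dump_trace would pass */

	printf("on NMI stack: %d\n", is_stack_id(NMI_STACK, current));
	printf("on #DB stack: %d\n", is_stack_id(DEBUG_STACK, current));
	return 0;
}
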
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7271fa33d791..5cb5725b2bae 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void)
627#ifdef CONFIG_X86_64 627#ifdef CONFIG_X86_64
628 if (!found) { 628 if (!found) {
629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
630 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " 630 printk(KERN_ERR
631 "address range\n" 631 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
632 KERN_ERR "PCI: Unassigned devices with 32bit resource " 632 "PCI: Unassigned devices with 32bit resource registers may break!\n");
633 "registers may break!\n");
634 } 633 }
635#endif 634#endif
636 635
@@ -1383,6 +1382,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1383 return 32*1024*1024; 1382 return 32*1024*1024;
1384} 1383}
1385 1384
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1386
1386void __init e820_reserve_resources_late(void) 1387void __init e820_reserve_resources_late(void)
1387{ 1388{
1388 int i; 1389 int i;
@@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void)
1400 * avoid stolen RAM: 1401 * avoid stolen RAM:
1401 */ 1402 */
1402 for (i = 0; i < e820.nr_map; i++) { 1403 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i]; 1404 struct e820entry *entry = &e820.map[i];
1404 resource_size_t start, end; 1405 u64 start, end;
1405 1406
1406 if (entry->type != E820_RAM) 1407 if (entry->type != E820_RAM)
1407 continue; 1408 continue;
1408 start = entry->addr + entry->size; 1409 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start)); 1410 end = round_up(start, ram_alignment(start)) - 1;
1410 if (start == end) 1411 if (end > MAX_RESOURCE_SIZE)
1412 end = MAX_RESOURCE_SIZE;
1413 if (start >= end)
1411 continue; 1414 continue;
1412 reserve_region_with_split(&iomem_resource, start, 1415 reserve_region_with_split(&iomem_resource, start, end,
1413 end - 1, "RAM buffer"); 1416 "RAM buffer");
1414 } 1417 }
1415} 1418}
1416 1419
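
The reworked loop in e820_reserve_resources_late() rounds the end of each RAM entry up to an alignment that depends on where it sits, clamps it to MAX_RESOURCE_SIZE, and skips empty ranges. Below is a small arithmetic sketch of that calculation; only the 32 MB top tier of ram_alignment() is visible in the hunk above, so the lower tiers and the sample address are assumptions for the example.

#include <stdio.h>
#include <stdint.h>

#define MAX_RESOURCE_SIZE ((uint64_t)-1)

/* round_up() as the kernel uses it: align must be a power of two */
static uint64_t round_up(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

/* tiered alignment in the spirit of ram_alignment(); lower tiers assumed */
static uint64_t ram_alignment(uint64_t pos)
{
	uint64_t mb = pos >> 20;

	if (!mb)
		return 64 * 1024;	/* first megabyte: 64K alignment */
	if (mb < 16)
		return 1024 * 1024;	/* first 16MB: 1MB alignment    */
	return 32 * 1024 * 1024;	/* above that: 32MB alignment   */
}

int main(void)
{
	/* end of an E820_RAM entry, i.e. addr + size (example value) */
	uint64_t start = 0x7f6d3000;
	uint64_t end = round_up(start, ram_alignment(start)) - 1;

	if (end > MAX_RESOURCE_SIZE)
		end = MAX_RESOURCE_SIZE;

	if (start >= end)
		printf("nothing to reserve\n");
	else
		printf("RAM buffer: %#llx-%#llx\n",
		       (unsigned long long)start, (unsigned long long)end);
	return 0;
}
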
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1736acc4d7aa..fe26ba3e3451 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void)
240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; 240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
241 int e820_type; 241 int e820_type;
242 242
243 if (md->attribute & EFI_MEMORY_WB) 243 switch (md->type) {
244 e820_type = E820_RAM; 244 case EFI_LOADER_CODE:
245 else 245 case EFI_LOADER_DATA:
246 case EFI_BOOT_SERVICES_CODE:
247 case EFI_BOOT_SERVICES_DATA:
248 case EFI_CONVENTIONAL_MEMORY:
249 if (md->attribute & EFI_MEMORY_WB)
250 e820_type = E820_RAM;
251 else
252 e820_type = E820_RESERVED;
253 break;
254 case EFI_ACPI_RECLAIM_MEMORY:
255 e820_type = E820_ACPI;
256 break;
257 case EFI_ACPI_MEMORY_NVS:
258 e820_type = E820_NVS;
259 break;
260 case EFI_UNUSABLE_MEMORY:
261 e820_type = E820_UNUSABLE;
262 break;
263 default:
264 /*
265 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
266 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
267 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
268 */
246 e820_type = E820_RESERVED; 269 e820_type = E820_RESERVED;
270 break;
271 }
247 e820_add_region(start, size, e820_type); 272 e820_add_region(start, size, e820_type);
248 } 273 }
249 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 274 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
@@ -329,7 +354,7 @@ void __init efi_init(void)
329 */ 354 */
330 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); 355 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
331 if (c16) { 356 if (c16) {
332 for (i = 0; i < sizeof(vendor) && *c16; ++i) 357 for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
333 vendor[i] = *c16++; 358 vendor[i] = *c16++;
334 vendor[i] = '\0'; 359 vendor[i] = '\0';
335 } else 360 } else
@@ -487,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
487 && end_pfn <= max_pfn_mapped)) 512 && end_pfn <= max_pfn_mapped))
488 va = __va(md->phys_addr); 513 va = __va(md->phys_addr);
489 else 514 else
490 va = efi_ioremap(md->phys_addr, size); 515 va = efi_ioremap(md->phys_addr, size, md->type);
491 516
492 md->virt_addr = (u64) (unsigned long) va; 517 md->virt_addr = (u64) (unsigned long) va;
493 518
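
The switch added to do_add_efi_memmap() is essentially a type-mapping table from EFI memory descriptors to e820 types. The sketch below mirrors that decision tree in isolation; the enum and attribute constants are illustrative placeholders rather than the real UEFI/e820 values.

#include <stdio.h>

/* illustrative subset of the EFI and e820 type values used by the hunk */
enum efi_type { EFI_LOADER_CODE, EFI_LOADER_DATA, EFI_BOOT_SERVICES_CODE,
		EFI_BOOT_SERVICES_DATA, EFI_CONVENTIONAL_MEMORY,
		EFI_ACPI_RECLAIM_MEMORY, EFI_ACPI_MEMORY_NVS,
		EFI_UNUSABLE_MEMORY, EFI_MEMORY_MAPPED_IO };
enum e820_type { E820_RAM = 1, E820_RESERVED, E820_ACPI, E820_NVS, E820_UNUSABLE };

#define EFI_MEMORY_WB 0x8ULL	/* write-back cacheability attribute (placeholder) */

/* same decision tree as the new do_add_efi_memmap() switch */
static enum e820_type efi_to_e820(enum efi_type type, unsigned long long attr)
{
	switch (type) {
	case EFI_LOADER_CODE:
	case EFI_LOADER_DATA:
	case EFI_BOOT_SERVICES_CODE:
	case EFI_BOOT_SERVICES_DATA:
	case EFI_CONVENTIONAL_MEMORY:
		return (attr & EFI_MEMORY_WB) ? E820_RAM : E820_RESERVED;
	case EFI_ACPI_RECLAIM_MEMORY:
		return E820_ACPI;
	case EFI_ACPI_MEMORY_NVS:
		return E820_NVS;
	case EFI_UNUSABLE_MEMORY:
		return E820_UNUSABLE;
	default:
		/* runtime services, MMIO, PAL code, ... */
		return E820_RESERVED;
	}
}

int main(void)
{
	printf("conventional+WB -> %d (E820_RAM)\n",
	       efi_to_e820(EFI_CONVENTIONAL_MEMORY, EFI_MEMORY_WB));
	printf("ACPI NVS        -> %d (E820_NVS)\n",
	       efi_to_e820(EFI_ACPI_MEMORY_NVS, 0));
	return 0;
}
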
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
98 early_runtime_code_mapping_set_exec(0); 98 early_runtime_code_mapping_set_exec(0);
99} 99}
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
102{ 103{
103 unsigned long last_map_pfn; 104 unsigned long last_map_pfn;
104 105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
107 return NULL; 111 return NULL;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add475c9..c097e7d607c6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
48#include <asm/segment.h> 48#include <asm/segment.h>
49#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/page_types.h> 50#include <asm/page_types.h>
51#include <asm/desc.h>
52#include <asm/percpu.h> 51#include <asm/percpu.h>
53#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
54#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
@@ -84,7 +83,7 @@
84#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 83#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
85#else 84#else
86#define preempt_stop(clobbers) 85#define preempt_stop(clobbers)
87#define resume_kernel restore_nocheck 86#define resume_kernel restore_all
88#endif 87#endif
89 88
90.macro TRACE_IRQS_IRET 89.macro TRACE_IRQS_IRET
@@ -372,7 +371,7 @@ END(ret_from_exception)
372ENTRY(resume_kernel) 371ENTRY(resume_kernel)
373 DISABLE_INTERRUPTS(CLBR_ANY) 372 DISABLE_INTERRUPTS(CLBR_ANY)
374 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 373 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
375 jnz restore_nocheck 374 jnz restore_all
376need_resched: 375need_resched:
377 movl TI_flags(%ebp), %ecx # need_resched set ? 376 movl TI_flags(%ebp), %ecx # need_resched set ?
378 testb $_TIF_NEED_RESCHED, %cl 377 testb $_TIF_NEED_RESCHED, %cl
@@ -540,6 +539,8 @@ syscall_exit:
540 jne syscall_exit_work 539 jne syscall_exit_work
541 540
542restore_all: 541restore_all:
542 TRACE_IRQS_IRET
543restore_all_notrace:
543 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 544 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
544 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 545 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
545 # are returning to the kernel. 546 # are returning to the kernel.
@@ -551,8 +552,6 @@ restore_all:
551 CFI_REMEMBER_STATE 552 CFI_REMEMBER_STATE
552 je ldt_ss # returning to user-space with LDT SS 553 je ldt_ss # returning to user-space with LDT SS
553restore_nocheck: 554restore_nocheck:
554 TRACE_IRQS_IRET
555restore_nocheck_notrace:
556 RESTORE_REGS 4 # skip orig_eax/error_code 555 RESTORE_REGS 4 # skip orig_eax/error_code
557 CFI_ADJUST_CFA_OFFSET -4 556 CFI_ADJUST_CFA_OFFSET -4
558irq_return: 557irq_return:
@@ -588,22 +587,34 @@ ldt_ss:
588 jne restore_nocheck 587 jne restore_nocheck
589#endif 588#endif
590 589
591 /* If returning to userspace with 16bit stack, 590/*
592 * try to fix the higher word of ESP, as the CPU 591 * Setup and switch to ESPFIX stack
593 * won't restore it. 592 *
594 * This is an "official" bug of all the x86-compatible 593 * We're returning to userspace with a 16 bit stack. The CPU will not
595 * CPUs, which we can try to work around to make 594 * restore the high word of ESP for us on executing iret... This is an
596 * dosemu and wine happy. */ 595 * "official" bug of all the x86-compatible CPUs, which we can work
597 movl PT_OLDESP(%esp), %eax 596 * around to make dosemu and wine happy. We do this by preloading the
598 movl %esp, %edx 597 * high word of ESP with the high word of the userspace ESP while
599 call patch_espfix_desc 598 * compensating for the offset by changing to the ESPFIX segment with
599 * a base address that matches for the difference.
600 */
601 mov %esp, %edx /* load kernel esp */
602 mov PT_OLDESP(%esp), %eax /* load userspace esp */
603 mov %dx, %ax /* eax: new kernel esp */
604 sub %eax, %edx /* offset (low word is 0) */
605 PER_CPU(gdt_page, %ebx)
606 shr $16, %edx
607 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
608 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
600 pushl $__ESPFIX_SS 609 pushl $__ESPFIX_SS
601 CFI_ADJUST_CFA_OFFSET 4 610 CFI_ADJUST_CFA_OFFSET 4
602 pushl %eax 611 push %eax /* new kernel esp */
603 CFI_ADJUST_CFA_OFFSET 4 612 CFI_ADJUST_CFA_OFFSET 4
613 /* Disable interrupts, but do not irqtrace this section: we
614 * will soon execute iret and the tracer was already set to
615 * the irqstate after the iret */
604 DISABLE_INTERRUPTS(CLBR_EAX) 616 DISABLE_INTERRUPTS(CLBR_EAX)
605 TRACE_IRQS_OFF 617 lss (%esp), %esp /* switch to espfix segment */
606 lss (%esp), %esp
607 CFI_ADJUST_CFA_OFFSET -8 618 CFI_ADJUST_CFA_OFFSET -8
608 jmp restore_nocheck 619 jmp restore_nocheck
609 CFI_ENDPROC 620 CFI_ENDPROC
@@ -716,15 +727,24 @@ PTREGSCALL(vm86)
716PTREGSCALL(vm86old) 727PTREGSCALL(vm86old)
717 728
718.macro FIXUP_ESPFIX_STACK 729.macro FIXUP_ESPFIX_STACK
719 /* since we are on a wrong stack, we cant make it a C code :( */ 730/*
731 * Switch back for ESPFIX stack to the normal zerobased stack
732 *
733 * We can't call C functions using the ESPFIX stack. This code reads
 734 * the high word of the segment base from the GDT and switches to the
735 * normal stack and adjusts ESP with the matching offset.
736 */
737 /* fixup the stack */
720 PER_CPU(gdt_page, %ebx) 738 PER_CPU(gdt_page, %ebx)
721 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) 739 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
722 addl %esp, %eax 740 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
741 shl $16, %eax
742 addl %esp, %eax /* the adjusted stack pointer */
723 pushl $__KERNEL_DS 743 pushl $__KERNEL_DS
724 CFI_ADJUST_CFA_OFFSET 4 744 CFI_ADJUST_CFA_OFFSET 4
725 pushl %eax 745 pushl %eax
726 CFI_ADJUST_CFA_OFFSET 4 746 CFI_ADJUST_CFA_OFFSET 4
727 lss (%esp), %esp 747 lss (%esp), %esp /* switch to the normal stack segment */
728 CFI_ADJUST_CFA_OFFSET -8 748 CFI_ADJUST_CFA_OFFSET -8
729.endm 749.endm
730.macro UNWIND_ESPFIX_STACK 750.macro UNWIND_ESPFIX_STACK
@@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller)
1154 pushl %edx 1174 pushl %edx
1155 movl 0xc(%esp), %edx 1175 movl 0xc(%esp), %edx
1156 lea 0x4(%ebp), %eax 1176 lea 0x4(%ebp), %eax
1177 movl (%ebp), %ecx
1157 subl $MCOUNT_INSN_SIZE, %edx 1178 subl $MCOUNT_INSN_SIZE, %edx
1158 call prepare_ftrace_return 1179 call prepare_ftrace_return
1159 popl %edx 1180 popl %edx
@@ -1168,6 +1189,7 @@ return_to_handler:
1168 pushl %eax 1189 pushl %eax
1169 pushl %ecx 1190 pushl %ecx
1170 pushl %edx 1191 pushl %edx
1192 movl %ebp, %eax
1171 call ftrace_return_to_handler 1193 call ftrace_return_to_handler
1172 movl %eax, 0xc(%esp) 1194 movl %eax, 0xc(%esp)
1173 popl %edx 1195 popl %edx
@@ -1329,7 +1351,7 @@ nmi_stack_correct:
1329 xorl %edx,%edx # zero error code 1351 xorl %edx,%edx # zero error code
1330 movl %esp,%eax # pt_regs pointer 1352 movl %esp,%eax # pt_regs pointer
1331 call do_nmi 1353 call do_nmi
1332 jmp restore_nocheck_notrace 1354 jmp restore_all_notrace
1333 CFI_ENDPROC 1355 CFI_ENDPROC
1334 1356
1335nmi_stack_fixup: 1357nmi_stack_fixup:
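
The rewritten ESPFIX path above replaces the patch_espfix_desc() call with inline arithmetic: it builds a new ESP whose high word already matches the userspace ESP and moves the difference into the segment base, so that base + ESP still reaches the kernel stack while iret's partial ESP restore leaves the correct high word in place. A worked numeric example of that arithmetic (the esp values are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t kesp = 0xc1234f10;	/* kernel %esp (example value)        */
	uint32_t uesp = 0xbfff8e20;	/* userspace ESP from the iret frame  */

	/* mov %dx,%ax: new esp = user high word | kernel low word */
	uint32_t new_esp = (uesp & 0xffff0000u) | (kesp & 0x0000ffffu);

	/* sub %eax,%edx: offset with a zero low word, becomes the segment base */
	uint32_t base = kesp - new_esp;

	printf("segment base       = %#010x\n", base);
	printf("base + new esp     = %#010x (== kernel esp %#010x)\n",
	       base + new_esp, kesp);
	printf("high word of esp   = %#06x (== high word of user ESP)\n",
	       new_esp >> 16);
	return 0;
}
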
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index de74f0a3e0ed..c251be745107 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller)
135 135
136 leaq 8(%rbp), %rdi 136 leaq 8(%rbp), %rdi
137 movq 0x38(%rsp), %rsi 137 movq 0x38(%rsp), %rsi
138 movq (%rbp), %rdx
138 subq $MCOUNT_INSN_SIZE, %rsi 139 subq $MCOUNT_INSN_SIZE, %rsi
139 140
140 call prepare_ftrace_return 141 call prepare_ftrace_return
@@ -150,6 +151,7 @@ GLOBAL(return_to_handler)
150 /* Save the return values */ 151 /* Save the return values */
151 movq %rax, (%rsp) 152 movq %rax, (%rsp)
152 movq %rdx, 8(%rsp) 153 movq %rdx, 8(%rsp)
154 movq %rbp, %rdi
153 155
154 call ftrace_return_to_handler 156 call ftrace_return_to_handler
155 157
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index b79c5533c421..d94e1ea3b9fe 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void)
408 * Hook the return address and push it in the stack of return addrs 408 * Hook the return address and push it in the stack of return addrs
409 * in current thread info. 409 * in current thread info.
410 */ 410 */
411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) 411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
412 unsigned long frame_pointer)
412{ 413{
413 unsigned long old; 414 unsigned long old;
414 int faulted; 415 int faulted;
@@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
453 return; 454 return;
454 } 455 }
455 456
456 if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { 457 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
458 frame_pointer) == -EBUSY) {
457 *parent = old; 459 *parent = old;
458 return; 460 return;
459 } 461 }
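
prepare_ftrace_return() now records the caller's frame pointer alongside the hooked return address, so the return side can verify it is unwinding the frame it actually hooked. The toy return stack below illustrates why storing the frame pointer helps; push_return()/pop_return() and the fixed depth are inventions for the example, not the ftrace API.

#include <stdio.h>

/* toy return-address stack, analogous to what ftrace_push_return_trace()
 * maintains per task; keeping the frame pointer lets the pop side detect
 * a mismatched or skipped frame instead of returning to the wrong place. */
struct ret_entry {
	unsigned long ret;	/* original return address         */
	unsigned long fp;	/* frame pointer when it was hooked */
};

static struct ret_entry stack[16];
static int depth;

static int push_return(unsigned long ret, unsigned long fp)
{
	if (depth >= 16)
		return -1;	/* -EBUSY in the kernel version */
	stack[depth].ret = ret;
	stack[depth].fp = fp;
	depth++;
	return 0;
}

static unsigned long pop_return(unsigned long fp)
{
	if (depth == 0 || stack[depth - 1].fp != fp) {
		fprintf(stderr, "frame pointer mismatch!\n");
		return 0;
	}
	return stack[--depth].ret;
}

int main(void)
{
	push_return(0x1000, 0xbf00);	/* hook a call, remember (ret, fp) */
	printf("pop with right fp: %#lx\n", pop_return(0xbf00));

	push_return(0x2000, 0xbe00);
	printf("pop with wrong fp: %#lx\n", pop_return(0xbd00));
	return 0;
}
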
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index dc5ed4bdd88d..0d98a01cbdb2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/page_types.h> 14#include <asm/page_types.h>
15#include <asm/pgtable_types.h> 15#include <asm/pgtable_types.h>
16#include <asm/desc.h>
17#include <asm/cache.h> 16#include <asm/cache.h>
18#include <asm/thread_info.h> 17#include <asm/thread_info.h>
19#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
@@ -603,7 +602,11 @@ ignore_int:
603#endif 602#endif
604 iret 603 iret
605 604
606.section .cpuinit.data,"wa" 605#ifndef CONFIG_HOTPLUG_CPU
606 __CPUINITDATA
607#else
608 __REFDATA
609#endif
607.align 4 610.align 4
608ENTRY(initial_code) 611ENTRY(initial_code)
609 .long i386_start_kernel 612 .long i386_start_kernel
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 54b29bb24e71..fa54f78e2a05 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -12,7 +12,6 @@
12#include <linux/linkage.h> 12#include <linux/linkage.h>
13#include <linux/threads.h> 13#include <linux/threads.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/pgtable.h> 16#include <asm/pgtable.h>
18#include <asm/page.h> 17#include <asm/page.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 81408b93f887..dedc2bddf7a5 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev)
510{ 510{
511 511
512 if (request_irq(dev->irq, hpet_interrupt_handler, 512 if (request_irq(dev->irq, hpet_interrupt_handler,
513 IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) 513 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
514 dev->name, dev))
514 return -1; 515 return -1;
515 516
516 disable_irq(dev->irq); 517 disable_irq(dev->irq);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d4..5cf36c053ac4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/timex.h>
10#include <linux/delay.h> 11#include <linux/delay.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/io.h> 13#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269beab..270ff83efc11 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
12 12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15struct mm_struct init_mm = INIT_MM(init_mm);
16 15
17/* 16/*
18 * Initial thread structure. 17 * Initial thread structure.
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
187#ifdef CONFIG_X86_THERMAL_VECTOR 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif 189#endif
190#ifdef CONFIG_X86_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a78ecad0c900..c664d515f613 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void)
200 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
201} 201}
202 202
203static void paravirt_ops_setup(void) 203static void __init paravirt_ops_setup(void)
204{ 204{
205 pv_info.name = "KVM"; 205 pv_info.name = "KVM";
206 pv_info.paravirt_enabled = 1; 206 pv_info.paravirt_enabled = 1;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
347 347
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
352}; 352};
353 353
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9c4461501fcb..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = {
236static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
237 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
238 .name = "microcode", 238 .name = "microcode",
239 .devnode = "cpu/microcode",
239 .fops = &microcode_fops, 240 .fops = &microcode_fops,
240}; 241};
241 242
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec626..98fd6cd4e3a4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 196 .notifier_call = msr_class_cpu_callback,
197}; 197};
198 198
199static char *msr_nodename(struct device *dev)
200{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202}
203
199static int __init msr_init(void) 204static int __init msr_init(void)
200{ 205{
201 int i, err = 0; 206 int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
212 err = PTR_ERR(msr_class); 217 err = PTR_ERR(msr_class);
213 goto out_chrdev; 218 goto out_chrdev;
214 } 219 }
220 msr_class->nodename = msr_nodename;
215 for_each_online_cpu(i) { 221 for_each_online_cpu(i) {
216 err = msr_device_create(i); 222 err = msr_device_create(i);
217 if (err != 0) 223 if (err != 0)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 745579bc8256..1a041bcf506b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int no_iommu __read_mostly;
32/* Set this to 1 if there is a HW IOMMU in the system */ 32/* Set this to 1 if there is a HW IOMMU in the system */
33int iommu_detected __read_mostly = 0; 33int iommu_detected __read_mostly = 0;
34 34
35int iommu_pass_through;
36
35dma_addr_t bad_dma_address __read_mostly = 0; 37dma_addr_t bad_dma_address __read_mostly = 0;
36EXPORT_SYMBOL(bad_dma_address); 38EXPORT_SYMBOL(bad_dma_address);
37 39
@@ -210,6 +212,10 @@ static __init int iommu_setup(char *p)
210 if (!strncmp(p, "soft", 4)) 212 if (!strncmp(p, "soft", 4))
211 swiotlb = 1; 213 swiotlb = 1;
212#endif 214#endif
215 if (!strncmp(p, "pt", 2)) {
216 iommu_pass_through = 1;
217 return 1;
218 }
213 219
214 gart_parse_options(p); 220 gart_parse_options(p);
215 221
@@ -290,6 +296,8 @@ static int __init pci_iommu_init(void)
290void pci_iommu_shutdown(void) 296void pci_iommu_shutdown(void)
291{ 297{
292 gart_iommu_shutdown(); 298 gart_iommu_shutdown();
299
300 amd_iommu_shutdown();
293} 301}
294/* Must execute after PCI subsystem */ 302/* Must execute after PCI subsystem */
295fs_initcall(pci_iommu_init); 303fs_initcall(pci_iommu_init);
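
The iommu= handling gains a "pt" token that turns on pass-through mode. A stripped-down sketch of just the token matching; the real iommu_setup() walks a comma-separated list and also feeds the option string to gart_parse_options().

#include <stdio.h>
#include <string.h>

static int no_iommu, swiotlb, iommu_pass_through;

/* each recognised prefix flips a global; "pt" now selects pass-through */
static void parse_iommu_opt(const char *p)
{
	if (!strncmp(p, "off", 3))
		no_iommu = 1;
	if (!strncmp(p, "soft", 4))
		swiotlb = 1;
	if (!strncmp(p, "pt", 2))
		iommu_pass_through = 1;
}

int main(void)
{
	parse_iommu_opt("pt");
	printf("no_iommu=%d swiotlb=%d pass_through=%d\n",
	       no_iommu, swiotlb, iommu_pass_through);
	return 0;
}
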
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index cfd9f9063896..d2e56b8f48e7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
675 nommu: 675 nommu:
676 /* Should not happen anymore */ 676 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 KERN_WARNING "falling back to iommu=soft.\n"); 678 "falling back to iommu=soft.\n");
679 return -1; 679 return -1;
680} 680}
681 681
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a1712f2b50f1..6af96ee44200 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
71{ 71{
72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 74 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
75 iommu_pass_through)
75 swiotlb = 1; 76 swiotlb = 1;
76#endif 77#endif
77 if (swiotlb_force) 78 if (swiotlb_force)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3bb2be1649bd..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,7 +63,7 @@ void arch_task_cache_init(void)
63 task_xstate_cachep = 63 task_xstate_cachep =
64 kmem_cache_create("task_xstate", xstate_size, 64 kmem_cache_create("task_xstate", xstate_size,
65 __alignof__(union thread_xstate), 65 __alignof__(union thread_xstate),
66 SLAB_PANIC, NULL); 66 SLAB_PANIC | SLAB_NOTRACK, NULL);
67} 67}
68 68
69/* 69/*
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
60 "adc %5,%%edx ; " 60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2) 61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__ 63#elif defined(__x86_64__)
64 __asm__ ( 64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax" 65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); 66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
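
scale_delta() computes (delta * mul_frac) >> 32 with a widening multiply, which is what the inline assembly guarded by the corrected #elif does on x86-64. A portable sketch of the same arithmetic using the GCC/Clang __int128 extension; the example frequencies are made up, and the real function also applies tsc_shift before multiplying.

#include <stdio.h>
#include <stdint.h>

/* (delta * mul_frac) >> 32, keeping the top 64 bits of the 96-bit product */
static uint64_t scale(uint64_t delta, uint32_t mul_frac)
{
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	/* mul_frac ~= 2^32 * (1 GHz nanosecond rate / 2.4 GHz TSC rate) */
	uint32_t mul_frac = 0x6aaaaaabu;
	uint64_t tsc_delta = 2400000000ull;	/* one second of a 2.4 GHz TSC */

	printf("scaled delta = %llu ns\n",
	       (unsigned long long)scale(tsc_delta, mul_frac));
	return 0;
}
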
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..a06e8d101844 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h>
6#include <acpi/reboot.h> 7#include <acpi/reboot.h>
7#include <asm/io.h> 8#include <asm/io.h>
8#include <asm/apic.h> 9#include <asm/apic.h>
@@ -17,7 +18,6 @@
17#include <asm/cpu.h> 18#include <asm/cpu.h>
18 19
19#ifdef CONFIG_X86_32 20#ifdef CONFIG_X86_32
20# include <linux/dmi.h>
21# include <linux/ctype.h> 21# include <linux/ctype.h>
22# include <linux/mc146818rtc.h> 22# include <linux/mc146818rtc.h>
23#else 23#else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 }, 250 },
251 }, 251 },
252 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
253 .callback = set_bios_reboot,
254 .ident = "CompuLab SBC-FITPC2",
255 .matches = {
256 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
257 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
258 },
259 },
252 { } 260 { }
253}; 261};
254 262
@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);
396 404
397#endif /* CONFIG_X86_32 */ 405#endif /* CONFIG_X86_32 */
398 406
407/*
 408 * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot
409 */
410static int __init set_pci_reboot(const struct dmi_system_id *d)
411{
412 if (reboot_type != BOOT_CF9) {
413 reboot_type = BOOT_CF9;
414 printk(KERN_INFO "%s series board detected. "
415 "Selecting PCI-method for reboots.\n", d->ident);
416 }
417 return 0;
418}
419
420static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
421 { /* Handle problems with rebooting on Apple MacBook5 */
422 .callback = set_pci_reboot,
423 .ident = "Apple MacBook5",
424 .matches = {
425 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
426 DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
427 },
428 },
429 { /* Handle problems with rebooting on Apple MacBookPro5 */
430 .callback = set_pci_reboot,
431 .ident = "Apple MacBookPro5",
432 .matches = {
433 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
434 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
435 },
436 },
437 { }
438};
439
440static int __init pci_reboot_init(void)
441{
442 dmi_check_system(pci_reboot_dmi_table);
443 return 0;
444}
445core_initcall(pci_reboot_init);
446
399static inline void kb_wait(void) 447static inline void kb_wait(void)
400{ 448{
401 int i; 449 int i;
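
pci_reboot_dmi_table[] follows the usual DMI quirk pattern: scan a table of vendor/product substrings and run a callback for each match. A toy version of that pattern follows; struct quirk, use_pci_reboot() and the plain strstr() matching are simplifications of the dmi_check_system() machinery, not its real interface.

#include <stdio.h>
#include <string.h>

/* toy version of a DMI quirk table like pci_reboot_dmi_table above */
struct quirk {
	const char *vendor;
	const char *product;
	void (*apply)(const char *ident);
	const char *ident;
};

static void use_pci_reboot(const char *ident)
{
	printf("%s detected, selecting PCI (0xcf9) reboot method\n", ident);
}

static const struct quirk quirks[] = {
	{ "Apple Inc.", "MacBook5",    use_pci_reboot, "Apple MacBook5" },
	{ "Apple Inc.", "MacBookPro5", use_pci_reboot, "Apple MacBookPro5" },
	{ NULL, NULL, NULL, NULL },
};

/* substring matching, roughly what the DMI string comparison amounts to */
static void check_quirks(const char *vendor, const char *product)
{
	const struct quirk *q;

	for (q = quirks; q->vendor; q++)
		if (strstr(vendor, q->vendor) && strstr(product, q->product))
			q->apply(q->ident);
}

int main(void)
{
	check_quirks("Apple Inc.", "MacBookPro5,3");
	check_quirks("LENOVO", "ThinkPad X200");
	return 0;
}
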
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index be5ae80f897f..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align)
289 return ret; 289 return ret;
290} 290}
291 291
292#ifdef CONFIG_X86_64
293static void __init init_gbpages(void)
294{
295 if (direct_gbpages && cpu_has_gbpages)
296 printk(KERN_INFO "Using GB pages for direct mapping\n");
297 else
298 direct_gbpages = 0;
299}
300#else
301static inline void init_gbpages(void)
302{
303}
304#endif
305
292static void __init reserve_brk(void) 306static void __init reserve_brk(void)
293{ 307{
294 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
@@ -658,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
658 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), 672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
659 }, 673 },
660 }, 674 },
675 {
676 /*
677 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
 678 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
 679 * match only DMI_BOARD_NAME and see if there are more bad products
680 * with this vendor.
681 */
682 .callback = dmi_low_memory_corruption,
683 .ident = "AMI BIOS",
684 .matches = {
685 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
686 },
687 },
661#endif 688#endif
662 {} 689 {}
663}; 690};
@@ -871,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
871 898
872 reserve_brk(); 899 reserve_brk();
873 900
901 init_gbpages();
902
874 /* max_pfn_mapped is updated here */ 903 /* max_pfn_mapped is updated here */
875 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 904 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
876 max_pfn_mapped = max_low_pfn_mapped; 905 max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9c3f0823e6aa..07d81916f212 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 124}
125 125
126/* 126/*
127 * Remap allocator 127 * Large page remap allocator
128 * 128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for 129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping. 130 * each cpu and each is remapped into vmalloc area using PMD mapping.
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
137 * better than only using 4k mappings while still being NUMA friendly. 137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata; 140struct pcpul_ent {
141static void **pcpur_ptrs __initdata; 141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
142 148
143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) 149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
144{ 150{
145 size_t off = (size_t)pageno << PAGE_SHIFT; 151 size_t off = (size_t)pageno << PAGE_SHIFT;
146 152
147 if (off >= pcpur_size) 153 if (off >= pcpul_size)
148 return NULL; 154 return NULL;
149 155
150 return virt_to_page(pcpur_ptrs[cpu] + off); 156 return virt_to_page(pcpul_map[cpu].ptr + off);
151} 157}
152 158
153static ssize_t __init setup_pcpu_remap(size_t static_size) 159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
154{ 160{
155 static struct vm_struct vm; 161 size_t map_size, dyn_size;
156 size_t ptrs_size, dyn_size;
157 unsigned int cpu; 162 unsigned int cpu;
163 int i, j;
158 ssize_t ret; 164 ssize_t ret;
159 165
160 /* 166 if (!chosen) {
161 * If large page isn't supported, there's no benefit in doing 167 size_t vm_size = VMALLOC_END - VMALLOC_START;
162 * this. Also, on non-NUMA, embedding is better. 168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
163 * 169
164 * NOTE: disabled for now. 170 /* on non-NUMA, embedding is better */
165 */ 171 if (!pcpu_need_numa())
166 if (true || !cpu_has_pse || !pcpu_need_numa()) 172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
167 return -EINVAL; 185 return -EINVAL;
186 }
168 187
169 /* 188 /*
170 * Currently supports only single page. Supporting multiple 189 * Currently supports only single page. Supporting multiple
171 * pages won't be too difficult if it ever becomes necessary. 190 * pages won't be too difficult if it ever becomes necessary.
172 */ 191 */
173 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + 192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
174 PERCPU_DYNAMIC_RESERVE); 193 PERCPU_DYNAMIC_RESERVE);
175 if (pcpur_size > PMD_SIZE) { 194 if (pcpul_size > PMD_SIZE) {
176 pr_warning("PERCPU: static data is larger than large page, " 195 pr_warning("PERCPU: static data is larger than large page, "
177 "can't use large page\n"); 196 "can't use large page\n");
178 return -EINVAL; 197 return -EINVAL;
179 } 198 }
180 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; 199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
181 200
182 /* allocate pointer array and alloc large pages */ 201 /* allocate pointer array and alloc large pages */
183 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); 202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
184 pcpur_ptrs = alloc_bootmem(ptrs_size); 203 pcpul_map = alloc_bootmem(map_size);
185 204
186 for_each_possible_cpu(cpu) { 205 for_each_possible_cpu(cpu) {
187 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); 206 pcpul_map[cpu].cpu = cpu;
188 if (!pcpur_ptrs[cpu]) 207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
189 goto enomem; 212 goto enomem;
213 }
190 214
191 /* 215 /*
192 * Only use pcpur_size bytes and give back the rest. 216 * Only use pcpul_size bytes and give back the rest.
193 * 217 *
194 * Ingo: The 2MB up-rounding bootmem is needed to make 218 * Ingo: The 2MB up-rounding bootmem is needed to make
195 * sure the partial 2MB page is still fully RAM - it's 219 * sure the partial 2MB page is still fully RAM - it's
196 * not well-specified to have a PAT-incompatible area 220 * not well-specified to have a PAT-incompatible area
197 * (unmapped RAM, device memory, etc.) in that hole. 221 * (unmapped RAM, device memory, etc.) in that hole.
198 */ 222 */
199 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), 223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
200 PMD_SIZE - pcpur_size); 224 PMD_SIZE - pcpul_size);
201 225
202 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); 226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
203 } 227 }
204 228
205 /* allocate address and map */ 229 /* allocate address and map */
206 vm.flags = VM_ALLOC; 230 pcpul_vm.flags = VM_ALLOC;
207 vm.size = num_possible_cpus() * PMD_SIZE; 231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
208 vm_area_register_early(&vm, PMD_SIZE); 232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
209 233
210 for_each_possible_cpu(cpu) { 234 for_each_possible_cpu(cpu) {
211 pmd_t *pmd; 235 pmd_t *pmd, pmd_v;
212 236
213 pmd = populate_extra_pmd((unsigned long)vm.addr 237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
214 + cpu * PMD_SIZE); 238 cpu * PMD_SIZE);
215 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), 239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
216 PAGE_KERNEL_LARGE)); 240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
217 } 242 }
218 243
219 /* we're ready, commit */ 244 /* we're ready, commit */
220 pr_info("PERCPU: Remapped at %p with large pages, static data " 245 pr_info("PERCPU: Remapped at %p with large pages, static data "
221 "%zu bytes\n", vm.addr, static_size); 246 "%zu bytes\n", pcpul_vm.addr, static_size);
222 247
223 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
224 PERCPU_FIRST_CHUNK_RESERVE, dyn_size, 249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
225 PMD_SIZE, vm.addr, NULL); 250 PMD_SIZE, pcpul_vm.addr, NULL);
226 goto out_free_ar; 251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
227 262
228enomem: 263enomem:
229 for_each_possible_cpu(cpu) 264 for_each_possible_cpu(cpu)
230 if (pcpur_ptrs[cpu]) 265 if (pcpul_map[cpu].ptr)
231 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); 266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
232 ret = -ENOMEM; 267 free_bootmem(__pa(pcpul_map), map_size);
233out_free_ar: 268 return -ENOMEM;
234 free_bootmem(__pa(pcpur_ptrs), ptrs_size); 269}
235 return ret; 270
271/**
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
291 int left = 0, right = nr_cpu_ids - 1;
292 int pos;
293
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
236} 316}
237#else 317#else
238static ssize_t __init setup_pcpu_remap(size_t static_size) 318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
239{ 319{
240 return -EINVAL; 320 return -EINVAL;
241} 321}
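
pcpu_lpage_remapped() above depends on pcpul_map being sorted by the bootmem pointer, so the PMD-aligned address can be located by binary search and the recycled tail of that PMD page translated back to the vmalloc-side percpu mapping. A self-contained sketch of that lookup follows; PMD_SZ, USED and the vm_base value are made-up stand-ins for PMD_SIZE, pcpul_size and pcpul_vm.addr.

/* Sketch of the sorted-map binary search above; constants are toy values. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PMD_SZ	(2UL << 20)		/* pretend 2MB large page      */
#define USED	(1536UL << 10)		/* first chunk uses 1.5MB of it */

struct ent { unsigned int cpu; uintptr_t ptr; };	/* ptr is PMD aligned */

static int cmp(const void *a, const void *b)
{
	const struct ent *x = a, *y = b;
	return (x->ptr > y->ptr) - (x->ptr < y->ptr);
}

/* Return the remapped (vmalloc-side) address for kaddr if it falls in the
 * recycled tail of some cpu's PMD page, or 0 if it does not. */
static uintptr_t lpage_remapped(const struct ent *map, int n,
				uintptr_t vm_base, uintptr_t kaddr)
{
	uintptr_t pmd_addr = kaddr & ~(PMD_SZ - 1);
	uintptr_t off = kaddr & (PMD_SZ - 1);
	int left = 0, right = n - 1;

	while (left <= right) {
		int pos = (left + right) / 2;

		if (map[pos].ptr < pmd_addr)
			left = pos + 1;
		else if (map[pos].ptr > pmd_addr)
			right = pos - 1;
		else
			return vm_base + map[pos].cpu * PMD_SZ + off;
	}
	return 0;
}

int main(void)
{
	struct ent map[] = { { 1, 6 * PMD_SZ }, { 0, 2 * PMD_SZ } };

	qsort(map, 2, sizeof(map[0]), cmp);	/* as the sort loop above */
	/* An address in cpu1's recycled tail maps into cpu1's vmalloc slot. */
	printf("%#lx\n", (unsigned long)
	       lpage_remapped(map, 2, 0x10000000UL, 6 * PMD_SZ + USED + 64));
	return 0;
}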
@@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
249 * mapping so that it can use PMD mapping without additional TLB 329 * mapping so that it can use PMD mapping without additional TLB
250 * pressure. 330 * pressure.
251 */ 331 */
252static ssize_t __init setup_pcpu_embed(size_t static_size) 332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
253{ 333{
254 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; 334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
255 335
@@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
258 * this. Also, embedding allocation doesn't play well with 338 * this. Also, embedding allocation doesn't play well with
259 * NUMA. 339 * NUMA.
260 */ 340 */
261 if (!cpu_has_pse || pcpu_need_numa()) 341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
262 return -EINVAL; 342 return -EINVAL;
263 343
264 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, 344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
@@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
297 pcpu4k_nr_static_pages = PFN_UP(static_size); 377 pcpu4k_nr_static_pages = PFN_UP(static_size);
298 378
299 /* unaligned allocations can't be freed, round up to page size */ 379 /* unaligned allocations can't be freed, round up to page size */
300 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() 380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
301 * sizeof(pcpu4k_pages[0])); 381 * sizeof(pcpu4k_pages[0]));
302 pcpu4k_pages = alloc_bootmem(pages_size); 382 pcpu4k_pages = alloc_bootmem(pages_size);
303 383
@@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
308 void *ptr; 388 void *ptr;
309 389
310 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); 390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
311 if (!ptr) 391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
312 goto enomem; 394 goto enomem;
395 }
313 396
314 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); 397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
315 pcpu4k_pages[j++] = virt_to_page(ptr); 398 pcpu4k_pages[j++] = virt_to_page(ptr);
@@ -333,6 +416,16 @@ out_free_ar:
333 return ret; 416 return ret;
334} 417}
335 418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
336static inline void setup_percpu_segment(int cpu) 429static inline void setup_percpu_segment(int cpu)
337{ 430{
338#ifdef CONFIG_X86_32 431#ifdef CONFIG_X86_32
@@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu)
346#endif 439#endif
347} 440}
348 441
349/*
350 * Great future plan:
351 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
352 * Always point %gs to its beginning
353 */
354void __init setup_per_cpu_areas(void) 442void __init setup_per_cpu_areas(void)
355{ 443{
356 size_t static_size = __per_cpu_end - __per_cpu_start; 444 size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void)
367 * of large page mappings. Please read comments on top of 455 * of large page mappings. Please read comments on top of
368 * each allocator for details. 456 * each allocator for details.
369 */ 457 */
370 ret = setup_pcpu_remap(static_size); 458 ret = -EINVAL;
371 if (ret < 0) 459 if (strlen(pcpu_chosen_alloc)) {
372 ret = setup_pcpu_embed(static_size); 460 if (strcmp(pcpu_chosen_alloc, "4k")) {
461 if (!strcmp(pcpu_chosen_alloc, "lpage"))
462 ret = setup_pcpu_lpage(static_size, true);
463 else if (!strcmp(pcpu_chosen_alloc, "embed"))
464 ret = setup_pcpu_embed(static_size, true);
465 else
466 pr_warning("PERCPU: unknown allocator %s "
467 "specified\n", pcpu_chosen_alloc);
468 if (ret < 0)
469 pr_warning("PERCPU: %s allocator failed (%zd), "
470 "falling back to 4k\n",
471 pcpu_chosen_alloc, ret);
472 }
473 } else {
474 ret = setup_pcpu_lpage(static_size, false);
475 if (ret < 0)
476 ret = setup_pcpu_embed(static_size, false);
477 }
373 if (ret < 0) 478 if (ret < 0)
374 ret = setup_pcpu_4k(static_size); 479 ret = setup_pcpu_4k(static_size);
375 if (ret < 0) 480 if (ret < 0)
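
The selection logic above first honors an explicit percpu_alloc= choice ("lpage" or "embed"; "4k" skips straight to the last resort and anything else warns), otherwise tries lpage then embed, and always falls back to the 4k allocator if nothing else succeeded. A compact sketch of that fallback chain, with dummy allocators standing in for the real setup_pcpu_*() functions:

/* Sketch of the first-chunk allocator fallback chain; the setup_*()
 * functions are dummies, not the kernel implementations. */
#include <stdio.h>
#include <string.h>

static long setup_lpage(int chosen) { (void)chosen; return -22; } /* -EINVAL */
static long setup_embed(int chosen) { (void)chosen; return 4096; }
static long setup_4k(void)          { return 4096; }

static long pick_allocator(const char *chosen)
{
	long ret = -22;

	if (chosen && strlen(chosen)) {
		if (strcmp(chosen, "4k")) {
			if (!strcmp(chosen, "lpage"))
				ret = setup_lpage(1);
			else if (!strcmp(chosen, "embed"))
				ret = setup_embed(1);
			else
				printf("unknown allocator %s specified\n",
				       chosen);
			if (ret < 0)
				printf("%s allocator failed (%ld), "
				       "falling back to 4k\n", chosen, ret);
		}
	} else {
		ret = setup_lpage(0);
		if (ret < 0)
			ret = setup_embed(0);
	}
	if (ret < 0)
		ret = setup_4k();
	return ret;
}

int main(void)
{
	printf("unit size: %ld\n", pick_allocator("lpage"));
	printf("unit size: %ld\n", pick_allocator(NULL));
	return 0;
}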
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 124d40c575df..77b9689f8edb 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode)
711 unsigned long pa; 711 unsigned long pa;
712 unsigned long m; 712 unsigned long m;
713 unsigned long n; 713 unsigned long n;
714 unsigned long mmr_image;
715 struct bau_desc *adp; 714 struct bau_desc *adp;
716 struct bau_desc *ad2; 715 struct bau_desc *ad2;
717 716
@@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode)
727 n = pa >> uv_nshift; 726 n = pa >> uv_nshift;
728 m = pa & uv_mmask; 727 m = pa & uv_mmask;
729 728
730 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); 729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
731 if (mmr_image) { 730 (n << UV_DESC_BASE_PNODE_SHIFT | m));
732 uv_write_global_mmr64(pnode, (unsigned long)
733 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
735 }
736 731
737 /* 732 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 733 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
@@ -749,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode)
749 * note that base_dest_nodeid is actually a nasid. 744 * note that base_dest_nodeid is actually a nasid.
750 */ 745 */
751 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 746 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
747 ad2->header.dest_subnodeid = 0x10; /* the LB */
752 ad2->header.command = UV_NET_ENDPOINT_INTD; 748 ad2->header.command = UV_NET_ENDPOINT_INTD;
753 ad2->header.int_both = 1; 749 ad2->header.int_both = 1;
754 /* 750 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1e1e27b7d438..5204332f475d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -53,6 +54,7 @@
53#include <asm/traps.h> 54#include <asm/traps.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
55#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/mce.h>
56 58
57#include <asm/mach_traps.h> 59#include <asm/mach_traps.h>
58 60
@@ -64,8 +66,6 @@
64#include <asm/setup.h> 66#include <asm/setup.h>
65#include <asm/traps.h> 67#include <asm/traps.h>
66 68
67#include "cpu/mcheck/mce.h"
68
69asmlinkage int system_call(void); 69asmlinkage int system_call(void);
70 70
71/* Do we ignore FPU interrupts ? */ 71/* Do we ignore FPU interrupts ? */
@@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
346 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 346 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
347 show_registers(regs); 347 show_registers(regs);
348 348
349 if (panic_on_io_nmi)
350 panic("NMI IOCK error: Not continuing");
351
349 /* Re-enable the IOCK line, wait for a few seconds */ 352 /* Re-enable the IOCK line, wait for a few seconds */
350 reason = (reason & 0xf) | 8; 353 reason = (reason & 0xf) | 8;
351 outb(reason, 0x61); 354 outb(reason, 0x61);
@@ -534,6 +537,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 537
535 get_debugreg(condition, 6); 538 get_debugreg(condition, 6);
536 539
540 /* Catch kmemcheck conditions first of all! */
541 if (condition & DR_STEP && kmemcheck_trap(regs))
542 return;
543
537 /* 544 /*
538 * The processor cleared BTF, so don't mark that we need it set. 545 * The processor cleared BTF, so don't mark that we need it set.
539 */ 546 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3e1c057e98fe..71f4368b357e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/clocksource.h> 10#include <linux/clocksource.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/timex.h>
12 13
13#include <asm/hpet.h> 14#include <asm/hpet.h>
14#include <asm/timer.h> 15#include <asm/timer.h>
@@ -274,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
274 * use the TSC value at the transitions to calculate a pretty 275 * use the TSC value at the transitions to calculate a pretty
275 * good value for the TSC frequency. 276
276 */ 277 */
278static inline int pit_verify_msb(unsigned char val)
279{
280 /* Ignore LSB */
281 inb(0x42);
282 return inb(0x42) == val;
283}
284
277static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 285static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
278{ 286{
279 int count; 287 int count;
280 u64 tsc = 0; 288 u64 tsc = 0;
281 289
282 for (count = 0; count < 50000; count++) { 290 for (count = 0; count < 50000; count++) {
283 /* Ignore LSB */ 291 if (!pit_verify_msb(val))
284 inb(0x42);
285 if (inb(0x42) != val)
286 break; 292 break;
287 tsc = get_cycles(); 293 tsc = get_cycles();
288 } 294 }
@@ -335,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
335 * to do that is to just read back the 16-bit counter 341 * to do that is to just read back the 16-bit counter
336 * once from the PIT. 342 * once from the PIT.
337 */ 343 */
338 inb(0x42); 344 pit_verify_msb(0);
339 inb(0x42);
340 345
341 if (pit_expect_msb(0xff, &tsc, &d1)) { 346 if (pit_expect_msb(0xff, &tsc, &d1)) {
342 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { 347 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -347,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
347 * Iterate until the error is less than 500 ppm 352 * Iterate until the error is less than 500 ppm
348 */ 353 */
349 delta -= tsc; 354 delta -= tsc;
350 if (d1+d2 < delta >> 11) 355 if (d1+d2 >= delta >> 11)
351 goto success; 356 continue;
357
358 /*
359 * Check the PIT one more time to verify that
360 * all TSC reads were stable wrt the PIT.
361 *
362 * This also guarantees serialization of the
363 * last cycle read ('d2') in pit_expect_msb.
364 */
365 if (!pit_verify_msb(0xfe - i))
366 break;
367 goto success;
352 } 368 }
353 } 369 }
354 printk("Fast TSC calibration failed\n"); 370 printk("Fast TSC calibration failed\n");
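
The reworked loop above accepts a calibration result only when the combined read uncertainty d1+d2 stays below delta >> 11, i.e. below delta/2048, roughly 500 ppm, and then re-reads the PIT MSB once more so the final TSC read is known to have been serialized against the PIT. A tiny worked check of that error bound, with made-up numbers:

/* Worked check of the "error < ~500 ppm" acceptance test used above. */
#include <stdio.h>

int main(void)
{
	unsigned long long delta = 3200000;	/* TSC cycles over the window  */
	unsigned long d1 = 700, d2 = 800;	/* read uncertainties (cycles) */

	/* The kernel keeps iterating while d1+d2 >= delta >> 11. */
	int ok = (d1 + d2) < (delta >> 11);

	printf("bound = %llu cycles (%.1f ppm of delta), measured = %lu -> %s\n",
	       delta >> 11, 1e6 / 2048.0, d1 + d2,
	       ok ? "accept" : "keep iterating");
	return 0;
}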
@@ -589,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
589 */ 605 */
590 606
591DEFINE_PER_CPU(unsigned long, cyc2ns); 607DEFINE_PER_CPU(unsigned long, cyc2ns);
608DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
592 609
593static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) 610static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
594{ 611{
595 unsigned long long tsc_now, ns_now; 612 unsigned long long tsc_now, ns_now, *offset;
596 unsigned long flags, *scale; 613 unsigned long flags, *scale;
597 614
598 local_irq_save(flags); 615 local_irq_save(flags);
599 sched_clock_idle_sleep_event(); 616 sched_clock_idle_sleep_event();
600 617
601 scale = &per_cpu(cyc2ns, cpu); 618 scale = &per_cpu(cyc2ns, cpu);
619 offset = &per_cpu(cyc2ns_offset, cpu);
602 620
603 rdtscll(tsc_now); 621 rdtscll(tsc_now);
604 ns_now = __cycles_2_ns(tsc_now); 622 ns_now = __cycles_2_ns(tsc_now);
605 623
606 if (cpu_khz) 624 if (cpu_khz) {
607 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 625 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
626 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
627 }
608 628
609 sched_clock_idle_wakeup_event(0); 629 sched_clock_idle_wakeup_event(0);
610 local_irq_restore(flags); 630 local_irq_restore(flags);
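
With the new per-cpu cyc2ns_offset, cycles convert to nanoseconds as ns = (cyc * cyc2ns >> CYC2NS_SCALE_FACTOR) + cyc2ns_offset, and the offset is chosen so the conversion stays continuous across a frequency change: the current TSC value keeps mapping to the current ns value. A minimal numeric sketch; the scale shift of 10 mirrors CYC2NS_SCALE_FACTOR but is an assumption of this example.

/* Sketch of the cyc2ns scale/offset update; SCALE_SHIFT = 10 is assumed. */
#include <stdio.h>

#define SCALE_SHIFT 10
#define NSEC_PER_MSEC 1000000ULL

static unsigned long long scale, offset;

static unsigned long long cycles_2_ns(unsigned long long cyc)
{
	return ((cyc * scale) >> SCALE_SHIFT) + offset;
}

static void set_scale(unsigned long cpu_khz, unsigned long long tsc_now)
{
	unsigned long long ns_now = cycles_2_ns(tsc_now);

	scale  = (NSEC_PER_MSEC << SCALE_SHIFT) / cpu_khz;
	/* Pick the offset so tsc_now still maps to ns_now after the change. */
	offset = ns_now - ((tsc_now * scale) >> SCALE_SHIFT);
}

int main(void)
{
	set_scale(2000000, 0);			/* boot: 2.0 GHz, tsc = 0  */
	unsigned long long before = cycles_2_ns(4000000000ULL);

	set_scale(1000000, 4000000000ULL);	/* cpufreq drop to 1.0 GHz */
	unsigned long long after = cycles_2_ns(4000000000ULL);

	printf("ns at switch: %llu -> %llu (continuous)\n", before, after);
	return 0;
}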
@@ -631,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
631 void *data) 651 void *data)
632{ 652{
633 struct cpufreq_freqs *freq = data; 653 struct cpufreq_freqs *freq = data;
634 unsigned long *lpj, dummy; 654 unsigned long *lpj;
635 655
636 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) 656 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
637 return 0; 657 return 0;
638 658
639 lpj = &dummy; 659 lpj = &boot_cpu_data.loops_per_jiffy;
640 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
641#ifdef CONFIG_SMP 660#ifdef CONFIG_SMP
661 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
642 lpj = &cpu_data(freq->cpu).loops_per_jiffy; 662 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
643#else
644 lpj = &boot_cpu_data.loops_per_jiffy;
645#endif 663#endif
646 664
647 if (!ref_freq) { 665 if (!ref_freq) {
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423fbe2a..95a7289e4b0c 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
441 ap.ds = __USER_DS; 441 ap.ds = __USER_DS;
442 ap.es = __USER_DS; 442 ap.es = __USER_DS;
443 ap.fs = __KERNEL_PERCPU; 443 ap.fs = __KERNEL_PERCPU;
444 ap.gs = 0; 444 ap.gs = __KERNEL_STACK_CANARY;
445 445
446 ap.eflags = 0; 446 ap.eflags = 0;
447 447
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e87882041..78d185d797de 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -112,11 +112,6 @@ SECTIONS
112 _sdata = .; 112 _sdata = .;
113 DATA_DATA 113 DATA_DATA
114 CONSTRUCTORS 114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data 115 } :data
121 116
122#ifdef CONFIG_X86_32 117#ifdef CONFIG_X86_32
@@ -156,10 +151,8 @@ SECTIONS
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 151 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly) 152 *(.data.read_mostly)
158 153
159#ifdef CONFIG_X86_32
160 /* End of data section */ 154 /* End of data section */
161 _edata = .; 155 _edata = .;
162#endif
163 } 156 }
164 157
165#ifdef CONFIG_X86_64 158#ifdef CONFIG_X86_64
@@ -400,8 +393,8 @@ SECTIONS
400 393
401 394
402#ifdef CONFIG_X86_32 395#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 396. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE") 397 "kernel image bigger than KERNEL_IMAGE_SIZE");
405#else 398#else
406/* 399/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load 400 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -414,12 +407,12 @@ INIT_PER_CPU(irq_stack_union);
414/* 407/*
415 * Build-time check on the image size: 408 * Build-time check on the image size:
416 */ 409 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 410. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE") 411 "kernel image bigger than KERNEL_IMAGE_SIZE");
419 412
420#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0), 414. = ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area"); 415 "irq_stack_union is not at start of per-cpu area");
423#endif 416#endif
424 417
425#endif /* CONFIG_X86_32 */ 418#endif /* CONFIG_X86_32 */
@@ -427,7 +420,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
427#ifdef CONFIG_KEXEC 420#ifdef CONFIG_KEXEC
428#include <asm/kexec.h> 421#include <asm/kexec.h>
429 422
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 423. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big") 424 "kexec control code size is too big");
432#endif 425#endif
433 426
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d293ee2..21f68e00524f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
104 ktime_t remaining; 104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106 106
107 if (!ps->pit_timer.period)
108 return 0;
109
107 /* 110 /*
108 * The Counter does not stop when it reaches zero. In 111 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to 112 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c3d6e81a7dc..0ef5bb2b4043 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
489 * 489 *
490 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 490 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
491 * containing more mappings. 491 * containing more mappings.
492 *
493 * Returns the number of rmap entries before the spte was added or zero if
494 * the spte was not added.
495 *
492 */ 496 */
493static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) 497static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
494{ 498{
495 struct kvm_mmu_page *sp; 499 struct kvm_mmu_page *sp;
496 struct kvm_rmap_desc *desc; 500 struct kvm_rmap_desc *desc;
497 unsigned long *rmapp; 501 unsigned long *rmapp;
498 int i; 502 int i, count = 0;
499 503
500 if (!is_rmap_pte(*spte)) 504 if (!is_rmap_pte(*spte))
501 return; 505 return count;
502 gfn = unalias_gfn(vcpu->kvm, gfn); 506 gfn = unalias_gfn(vcpu->kvm, gfn);
503 sp = page_header(__pa(spte)); 507 sp = page_header(__pa(spte));
504 sp->gfns[spte - sp->spt] = gfn; 508 sp->gfns[spte - sp->spt] = gfn;
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
515 } else { 519 } else {
516 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 520 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
517 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 521 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
518 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) 522 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
519 desc = desc->more; 523 desc = desc->more;
524 count += RMAP_EXT;
525 }
520 if (desc->shadow_ptes[RMAP_EXT-1]) { 526 if (desc->shadow_ptes[RMAP_EXT-1]) {
521 desc->more = mmu_alloc_rmap_desc(vcpu); 527 desc->more = mmu_alloc_rmap_desc(vcpu);
522 desc = desc->more; 528 desc = desc->more;
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
525 ; 531 ;
526 desc->shadow_ptes[i] = spte; 532 desc->shadow_ptes[i] = spte;
527 } 533 }
534 return count;
528} 535}
529 536
530static void rmap_desc_remove_entry(unsigned long *rmapp, 537static void rmap_desc_remove_entry(unsigned long *rmapp,
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
754 return young; 761 return young;
755} 762}
756 763
764#define RMAP_RECYCLE_THRESHOLD 1000
765
766static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
767{
768 unsigned long *rmapp;
769
770 gfn = unalias_gfn(vcpu->kvm, gfn);
771 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
772
773 kvm_unmap_rmapp(vcpu->kvm, rmapp);
774 kvm_flush_remote_tlbs(vcpu->kvm);
775}
776
757int kvm_age_hva(struct kvm *kvm, unsigned long hva) 777int kvm_age_hva(struct kvm *kvm, unsigned long hva)
758{ 778{
759 return kvm_handle_hva(kvm, hva, kvm_age_rmapp); 779 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
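
rmap_add() now reports how many reverse-mapping entries were already chained for the gfn, crediting RMAP_EXT for each full descriptor it skips; mmu_set_spte() (below) uses that count to call rmap_recycle() once a gfn accumulates more than RMAP_RECYCLE_THRESHOLD sptes. A simplified sketch of that counting over a chained descriptor list follows; it puts every entry in a descriptor (the kernel stores the first spte directly in the rmap slot) and uses a toy RMAP_EXT of 4.

/* Simplified sketch of the full-descriptor counting in rmap_add(). */
#include <stdio.h>
#include <stdlib.h>

#define RMAP_EXT 4	/* toy value; the kernel uses a larger one */

struct desc {
	void *sptes[RMAP_EXT];
	struct desc *more;
};

/* Skip full descriptors, crediting RMAP_EXT per skipped one (as the kernel
 * does); entries in the tail descriptor are not counted. */
static int count_before_add(struct desc *d, void *new_spte)
{
	int count = 0, i;

	while (d->sptes[RMAP_EXT - 1] && d->more) {
		d = d->more;
		count += RMAP_EXT;
	}
	if (d->sptes[RMAP_EXT - 1]) {		/* tail full: chain a new one */
		d->more = calloc(1, sizeof(*d->more));
		d = d->more;
	}
	for (i = 0; d->sptes[i]; i++)
		;
	d->sptes[i] = new_spte;
	return count;
}

int main(void)
{
	struct desc *head = calloc(1, sizeof(*head));
	int dummy[10], i, count = 0;

	for (i = 0; i < 10; i++)
		count = count_before_add(head, &dummy[i]);
	printf("count reported by the 10th add: %d\n", count);	/* prints 8 */
	return 0;
}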
@@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1407 */ 1427 */
1408void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1428void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1409{ 1429{
1430 int used_pages;
1431
1432 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1433 used_pages = max(0, used_pages);
1434
1410 /* 1435 /*
1411 * If we set the number of mmu pages to be smaller than the 1436
1412 * number of active pages, we must free some mmu pages before we 1437
1413 * change the value 1438 * change the value
1414 */ 1439 */
1415 1440
1416 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > 1441 if (used_pages > kvm_nr_mmu_pages) {
1417 kvm_nr_mmu_pages) { 1442 while (used_pages > kvm_nr_mmu_pages) {
1418 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1419 - kvm->arch.n_free_mmu_pages;
1420
1421 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1422 struct kvm_mmu_page *page; 1443 struct kvm_mmu_page *page;
1423 1444
1424 page = container_of(kvm->arch.active_mmu_pages.prev, 1445 page = container_of(kvm->arch.active_mmu_pages.prev,
1425 struct kvm_mmu_page, link); 1446 struct kvm_mmu_page, link);
1426 kvm_mmu_zap_page(kvm, page); 1447 kvm_mmu_zap_page(kvm, page);
1427 n_used_mmu_pages--; 1448 used_pages--;
1428 } 1449 }
1429 kvm->arch.n_free_mmu_pages = 0; 1450 kvm->arch.n_free_mmu_pages = 0;
1430 } 1451 }
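
The rewrite above first computes used_pages = allocated - free, clamps it at zero, and then zaps pages from the tail of the active (LRU) list until the new ceiling is respected. A compact sketch of that clamp-and-evict loop, using plain counters in place of the kvm_mmu_page list:

/* Sketch of the clamp-and-evict logic; counters stand in for the LRU list. */
#include <stdio.h>

static int shrink_to(int alloc_pages, int free_pages, int new_limit)
{
	int used = alloc_pages - free_pages;
	int zapped = 0;

	if (used < 0)
		used = 0;		/* max(0, used_pages) */

	while (used > new_limit) {	/* zap from the LRU tail */
		used--;
		zapped++;
	}
	return zapped;
}

int main(void)
{
	printf("zapped %d pages\n", shrink_to(40, 4, 24));	/* zaps 12 */
	return 0;
}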
@@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1740{ 1761{
1741 int was_rmapped = 0; 1762 int was_rmapped = 0;
1742 int was_writeble = is_writeble_pte(*shadow_pte); 1763 int was_writeble = is_writeble_pte(*shadow_pte);
1764 int rmap_count;
1743 1765
1744 pgprintk("%s: spte %llx access %x write_fault %d" 1766 pgprintk("%s: spte %llx access %x write_fault %d"
1745 " user_fault %d gfn %lx\n", 1767 " user_fault %d gfn %lx\n",
@@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1781 1803
1782 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1804 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1783 if (!was_rmapped) { 1805 if (!was_rmapped) {
1784 rmap_add(vcpu, shadow_pte, gfn, largepage); 1806 rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
1785 if (!is_rmap_pte(*shadow_pte)) 1807 if (!is_rmap_pte(*shadow_pte))
1786 kvm_release_pfn_clean(pfn); 1808 kvm_release_pfn_clean(pfn);
1809 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1810 rmap_recycle(vcpu, gfn, largepage);
1787 } else { 1811 } else {
1788 if (was_writeble) 1812 if (was_writeble)
1789 kvm_release_pfn_dirty(pfn); 1813 kvm_release_pfn_dirty(pfn);
@@ -2157,7 +2181,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2157 else 2181 else
2158 /* 32 bits PSE 4MB page */ 2182 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2183 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull; 2184 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2161 break; 2185 break;
2162 case PT32E_ROOT_LEVEL: 2186 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] = 2187 context->rsvd_bits_mask[0][2] =
@@ -2170,7 +2194,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2194 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) | 2195 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */ 2196 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull; 2197 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2174 break; 2198 break;
2175 case PT64_ROOT_LEVEL: 2199 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2200 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2186,7 +2210,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2210 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) | 2211 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */ 2212 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull; 2213 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2190 break; 2214 break;
2191 } 2215 }
2192} 2216}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 258e4591e1ca..67785f635399 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
281{ 281{
282 unsigned access = gw->pt_access; 282 unsigned access = gw->pt_access;
283 struct kvm_mmu_page *shadow_page; 283 struct kvm_mmu_page *shadow_page;
284 u64 spte, *sptep; 284 u64 spte, *sptep = NULL;
285 int direct; 285 int direct;
286 gfn_t table_gfn; 286 gfn_t table_gfn;
287 int r; 287 int r;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e07e69e..b1f658ad2f06 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
711 svm->vmcb->control.tsc_offset += delta; 711 svm->vmcb->control.tsc_offset += delta;
712 vcpu->cpu = cpu; 712 vcpu->cpu = cpu;
713 kvm_migrate_timers(vcpu); 713 kvm_migrate_timers(vcpu);
714 svm->asid_generation = 0;
714 } 715 }
715 716
716 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 717 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
1031 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1032 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1032 } 1033 }
1033 1034
1034 svm->vcpu.cpu = svm_data->cpu;
1035 svm->asid_generation = svm_data->asid_generation; 1035 svm->asid_generation = svm_data->asid_generation;
1036 svm->vmcb->control.asid = svm_data->next_asid++; 1036 svm->vmcb->control.asid = svm_data->next_asid++;
1037} 1037}
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm)
2300 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2300 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
2301 2301
2302 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 2302 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2303 if (svm->vcpu.cpu != cpu || 2303 /* FIXME: handle wraparound of asid_generation */
2304 svm->asid_generation != svm_data->asid_generation) 2304 if (svm->asid_generation != svm_data->asid_generation)
2305 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2306} 2306}
2307 2307
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32d6ae8fb60e..29f912927a58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
1277 struct page *pages; 1277 struct page *pages;
1278 struct vmcs *vmcs; 1278 struct vmcs *vmcs;
1279 1279
1280 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 1280 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1281 if (!pages) 1281 if (!pages)
1282 return NULL; 1282 return NULL;
1283 vmcs = page_address(pages); 1283 vmcs = page_address(pages);
@@ -3012,6 +3012,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 return 1; 3012 return 1;
3013} 3013}
3014 3014
3015static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016{
3017 kvm_queue_exception(vcpu, UD_VECTOR);
3018 return 1;
3019}
3020
3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3021static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016{ 3022{
3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3023 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3151,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3151 struct vcpu_vmx *vmx = to_vmx(vcpu); 3157 struct vcpu_vmx *vmx = to_vmx(vcpu);
3152 enum emulation_result err = EMULATE_DONE; 3158 enum emulation_result err = EMULATE_DONE;
3153 3159
3154 preempt_enable();
3155 local_irq_enable(); 3160 local_irq_enable();
3161 preempt_enable();
3156 3162
3157 while (!guest_state_valid(vcpu)) { 3163 while (!guest_state_valid(vcpu)) {
3158 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3164 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3162,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3162 3168
3163 if (err != EMULATE_DONE) { 3169 if (err != EMULATE_DONE) {
3164 kvm_report_emulation_failure(vcpu, "emulation failure"); 3170 kvm_report_emulation_failure(vcpu, "emulation failure");
3165 return; 3171 break;
3166 } 3172 }
3167 3173
3168 if (signal_pending(current)) 3174 if (signal_pending(current))
@@ -3171,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3171 schedule(); 3177 schedule();
3172 } 3178 }
3173 3179
3174 local_irq_disable();
3175 preempt_disable(); 3180 preempt_disable();
3181 local_irq_disable();
3176 3182
3177 vmx->invalid_state_emulation_result = err; 3183 vmx->invalid_state_emulation_result = err;
3178} 3184}
@@ -3198,6 +3204,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3198 [EXIT_REASON_HLT] = handle_halt, 3204 [EXIT_REASON_HLT] = handle_halt,
3199 [EXIT_REASON_INVLPG] = handle_invlpg, 3205 [EXIT_REASON_INVLPG] = handle_invlpg,
3200 [EXIT_REASON_VMCALL] = handle_vmcall, 3206 [EXIT_REASON_VMCALL] = handle_vmcall,
3207 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
3208 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
3209 [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
3210 [EXIT_REASON_VMPTRST] = handle_vmx_insn,
3211 [EXIT_REASON_VMREAD] = handle_vmx_insn,
3212 [EXIT_REASON_VMRESUME] = handle_vmx_insn,
3213 [EXIT_REASON_VMWRITE] = handle_vmx_insn,
3214 [EXIT_REASON_VMOFF] = handle_vmx_insn,
3215 [EXIT_REASON_VMON] = handle_vmx_insn,
3201 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3216 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3202 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3217 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3203 [EXIT_REASON_WBINVD] = handle_wbinvd, 3218 [EXIT_REASON_WBINVD] = handle_wbinvd,
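
The new entries above route every VMX-instruction exit (VMCLEAR, VMLAUNCH, and so on) to handle_vmx_insn(), which simply injects #UD since nested VMX is not handled here. The table itself is the usual designated-initializer dispatch array; a minimal standalone sketch of that pattern follows, with made-up exit-reason numbers rather than the real VMX ones.

/* Sketch of a designated-initializer exit-handler table; reason codes and
 * handlers are illustrative, not the real VMX definitions. */
#include <stdio.h>

#define EXIT_HLT	12
#define EXIT_VMCALL	18
#define EXIT_VMCLEAR	19
#define EXIT_VMON	27
#define NR_EXITS	32

static int handle_hlt(void)      { puts("halt");       return 1; }
static int handle_vmcall(void)   { puts("vmcall");     return 1; }
static int handle_vmx_insn(void) { puts("inject #UD"); return 1; }

static int (*exit_handlers[NR_EXITS])(void) = {
	[EXIT_HLT]     = handle_hlt,
	[EXIT_VMCALL]  = handle_vmcall,
	[EXIT_VMCLEAR] = handle_vmx_insn,
	[EXIT_VMON]    = handle_vmx_insn,
};

static int handle_exit(unsigned reason)
{
	if (reason < NR_EXITS && exit_handlers[reason])
		return exit_handlers[reason]();
	printf("unexpected exit reason %u\n", reason);
	return 0;
}

int main(void)
{
	handle_exit(EXIT_VMCLEAR);	/* guest executed a VMX instruction */
	handle_exit(3);			/* no handler registered */
	return 0;
}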
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 249540f98513..3d4529011828 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr)
704 return false; 704 return false;
705} 705}
706 706
707static bool valid_pat_type(unsigned t)
708{
709 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
710}
711
712static bool valid_mtrr_type(unsigned t)
713{
714 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
715}
716
717static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
718{
719 int i;
720
721 if (!msr_mtrr_valid(msr))
722 return false;
723
724 if (msr == MSR_IA32_CR_PAT) {
725 for (i = 0; i < 8; i++)
726 if (!valid_pat_type((data >> (i * 8)) & 0xff))
727 return false;
728 return true;
729 } else if (msr == MSR_MTRRdefType) {
730 if (data & ~0xcff)
731 return false;
732 return valid_mtrr_type(data & 0xff);
733 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
734 for (i = 0; i < 8 ; i++)
735 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
736 return false;
737 return true;
738 }
739
740 /* variable MTRRs */
741 return valid_mtrr_type(data & 0xff);
742}
743
707static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 744static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
708{ 745{
709 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 746 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
710 747
711 if (!msr_mtrr_valid(msr)) 748 if (!mtrr_valid(vcpu, msr, data))
712 return 1; 749 return 1;
713 750
714 if (msr == MSR_MTRRdefType) { 751 if (msr == MSR_MTRRdefType) {
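
valid_pat_type() and valid_mtrr_type() above encode the sets of legal memory types as bitmasks: 0xf3 keeps types {0, 1, 4, 5, 6, 7} for PAT and 0x73 keeps {0, 1, 4, 5, 6} for MTRRs, so (1 << t) & mask is a one-instruction set-membership test. A short standalone check of the trick:

/* Quick check of the bitmask membership test used by valid_pat_type()
 * and valid_mtrr_type(). */
#include <stdio.h>

static int valid_pat_type(unsigned t)
{
	return t < 8 && (1 << t) & 0xf3;  /* {0,1,4,5,6,7}: UC, WC, WT, WP, WB, UC- */
}

static int valid_mtrr_type(unsigned t)
{
	return t < 8 && (1 << t) & 0x73;  /* {0,1,4,5,6}: type 7 is invalid for MTRRs */
}

int main(void)
{
	unsigned t;

	for (t = 0; t < 9; t++)
		printf("type %u: pat=%d mtrr=%d\n",
		       t, valid_pat_type(t), valid_mtrr_type(t));
	return 0;
}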
@@ -898,6 +935,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
898 case MSR_VM_HSAVE_PA: 935 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0: 936 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1: 937 case MSR_P6_EVNTSEL1:
938 case MSR_K7_EVNTSEL0:
901 data = 0; 939 data = 0;
902 break; 940 break;
903 case MSR_MTRRcap: 941 case MSR_MTRRcap:
@@ -1078,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
1078 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1116 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1079 goto out; 1117 goto out;
1080 r = -E2BIG; 1118 r = -E2BIG;
1081 if (n < num_msrs_to_save) 1119 if (n < msr_list.nmsrs)
1082 goto out; 1120 goto out;
1083 r = -EFAULT; 1121 r = -EFAULT;
1084 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1122 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1085 num_msrs_to_save * sizeof(u32))) 1123 num_msrs_to_save * sizeof(u32)))
1086 goto out; 1124 goto out;
1087 if (copy_to_user(user_msr_list->indices 1125 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1088 + num_msrs_to_save * sizeof(u32),
1089 &emulated_msrs, 1126 &emulated_msrs,
1090 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1127 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1091 goto out; 1128 goto out;
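
The copy_to_user() fix above hinges on C pointer arithmetic: indices is a u32 pointer, so indices + num_msrs_to_save already advances by that many elements, whereas the old + num_msrs_to_save * sizeof(u32) stepped four times too far (the size check now also compares against the caller's nmsrs). A tiny demonstration of the difference:

/* Demonstration of the pointer-arithmetic bug fixed above: adding
 * n * sizeof(u32) to a u32 pointer overshoots by a factor of four. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t indices[16];
	unsigned n = 4;

	uint32_t *right = indices + n;			/* element 4  */
	uint32_t *wrong = indices + n * sizeof(uint32_t); /* element 16 */

	printf("right offset: %td elements, wrong offset: %td elements\n",
	       right - indices, wrong - indices);
	return 0;
}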
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c1b6c232e02b..616de4628d60 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1361,7 +1361,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1361 return 0; 1361 return 0;
1362} 1362}
1363 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1364static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{ 1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); 1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /* 1367 /*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7bc65f0f62c4..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
22 * 22 *
23 * So how does the kernel know it's a Guest? We'll see that later, but let's 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
24 * just say that we end up here where we replace the native functions various 24 * just say that we end up here where we replace the native functions various
25 * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 * "paravirt" structures with our Guest versions, then boot like normal.
26:*/
26 27
27/* 28/*
28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
74 * 75 *
75 * The Guest in our tale is a simple creature: identical to the Host but 76 * The Guest in our tale is a simple creature: identical to the Host but
76 * behaving in simplified but equivalent ways. In particular, the Guest is the 77 * behaving in simplified but equivalent ways. In particular, the Guest is the
77 * same kernel as the Host (or at least, built from the same source code). :*/ 78 * same kernel as the Host (or at least, built from the same source code).
79:*/
78 80
79struct lguest_data lguest_data = { 81struct lguest_data lguest_data = {
80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 82 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
85 .syscall_vec = SYSCALL_VECTOR, 87 .syscall_vec = SYSCALL_VECTOR,
86}; 88};
87 89
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 90/*G:037
91 * async_hcall() is pretty simple: I'm quite proud of it really. We have a
 89 * ring buffer of stored hypercalls which the Host will run through next time we 92
90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 94 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
94 * If we come around to a slot which hasn't been finished, then the table is 97 * If we come around to a slot which hasn't been finished, then the table is
95 * full and we just make the hypercall directly. This has the nice side 98 * full and we just make the hypercall directly. This has the nice side
96 * effect of causing the Host to run all the stored calls in the ring buffer 99 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 100 * which empties it for next time!
101 */
98static void async_hcall(unsigned long call, unsigned long arg1, 102static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3, 103 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4) 104 unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
103 static unsigned int next_call; 107 static unsigned int next_call;
104 unsigned long flags; 108 unsigned long flags;
105 109
106 /* Disable interrupts if not already disabled: we don't want an 110 /*
111 * Disable interrupts if not already disabled: we don't want an
107 * interrupt handler making a hypercall while we're already doing 112 * interrupt handler making a hypercall while we're already doing
108 * one! */ 113 * one!
114 */
109 local_irq_save(flags); 115 local_irq_save(flags);
110 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
111 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
125 local_irq_restore(flags); 131 local_irq_restore(flags);
126} 132}
127 133
128/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 134/*G:035
129 * real optimization trick! 135 * Notice the lazy_hcall() above, rather than hcall(). This is our first real
136 * optimization trick!
130 * 137 *
131 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
132 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 139 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
136 * lguest_leave_lazy_mode(). 143 * lguest_leave_lazy_mode().
137 * 144 *
138 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
139 * future processing: */ 146 * future processing:
147 */
140static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call,
141 unsigned long arg1) 149 unsigned long arg1)
142{ 150{
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
146 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
147} 155}
148 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
149static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
150 unsigned long arg1, 159 unsigned long arg1,
151 unsigned long arg2) 160 unsigned long arg2)
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
181} 190}
182#endif 191#endif
183 192
184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
185 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
186static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
187{ 198{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next)
208 * check there before it tries to deliver an interrupt. 219 * check there before it tries to deliver an interrupt.
209 */ 220 */
210 221
211/* save_flags() is expected to return the processor state (ie. "flags"). The 222/*
223 * save_flags() is expected to return the processor state (ie. "flags"). The
212 * flags word contains all kind of stuff, but in practice Linux only cares 224 * flags word contains all kind of stuff, but in practice Linux only cares
213 * about the interrupt flag. Our "save_flags()" just returns that. */ 225 * about the interrupt flag. Our "save_flags()" just returns that.
226 */
214static unsigned long save_fl(void) 227static unsigned long save_fl(void)
215{ 228{
216 return lguest_data.irq_enabled; 229 return lguest_data.irq_enabled;
@@ -222,13 +235,15 @@ static void irq_disable(void)
222 lguest_data.irq_enabled = 0; 235 lguest_data.irq_enabled = 0;
223} 236}
224 237
225/* Let's pause a moment. Remember how I said these are called so often? 238/*
239 * Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 240 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their 241 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the 242 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use 243 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 244 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */ 245 * C function, then restores it.
246 */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl); 247PV_CALLEE_SAVE_REGS_THUNK(save_fl);
233PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 248PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/ 249/*:*/
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
237extern void lg_irq_enable(void); 252extern void lg_irq_enable(void);
238extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
239 254
240/*M:003 Note that we don't check for outstanding interrupts when we re-enable 255/*M:003
241 * them (or when we unmask an interrupt). This seems to work for the moment, 256 * We could be more efficient in our checking of outstanding interrupts, rather
242 * since interrupts are rare and we'll just get the interrupt on the next timer 257 * than using a branch. One way would be to put the "irq_enabled" field in a
243 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 258 * page by itself, and have the Host write-protect it when an interrupt comes
244 * would be to put the "irq_enabled" field in a page by itself, and have the 259 * in when irqs are disabled. There will then be a page fault as soon as
245 * Host write-protect it when an interrupt comes in when irqs are disabled. 260 * interrupts are re-enabled.
246 * There will then be a page fault as soon as interrupts are re-enabled.
247 * 261 *
248 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
249 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
250 * in, we then disable them for real. This is uncommon, so we could simply use 264 * in, we then disable them for real. This is uncommon, so we could simply use
251 * a hypercall for interrupt control and not worry about efficiency. :*/ 265 * a hypercall for interrupt control and not worry about efficiency.
266:*/
252 267
253/*G:034 268/*G:034
254 * The Interrupt Descriptor Table (IDT). 269 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags);
261static void lguest_write_idt_entry(gate_desc *dt, 276static void lguest_write_idt_entry(gate_desc *dt,
262 int entrynum, const gate_desc *g) 277 int entrynum, const gate_desc *g)
263{ 278{
264 /* The gate_desc structure is 8 bytes long: we hand it to the Host in 279 /*
280 * The gate_desc structure is 8 bytes long: we hand it to the Host in
265 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 281 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
266 * around like this; typesafety wasn't a big concern in Linux's early 282 * around like this; typesafety wasn't a big concern in Linux's early
267 * years. */ 283 * years.
284 */
268 u32 *desc = (u32 *)g; 285 u32 *desc = (u32 *)g;
269 /* Keep the local copy up to date. */ 286 /* Keep the local copy up to date. */
270 native_write_idt_entry(dt, entrynum, g); 287 native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
273} 290}
274 291
275/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 292/*
293 * Changing to a different IDT is very rare: we keep the IDT up-to-date every
276 * time it is written, so we can simply loop through all entries and tell the 294 * time it is written, so we can simply loop through all entries and tell the
277 * Host about them. */ 295 * Host about them.
296 */
278static void lguest_load_idt(const struct desc_ptr *desc) 297static void lguest_load_idt(const struct desc_ptr *desc)
279{ 298{
280 unsigned int i; 299 unsigned int i;
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
306} 325}
307 326
308/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 327/*
328 * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
309 * then tell the Host to reload the entire thing. This operation is so rare 329 * then tell the Host to reload the entire thing. This operation is so rare
310 * that this naive implementation is reasonable. */ 330 * that this naive implementation is reasonable.
331 */
311static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 332static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
312 const void *desc, int type) 333 const void *desc, int type)
313{ 334{
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
317 dt[entrynum].a, dt[entrynum].b); 338 dt[entrynum].a, dt[entrynum].b);
318} 339}
319 340
320/* OK, I lied. There are three "thread local storage" GDT entries which change 341/*
342 * OK, I lied. There are three "thread local storage" GDT entries which change
321 * on every context switch (these three entries are how glibc implements 343 * on every context switch (these three entries are how glibc implements
322 * __thread variables). So we have a hypercall specifically for this case. */ 344 * __thread variables). So we have a hypercall specifically for this case.
345 */
323static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 346static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
324{ 347{
325 /* There's one problem which normal hardware doesn't have: the Host 348 /*
349 * There's one problem which normal hardware doesn't have: the Host
326 * can't handle us removing entries we're currently using. So we clear 350 * can't handle us removing entries we're currently using. So we clear
327 * the GS register here: if it's needed it'll be reloaded anyway. */ 351 * the GS register here: if it's needed it'll be reloaded anyway.
352 */
328 lazy_load_gs(0); 353 lazy_load_gs(0);
329 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 354 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
330} 355}
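
[Editor's note — the "__thread variables" mentioned above can be watched from plain user space; this pthread toy is an editor's illustration with nothing lguest-specific in it. Build with -pthread: each thread gets its own copy of counter, which is what the TLS entries reloaded on every context switch make possible.]

    #include <stdio.h>
    #include <pthread.h>

    /* glibc backs each thread's copy of this with the TLS machinery that those
     * GDT entries (and the %gs segment on 32-bit x86) provide. */
    static __thread int counter;

    static void *worker(void *arg)
    {
            int id = *(int *)arg;
            int i;

            for (i = 0; i < 3; i++)
                    counter++;
            printf("thread %d: counter=%d\n", id, counter);
            return NULL;
    }

    int main(void)
    {
            pthread_t t1, t2;
            int id1 = 1, id2 = 2;

            pthread_create(&t1, NULL, worker, &id1);
            pthread_create(&t2, NULL, worker, &id2);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);
            printf("main: counter=%d\n", counter);  /* untouched: still 0 */
            return 0;
    }
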
331 356
332/*G:038 That's enough excitement for now, back to ploughing through each of 357/*G:038
333 * the different pv_ops structures (we're about 1/3 of the way through). 358 * That's enough excitement for now, back to ploughing through each of the
359 * different pv_ops structures (we're about 1/3 of the way through).
334 * 360 *
335 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 361 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
336 * uses this for some strange applications like Wine. We don't do anything 362 * uses this for some strange applications like Wine. We don't do anything
337 * here, so they'll get an informative and friendly Segmentation Fault. */ 363 * here, so they'll get an informative and friendly Segmentation Fault.
364 */
338static void lguest_set_ldt(const void *addr, unsigned entries) 365static void lguest_set_ldt(const void *addr, unsigned entries)
339{ 366{
340} 367}
341 368
342/* This loads a GDT entry into the "Task Register": that entry points to a 369/*
370 * This loads a GDT entry into the "Task Register": that entry points to a
343 * structure called the Task State Segment. Some comments scattered through the 371 * structure called the Task State Segment. Some comments scattered through the
344 * kernel code indicate that this was used for task switching in ages past, along 372 * kernel code indicate that this was used for task switching in ages past, along
345 * with blood sacrifice and astrology. 373 * with blood sacrifice and astrology.
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
347 * Now there's nothing interesting in here that we don't get told elsewhere. 375 * Now there's nothing interesting in here that we don't get told elsewhere.
348 * But the native version uses the "ltr" instruction, which makes the Host 376 * But the native version uses the "ltr" instruction, which makes the Host
349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 377 * complain to the Guest about a Segmentation Fault and it'll oops. So we
350 * override the native version with a do-nothing version. */ 378 * override the native version with a do-nothing version.
379 */
351static void lguest_load_tr_desc(void) 380static void lguest_load_tr_desc(void)
352{ 381{
353} 382}
354 383
355/* The "cpuid" instruction is a way of querying both the CPU identity 384/*
385 * The "cpuid" instruction is a way of querying both the CPU identity
356 * (manufacturer, model, etc) and its features. It was introduced before the 386 * (manufacturer, model, etc) and its features. It was introduced before the
357 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 387 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
358 * As you might imagine, after a decade and a half this treatment, it is now a 388 * As you might imagine, after a decade and a half this treatment, it is now a
359 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 389 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
360 * 390 *
361 * This instruction even has its own Wikipedia entry. The Wikipedia entry 391 * This instruction even has its own Wikipedia entry. The Wikipedia entry
362 * has been translated into 4 languages. I am not making this up! 392 * has been translated into 5 languages. I am not making this up!
363 * 393 *
364 * We could get funky here and identify ourselves as "GenuineLguest", but 394 * We could get funky here and identify ourselves as "GenuineLguest", but
365 * instead we just use the real "cpuid" instruction. Then I pretty much turned 395 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void)
371 * Replacing the cpuid so we can turn features off is great for the kernel, but 401 * Replacing the cpuid so we can turn features off is great for the kernel, but
372 * anyone (including userspace) can just use the raw "cpuid" instruction and 402 * anyone (including userspace) can just use the raw "cpuid" instruction and
373 * the Host won't even notice since it isn't privileged. So we try not to get 403 * the Host won't even notice since it isn't privileged. So we try not to get
374 * too worked up about it. */ 404 * too worked up about it.
405 */
375static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 406static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
376 unsigned int *cx, unsigned int *dx) 407 unsigned int *cx, unsigned int *dx)
377{ 408{
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
379 410
380 native_cpuid(ax, bx, cx, dx); 411 native_cpuid(ax, bx, cx, dx);
381 switch (function) { 412 switch (function) {
382 case 1: /* Basic feature request. */ 413 /*
383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 414 * CPUID 0 gives the highest legal CPUID number (and the ID string).
415 * We futureproof our code a little by sticking to known CPUID values.
416 */
417 case 0:
418 if (*ax > 5)
419 *ax = 5;
420 break;
421
422 /*
423 * CPUID 1 is a basic feature request.
424 *
425 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
426 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
427 */
428 case 1:
384 *cx &= 0x00002201; 429 *cx &= 0x00002201;
385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
386 *dx &= 0x07808151; 430 *dx &= 0x07808151;
387 /* The Host can do a nice optimization if it knows that the 431 /*
432 * The Host can do a nice optimization if it knows that the
388 * kernel mappings (addresses above 0xC0000000 or whatever 433 * kernel mappings (addresses above 0xC0000000 or whatever
389 * PAGE_OFFSET is set to) haven't changed. But Linux calls 434 * PAGE_OFFSET is set to) haven't changed. But Linux calls
390 * flush_tlb_user() for both user and kernel mappings unless 435 * flush_tlb_user() for both user and kernel mappings unless
391 * the Page Global Enable (PGE) feature bit is set. */ 436 * the Page Global Enable (PGE) feature bit is set.
437 */
392 *dx |= 0x00002000; 438 *dx |= 0x00002000;
393 /* We also lie, and say we're family id 5. 6 or greater 439 /*
440 * We also lie, and say we're family id 5. 6 or greater
394 * leads to a rdmsr in early_init_intel which we can't handle. 441 * leads to a rdmsr in early_init_intel which we can't handle.
395 * Family ID is returned as bits 8-11 in ax. */ 442 * Family ID is returned as bits 8-11 in ax.
443 */
396 *ax &= 0xFFFFF0FF; 444 *ax &= 0xFFFFF0FF;
397 *ax |= 0x00000500; 445 *ax |= 0x00000500;
398 break; 446 break;
447 /*
448 * 0x80000000 returns the highest Extended Function, so we futureproof
449 * like we do above by limiting it to known fields.
450 */
399 case 0x80000000: 451 case 0x80000000:
400 /* Futureproof this a little: if they ask how much extended
401 * processor information there is, limit it to known fields. */
402 if (*ax > 0x80000008) 452 if (*ax > 0x80000008)
403 *ax = 0x80000008; 453 *ax = 0x80000008;
404 break; 454 break;
455
456 /*
457 * PAE systems can mark pages as non-executable. Linux calls this the
458 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
459 * Virus Protection). We just turn it off here, since we don't
460 * support it.
461 */
405 case 0x80000001: 462 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20); 463 *dx &= ~(1 << 20);
409 break; 464 break;
410 } 465 }
411} 466}
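
[Editor's note — a user-space sketch of the masking arithmetic above. The mask constants are copied from the code; the starting "register" values are invented, and nothing here talks to a real cpuid.]

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Pretend these came back from cpuid(1); the values are invented. */
            uint32_t ax = 0x000006f6, cx = 0x0004e3bd, dx = 0xbfebfbff;

            cx &= 0x00002201;                     /* SSE3, CMPXCHG16B, SSSE3 only     */
            dx &= 0x07808151;                     /* the DX list in the comment above */
            dx |= 0x00002000;                     /* claim PGE so kernel and user TLB */
                                                  /* flushes stay separate            */
            ax  = (ax & 0xFFFFF0FF) | 0x00000500; /* force family ID to 5             */

            printf("ax=0x%08x (family %u)\n", ax, (ax >> 8) & 0xf);
            printf("cx=0x%08x dx=0x%08x\n", cx, dx);
            return 0;
    }
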
412 467
413/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 468/*
469 * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
414 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 470 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
415 * it. The Host needs to know when the Guest wants to change them, so we have 471 * it. The Host needs to know when the Guest wants to change them, so we have
416 * a whole series of functions like read_cr0() and write_cr0(). 472 * a whole series of functions like read_cr0() and write_cr0().
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
425 * name like "FPUTRAP bit" be a little less cryptic? 481 * name like "FPUTRAP bit" be a little less cryptic?
426 * 482 *
427 * We store cr0 locally because the Host never changes it. The Guest sometimes 483 * We store cr0 locally because the Host never changes it. The Guest sometimes
428 * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 484 * wants to read it and we'd prefer not to bother the Host unnecessarily.
485 */
429static unsigned long current_cr0; 486static unsigned long current_cr0;
430static void lguest_write_cr0(unsigned long val) 487static void lguest_write_cr0(unsigned long val)
431{ 488{
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
438 return current_cr0; 495 return current_cr0;
439} 496}
440 497
441/* Intel provided a special instruction to clear the TS bit for people too cool 498/*
499 * Intel provided a special instruction to clear the TS bit for people too cool
442 * to use write_cr0() to do it. This "clts" instruction is faster, because all 500 * to use write_cr0() to do it. This "clts" instruction is faster, because all
443 * the vowels have been optimized out. */ 501 * the vowels have been optimized out.
502 */
444static void lguest_clts(void) 503static void lguest_clts(void)
445{ 504{
446 lazy_hcall1(LHCALL_TS, 0); 505 lazy_hcall1(LHCALL_TS, 0);
447 current_cr0 &= ~X86_CR0_TS; 506 current_cr0 &= ~X86_CR0_TS;
448} 507}
449 508
450/* cr2 is the virtual address of the last page fault, which the Guest only ever 509/*
510 * cr2 is the virtual address of the last page fault, which the Guest only ever
451 * reads. The Host kindly writes this into our "struct lguest_data", so we 511 * reads. The Host kindly writes this into our "struct lguest_data", so we
452 * just read it out of there. */ 512 * just read it out of there.
513 */
453static unsigned long lguest_read_cr2(void) 514static unsigned long lguest_read_cr2(void)
454{ 515{
455 return lguest_data.cr2; 516 return lguest_data.cr2;
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
458/* See lguest_set_pte() below. */ 519/* See lguest_set_pte() below. */
459static bool cr3_changed = false; 520static bool cr3_changed = false;
460 521
461/* cr3 is the current toplevel pagetable page: the principle is the same as 522/*
523 * cr3 is the current toplevel pagetable page: the principle is the same as
462 * cr0. Keep a local copy, and tell the Host when it changes. The only 524 * cr0. Keep a local copy, and tell the Host when it changes. The only
463 * difference is that our local copy is in lguest_data because the Host needs 525 * difference is that our local copy is in lguest_data because the Host needs
464 * to set it upon our initial hypercall. */ 526 * to set it upon our initial hypercall.
527 */
465static void lguest_write_cr3(unsigned long cr3) 528static void lguest_write_cr3(unsigned long cr3)
466{ 529{
467 lguest_data.pgdir = cr3; 530 lguest_data.pgdir = cr3;
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
506 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
507 * | --------->+---------+ 570 * | --------->+---------+
508 * | | | PADDR1 | 571 * | | | PADDR1 |
509 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
510 * (PMD) page | | | 573 * (PMD) page | | |
511 * | | Lower-level | 574 * | | Lower-level |
512 * | | (PTE) page | 575 * | | (PTE) page |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val)
526 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
527 * page directory page pagetable page 590 * page directory page pagetable page
528 * 591 *
529 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
530 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
531 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
532 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
533 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
534 */ 635 */
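
[Editor's note — the bit-slicing in the two diagrams above is easy to check from user space. An editor's sketch on one example address; the field widths (10/10/12 without PAE, 2/9/9/12 with PAE) are taken straight from the comments.]

    #include <stdio.h>
    #include <stdint.h>

    /* Without PAE: 10 bits of top-level index, 10 of PTE index, 12 of offset. */
    static void decode_2level(uint32_t va)
    {
            printf("2-level: top=%u pte=%u offset=0x%03x\n",
                   va >> 22, (va >> 12) & 0x3ff, va & 0xfff);
    }

    /* With PAE: 2 bits of top index, 9 of PMD index, 9 of PTE index, 12 of offset. */
    static void decode_pae(uint32_t va)
    {
            printf("PAE:     top=%u pmd=%u pte=%u offset=0x%03x\n",
                   va >> 30, (va >> 21) & 0x1ff, (va >> 12) & 0x1ff, va & 0xfff);
    }

    int main(void)
    {
            uint32_t va = 0xC0100000;       /* something above PAGE_OFFSET */

            decode_2level(va);
            decode_pae(va);
            return 0;
    }
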
535 636
536/* The Guest calls this to set a second-level entry (pte), ie. to map a page 637/*
537 * into a process' address space. We set the entry then tell the Host the 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
538 * toplevel and address this corresponds to. The Guest uses one pagetable per 639 * a page into a process' address space. Wetell the Host the toplevel and
539 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 640 * address this corresponds to. The Guest uses one pagetable per process, so
641 * we need to tell the Host which one we're changing (mm->pgd).
642 */
540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
541 pte_t *ptep) 644 pte_t *ptep)
542{ 645{
543#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
647 /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
546#else 650#else
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
548#endif 652#endif
549} 653}
550 654
655/* This is the "set and update" combo-meal-deal version. */
551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
552 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
553{ 658{
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
555 lguest_pte_update(mm, addr, ptep); 660 lguest_pte_update(mm, addr, ptep);
556} 661}
557 662
558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 663/*
664 * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
559 * to set a middle-level entry when PAE is activated. 665 * to set a middle-level entry when PAE is activated.
666 *
560 * Again, we set the entry then tell the Host which page we changed, 667 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */ 668 * and the index of the entry we changed.
669 */
562#ifdef CONFIG_X86_PAE 670#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval) 671static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{ 672{
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
577} 685}
578#else 686#else
579 687
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 688/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 689static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{ 690{
584 native_set_pmd(pmdp, pmdval); 691 native_set_pmd(pmdp, pmdval);
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
587} 694}
588#endif 695#endif
589 696
590/* There are a couple of legacy places where the kernel sets a PTE, but we 697/*
698 * There are a couple of legacy places where the kernel sets a PTE, but we
591 * don't know the top level any more. This is useless for us, since we don't 699 * don't know the top level any more. This is useless for us, since we don't
592 * know which pagetable is changing or what address, so we just tell the Host 700 * know which pagetable is changing or what address, so we just tell the Host
593 * to forget all of them. Fortunately, this is very rare. 701 * to forget all of them. Fortunately, this is very rare.
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
595 * ... except in early boot when the kernel sets up the initial pagetables, 703 * ... except in early boot when the kernel sets up the initial pagetables,
596 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 704 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
597 * the Host anything changed until we've done the first page table switch, 705 * the Host anything changed until we've done the first page table switch,
598 * which brings boot back to 0.25 seconds. */ 706 * which brings boot back to 0.25 seconds.
707 */
599static void lguest_set_pte(pte_t *ptep, pte_t pteval) 708static void lguest_set_pte(pte_t *ptep, pte_t pteval)
600{ 709{
601 native_set_pte(ptep, pteval); 710 native_set_pte(ptep, pteval);
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
604} 713}
605 714
606#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{ 722{
609 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
611 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
612} 726}
613 727
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
615{ 730{
616 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
618} 733}
619 734
620void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
621{ 736{
622 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
623} 738}
624#endif 739#endif
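
[Editor's note — a sketch of the ordering idea behind the comment above lguest_set_pte_atomic(): never let another observer see the half that holds the present bit before the rest of the 64-bit entry is in place. This is an editor's illustration only; the real helpers (native_set_pte_atomic() and friends) differ in detail, and the C11 fences merely stand in for the kernel's barriers.]

    #include <stdio.h>
    #include <stdint.h>
    #include <stdatomic.h>

    /* A toy 64-bit entry stored as two 32-bit halves, like a PAE PTE. */
    struct toy_pte {
            uint32_t low;           /* the flags, including "present", live here */
            uint32_t high;
    };

    /* Install: make the high half visible before the half holding the present
     * bit, so nobody ever sees "present" paired with a half-written rest. */
    static void toy_set_pte(struct toy_pte *p, uint64_t val)
    {
            p->high = (uint32_t)(val >> 32);
            atomic_thread_fence(memory_order_release);      /* ~smp_wmb() */
            p->low = (uint32_t)val;
    }

    /* Tear down in the opposite order: kill the present bit first. */
    static void toy_clear_pte(struct toy_pte *p)
    {
            p->low = 0;
            atomic_thread_fence(memory_order_release);
            p->high = 0;
    }

    int main(void)
    {
            struct toy_pte pte = { 0, 0 };

            toy_set_pte(&pte, 0x0000000123456001ULL);       /* bit 0 = present */
            printf("set:   high=0x%08x low=0x%08x\n", pte.high, pte.low);
            toy_clear_pte(&pte);
            printf("clear: high=0x%08x low=0x%08x\n", pte.high, pte.low);
            return 0;
    }
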
625 740
626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 741/*
742 * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
627 * native page table operations. On native hardware you can set a new page 743 * native page table operations. On native hardware you can set a new page
628 * table entry whenever you want, but if you want to remove one you have to do 744 * table entry whenever you want, but if you want to remove one you have to do
629 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 745 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
632 * called when a valid entry is written, not when it's removed (ie. marked not 748 * called when a valid entry is written, not when it's removed (ie. marked not
633 * present). Instead, this is where we come when the Guest wants to remove a 749 * present). Instead, this is where we come when the Guest wants to remove a
634 * page table entry: we tell the Host to set that entry to 0 (ie. the present 750 * page table entry: we tell the Host to set that entry to 0 (ie. the present
635 * bit is zero). */ 751 * bit is zero).
752 */
636static void lguest_flush_tlb_single(unsigned long addr) 753static void lguest_flush_tlb_single(unsigned long addr)
637{ 754{
638 /* Simply set it to zero: if it was not, it will fault back in. */ 755 /* Simply set it to zero: if it was not, it will fault back in. */
639 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 756 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
640} 757}
641 758
642/* This is what happens after the Guest has removed a large number of entries. 759/*
760 * This is what happens after the Guest has removed a large number of entries.
643 * This tells the Host that any of the page table entries for userspace might 761 * This tells the Host that any of the page table entries for userspace might
644 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 762 * have changed, ie. virtual addresses below PAGE_OFFSET.
763 */
645static void lguest_flush_tlb_user(void) 764static void lguest_flush_tlb_user(void)
646{ 765{
647 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 766 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
648} 767}
649 768
650/* This is called when the kernel page tables have changed. That's not very 769/*
770 * This is called when the kernel page tables have changed. That's not very
651 * common (unless the Guest is using highmem, which makes the Guest extremely 771 * common (unless the Guest is using highmem, which makes the Guest extremely
652 * slow), so it's worth separating this from the user flushing above. */ 772 * slow), so it's worth separating this from the user flushing above.
773 */
653static void lguest_flush_tlb_kernel(void) 774static void lguest_flush_tlb_kernel(void)
654{ 775{
655 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 776 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = {
686 .unmask = enable_lguest_irq, 807 .unmask = enable_lguest_irq,
687}; 808};
688 809
689/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 810/*
811 * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
690 * interrupt (except 128, which is used for system calls), and then tells the 812 * interrupt (except 128, which is used for system calls), and then tells the
691 * Linux infrastructure that each interrupt is controlled by our level-based 813 * Linux infrastructure that each interrupt is controlled by our level-based
692 * lguest interrupt controller. */ 814 * lguest interrupt controller.
815 */
693static void __init lguest_init_IRQ(void) 816static void __init lguest_init_IRQ(void)
694{ 817{
695 unsigned int i; 818 unsigned int i;
696 819
697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 820 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
698 /* Some systems map "vectors" to interrupts weirdly. Lguest has 821 /* Some systems map "vectors" to interrupts weirdly. Not us! */
699 * a straightforward 1 to 1 mapping, so force that here. */
700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 822 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
701 if (i != SYSCALL_VECTOR) 823 if (i != SYSCALL_VECTOR)
702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 824 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
703 } 825 }
704 /* This call is required to set up for 4k stacks, where we have 826
705 * separate stacks for hard and soft interrupts. */ 827 /*
828 * This call is required to set up for 4k stacks, where we have
829 * separate stacks for hard and soft interrupts.
830 */
706 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
707} 832}
708 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them all up in lguest_init_IRQ, we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
709void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
710{ 843{
711 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
724 return lguest_data.time.tv_sec; 857 return lguest_data.time.tv_sec;
725} 858}
726 859
727/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 860/*
861 * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
728 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 862 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
729 * This matches what we want here: if we return 0 from this function, the x86 863 * This matches what we want here: if we return 0 from this function, the x86
730 * TSC clock will give up and not register itself. */ 864 * TSC clock will give up and not register itself.
865 */
731static unsigned long lguest_tsc_khz(void) 866static unsigned long lguest_tsc_khz(void)
732{ 867{
733 return lguest_data.tsc_khz; 868 return lguest_data.tsc_khz;
734} 869}
735 870
736/* If we can't use the TSC, the kernel falls back to our lower-priority 871/*
737 * "lguest_clock", where we read the time value given to us by the Host. */ 872 * If we can't use the TSC, the kernel falls back to our lower-priority
873 * "lguest_clock", where we read the time value given to us by the Host.
874 */
738static cycle_t lguest_clock_read(struct clocksource *cs) 875static cycle_t lguest_clock_read(struct clocksource *cs)
739{ 876{
740 unsigned long sec, nsec; 877 unsigned long sec, nsec;
741 878
742 /* Since the time is in two parts (seconds and nanoseconds), we risk 879 /*
880 * Since the time is in two parts (seconds and nanoseconds), we risk
743 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 881 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
744 * and getting 99 and 0. As Linux tends to come apart under the stress 882 * and getting 99 and 0. As Linux tends to come apart under the stress
745 * of time travel, we must be careful: */ 883 * of time travel, we must be careful:
884 */
746 do { 885 do {
747 /* First we read the seconds part. */ 886 /* First we read the seconds part. */
748 sec = lguest_data.time.tv_sec; 887 sec = lguest_data.time.tv_sec;
749 /* This read memory barrier tells the compiler and the CPU that 888 /*
889 * This read memory barrier tells the compiler and the CPU that
750 * this can't be reordered: we have to complete the above 890 * this can't be reordered: we have to complete the above
751 * before going on. */ 891 * before going on.
892 */
752 rmb(); 893 rmb();
753 /* Now we read the nanoseconds part. */ 894 /* Now we read the nanoseconds part. */
754 nsec = lguest_data.time.tv_nsec; 895 nsec = lguest_data.time.tv_nsec;
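
[Editor's note — one standard way to be careful with a two-part value like this is to read the seconds, then the nanoseconds, then re-check the seconds and retry if they moved. A user-space sketch of that reader follows; acquire loads play the role of rmb(), and the concurrent writer (the Host, in lguest's case) is only implied here.]

    #include <stdio.h>
    #include <stdatomic.h>

    /* A two-part timestamp like lguest_data.time, filled in by some other party. */
    struct toy_time {
            _Atomic long sec;
            _Atomic long nsec;
    };

    static struct toy_time toy_clock;

    /* Read seconds, then nanoseconds, then re-check seconds: if a writer rolled
     * the seconds over in between, go around again. */
    static long long toy_read(void)
    {
            long sec, nsec;

            do {
                    sec  = atomic_load_explicit(&toy_clock.sec,  memory_order_acquire);
                    nsec = atomic_load_explicit(&toy_clock.nsec, memory_order_acquire);
            } while (sec != atomic_load_explicit(&toy_clock.sec, memory_order_acquire));

            return (long long)sec * 1000000000LL + nsec;
    }

    int main(void)
    {
            atomic_store(&toy_clock.sec, 99);
            atomic_store(&toy_clock.nsec, 999999999);
            printf("now = %lld ns\n", toy_read());
            return 0;
    }
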
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = {
772 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 913 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
773}; 914};
774 915
775/* We also need a "struct clock_event_device": Linux asks us to set it to go 916/*
917 * We also need a "struct clock_event_device": Linux asks us to set it to go
776 * off some time in the future. Actually, James Morris figured all this out, I 918 * off some time in the future. Actually, James Morris figured all this out, I
777 * just applied the patch. */ 919 * just applied the patch.
920 */
778static int lguest_clockevent_set_next_event(unsigned long delta, 921static int lguest_clockevent_set_next_event(unsigned long delta,
779 struct clock_event_device *evt) 922 struct clock_event_device *evt)
780{ 923{
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
824 .max_delta_ns = LG_CLOCK_MAX_DELTA, 967 .max_delta_ns = LG_CLOCK_MAX_DELTA,
825}; 968};
826 969
827/* This is the Guest timer interrupt handler (hardware interrupt 0). We just 970/*
828 * call the clockevent infrastructure and it does whatever needs doing. */ 971 * This is the Guest timer interrupt handler (hardware interrupt 0). We just
972 * call the clockevent infrastructure and it does whatever needs doing.
973 */
829static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 974static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
830{ 975{
831 unsigned long flags; 976 unsigned long flags;
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
836 local_irq_restore(flags); 981 local_irq_restore(flags);
837} 982}
838 983
839/* At some point in the boot process, we get asked to set up our timing 984/*
985 * At some point in the boot process, we get asked to set up our timing
840 * infrastructure. The kernel doesn't expect timer interrupts before this, but 986 * infrastructure. The kernel doesn't expect timer interrupts before this, but
841 * we cleverly initialized the "blocked_interrupts" field of "struct 987 * we cleverly initialized the "blocked_interrupts" field of "struct
842 * lguest_data" so that timer interrupts were blocked until now. */ 988 * lguest_data" so that timer interrupts were blocked until now.
989 */
843static void lguest_time_init(void) 990static void lguest_time_init(void)
844{ 991{
845 /* Set up the timer interrupt (0) to go to our simple timer routine */ 992 /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -863,14 +1010,16 @@ static void lguest_time_init(void)
863 * to work. They're pretty simple. 1010 * to work. They're pretty simple.
864 */ 1011 */
865 1012
866/* The Guest needs to tell the Host what stack it expects traps to use. For 1013/*
1014 * The Guest needs to tell the Host what stack it expects traps to use. For
867 * native hardware, this is part of the Task State Segment mentioned above in 1015 * native hardware, this is part of the Task State Segment mentioned above in
868 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1016 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
869 * 1017 *
870 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1018 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
871 * segment), the privilege level (we're privilege level 1, the Host is 0 and 1019 * segment), the privilege level (we're privilege level 1, the Host is 0 and
872 * will not tolerate us trying to use that), the stack pointer, and the number 1020 * will not tolerate us trying to use that), the stack pointer, and the number
873 * of pages in the stack. */ 1021 * of pages in the stack.
1022 */
874static void lguest_load_sp0(struct tss_struct *tss, 1023static void lguest_load_sp0(struct tss_struct *tss,
875 struct thread_struct *thread) 1024 struct thread_struct *thread)
876{ 1025{
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
884 /* FIXME: Implement */ 1033 /* FIXME: Implement */
885} 1034}
886 1035
887/* There are times when the kernel wants to make sure that no memory writes are 1036/*
1037 * There are times when the kernel wants to make sure that no memory writes are
888 * caught in the cache (that they've all reached real hardware devices). This 1038 * caught in the cache (that they've all reached real hardware devices). This
889 * doesn't matter for the Guest which has virtual hardware. 1039 * doesn't matter for the Guest which has virtual hardware.
890 * 1040 *
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void)
898{ 1048{
899} 1049}
900 1050
901/* If the Guest expects to have an Advanced Programmable Interrupt Controller, 1051/*
1052 * If the Guest expects to have an Advanced Programmable Interrupt Controller,
902 * we play dumb by ignoring writes and returning 0 for reads. So it's no 1053 * we play dumb by ignoring writes and returning 0 for reads. So it's no
903 * longer Programmable nor Controlling anything, and I don't think 8 lines of 1054 * longer Programmable nor Controlling anything, and I don't think 8 lines of
904 * code qualifies for Advanced. It will also never interrupt anything. It 1055 * code qualifies for Advanced. It will also never interrupt anything. It
905 * does, however, allow us to get through the Linux boot code. */ 1056 * does, however, allow us to get through the Linux boot code.
1057 */
906#ifdef CONFIG_X86_LOCAL_APIC 1058#ifdef CONFIG_X86_LOCAL_APIC
907static void lguest_apic_write(u32 reg, u32 v) 1059static void lguest_apic_write(u32 reg, u32 v)
908{ 1060{
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void)
951 kvm_hypercall0(LHCALL_HALT); 1103 kvm_hypercall0(LHCALL_HALT);
952} 1104}
953 1105
954/* The SHUTDOWN hypercall takes a string to describe what's happening, and 1106/*
1107 * The SHUTDOWN hypercall takes a string to describe what's happening, and
955 * an argument which says whether this is to restart (reboot) the Guest or not. 1108 * an argument which says whether this is to restart (reboot) the Guest or not.
956 * 1109 *
957 * Note that the Host always prefers that the Guest speak in physical addresses 1110 * Note that the Host always prefers that the Guest speak in physical addresses
958 * rather than virtual addresses, so we use __pa() here. */ 1111 * rather than virtual addresses, so we use __pa() here.
1112 */
959static void lguest_power_off(void) 1113static void lguest_power_off(void)
960{ 1114{
961 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
986 * nice to move it back to lguest_init. Patch welcome... */ 1140 * nice to move it back to lguest_init. Patch welcome... */
987 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
988 1142
989 /* The Linux bootloader header contains an "e820" memory map: the 1143 /*
990 * Launcher populated the first entry with our memory limit. */ 1144 * The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit.
1146 */
991 e820_add_region(boot_params.e820_map[0].addr, 1147 e820_add_region(boot_params.e820_map[0].addr,
992 boot_params.e820_map[0].size, 1148 boot_params.e820_map[0].size,
993 boot_params.e820_map[0].type); 1149 boot_params.e820_map[0].type);
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
996 return "LGUEST"; 1152 return "LGUEST";
997} 1153}
998 1154
999/* We will eventually use the virtio console device to produce console output, 1155/*
1156 * We will eventually use the virtio console device to produce console output,
1000 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1157 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
1001 * console output. */ 1158 * console output.
1159 */
1002static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1160static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1003{ 1161{
1004 char scratch[17]; 1162 char scratch[17];
1005 unsigned int len = count; 1163 unsigned int len = count;
1006 1164
1007 /* We use a nul-terminated string, so we have to make a copy. Icky, 1165 /* We use a nul-terminated string, so we make a copy. Icky, huh? */
1008 * huh? */
1009 if (len > sizeof(scratch) - 1) 1166 if (len > sizeof(scratch) - 1)
1010 len = sizeof(scratch) - 1; 1167 len = sizeof(scratch) - 1;
1011 scratch[len] = '\0'; 1168 scratch[len] = '\0';
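
[Editor's note — the shape of this routine (copy at most 16 bytes into a nul-terminated scratch buffer, hand it over, and report how much was consumed so the caller can loop) is easy to model in user space. An editor's sketch; toy_notify() is an invented stand-in for the LHCALL_NOTIFY hypercall.]

    #include <stdio.h>
    #include <string.h>

    /* Invented stand-in for the LHCALL_NOTIFY hypercall: just print the chunk. */
    static void toy_notify(const char *s)
    {
            printf("[console] %s\n", s);
    }

    /* Copy at most 16 bytes into a nul-terminated scratch buffer, hand it to
     * the "Host", and report how much we consumed so the caller can loop. */
    static int toy_put_chars(const char *buf, int count)
    {
            char scratch[17];
            int len = count;

            if (len > (int)sizeof(scratch) - 1)
                    len = (int)sizeof(scratch) - 1;
            memcpy(scratch, buf, len);
            scratch[len] = '\0';
            toy_notify(scratch);
            return len;
    }

    int main(void)
    {
            const char *msg = "Lguest says hello to the Launcher!";
            int off = 0, n = (int)strlen(msg);

            while (off < n)
                    off += toy_put_chars(msg + off, n - off);
            return 0;
    }
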
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1016 return len; 1173 return len;
1017} 1174}
1018 1175
1019/* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1176/*
1020 * Launcher to reboot us. */ 1177 * Rebooting also tells the Host we're finished, but the RESTART flag tells the
1178 * Launcher to reboot us.
1179 */
1021static void lguest_restart(char *reason) 1180static void lguest_restart(char *reason)
1022{ 1181{
1023 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1182 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason)
1044 * fit comfortably. 1203 * fit comfortably.
1045 * 1204 *
1046 * First we need assembly templates of each of the patchable Guest operations, 1205 * First we need assembly templates of each of the patchable Guest operations,
1047 * and these are in i386_head.S. */ 1206 * and these are in i386_head.S.
1207 */
1048 1208
1049/*G:060 We construct a table from the assembler templates: */ 1209/*G:060 We construct a table from the assembler templates: */
1050static const struct lguest_insns 1210static const struct lguest_insns
@@ -1055,9 +1215,11 @@ static const struct lguest_insns
1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1215 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
1056}; 1216};
1057 1217
1058/* Now our patch routine is fairly simple (based on the native one in 1218/*
1219 * Now our patch routine is fairly simple (based on the native one in
1059 * paravirt.c). If we have a replacement, we copy it in and return how much of 1220 * paravirt.c). If we have a replacement, we copy it in and return how much of
1060 * the available space we used. */ 1221 * the available space we used.
1222 */
1061static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1223static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1062 unsigned long addr, unsigned len) 1224 unsigned long addr, unsigned len)
1063{ 1225{
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1069 1231
1070 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1232 insn_len = lguest_insns[type].end - lguest_insns[type].start;
1071 1233
1072 /* Similarly if we can't fit replacement (shouldn't happen, but let's 1234 /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
1073 * be thorough). */
1074 if (len < insn_len) 1235 if (len < insn_len)
1075 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1236 return paravirt_patch_default(type, clobber, ibuf, addr, len);
1076 1237
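
[Editor's note — a user-space sketch of that patch routine's logic: look up a byte template for the requested operation, copy it in if it fits, and report how many bytes were used. The templates and the table layout here are invented stand-ins; returning 0 takes the place of falling back to paravirt_patch_default().]

    #include <stdio.h>
    #include <string.h>

    /* Invented byte "templates" with a known length, standing in for the
     * lgstart_/lgend_ delimited assembler snippets. */
    static const unsigned char tmpl_cli[]   = { 0xc7, 0x05, 0x00, 0x00, 0x00, 0x00 };
    static const unsigned char tmpl_pushf[] = { 0xa1, 0x00, 0x00, 0x00, 0x00 };

    struct toy_insns {
            const unsigned char *start;
            unsigned int len;
    };

    static const struct toy_insns toy_table[] = {
            { tmpl_cli,   sizeof(tmpl_cli)   },     /* pretend: irq_disable */
            { tmpl_pushf, sizeof(tmpl_pushf) },     /* pretend: save_fl     */
    };

    /* Copy the replacement in if we have one and it fits; report bytes used.
     * Returning 0 stands in for "fall back to the generic patcher". */
    static unsigned int toy_patch(unsigned int type, void *ibuf, unsigned int len)
    {
            if (type >= sizeof(toy_table) / sizeof(toy_table[0]))
                    return 0;
            if (toy_table[type].len > len)
                    return 0;
            memcpy(ibuf, toy_table[type].start, toy_table[type].len);
            return toy_table[type].len;
    }

    int main(void)
    {
            unsigned char site[8] = { 0 };
            unsigned int used = toy_patch(0, site, sizeof(site));

            printf("patched %u bytes, first byte 0x%02x\n", used, site[0]);
            return 0;
    }
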
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1079 return insn_len; 1240 return insn_len;
1080} 1241}
1081 1242
1082/*G:030 Once we get to lguest_init(), we know we're a Guest. The various 1243/*G:029
1244 * Once we get to lguest_init(), we know we're a Guest. The various
1083 * pv_ops structures in the kernel provide points for (almost) every routine we 1245 * pv_ops structures in the kernel provide points for (almost) every routine we
1084 * have to override to avoid privileged instructions. */ 1246 * have to override to avoid privileged instructions.
1247 */
1085__init void lguest_init(void) 1248__init void lguest_init(void)
1086{ 1249{
1087 /* We're under lguest, paravirt is enabled, and we're running at 1250 /* We're under lguest. */
1088 * privilege level 1, not 0 as normal. */
1089 pv_info.name = "lguest"; 1251 pv_info.name = "lguest";
1252 /* Paravirt is enabled. */
1090 pv_info.paravirt_enabled = 1; 1253 pv_info.paravirt_enabled = 1;
1254 /* We're running at privilege level 1, not 0 as normal. */
1091 pv_info.kernel_rpl = 1; 1255 pv_info.kernel_rpl = 1;
1256 /* Everyone except Xen runs with this set. */
1092 pv_info.shared_kernel_pmd = 1; 1257 pv_info.shared_kernel_pmd = 1;
1093 1258
1094 /* We set up all the lguest overrides for sensitive operations. These 1259 /*
1095 * are detailed with the operations themselves. */ 1260 * We set up all the lguest overrides for sensitive operations. These
1261 * are detailed with the operations themselves.
1262 */
1096 1263
1097 /* interrupt-related operations */ 1264 /* Interrupt-related operations */
1098 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1102,11 +1269,11 @@ __init void lguest_init(void)
1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1269 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1103 pv_irq_ops.safe_halt = lguest_safe_halt; 1270 pv_irq_ops.safe_halt = lguest_safe_halt;
1104 1271
1105 /* init-time operations */ 1272 /* Setup operations */
1106 pv_init_ops.memory_setup = lguest_memory_setup; 1273 pv_init_ops.memory_setup = lguest_memory_setup;
1107 pv_init_ops.patch = lguest_patch; 1274 pv_init_ops.patch = lguest_patch;
1108 1275
1109 /* Intercepts of various cpu instructions */ 1276 /* Intercepts of various CPU instructions */
1110 pv_cpu_ops.load_gdt = lguest_load_gdt; 1277 pv_cpu_ops.load_gdt = lguest_load_gdt;
1111 pv_cpu_ops.cpuid = lguest_cpuid; 1278 pv_cpu_ops.cpuid = lguest_cpuid;
1112 pv_cpu_ops.load_idt = lguest_load_idt; 1279 pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1127,7 +1294,7 @@ __init void lguest_init(void)
1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1294 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1295 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1129 1296
1130 /* pagetable management */ 1297 /* Pagetable management */
1131 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1298 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1132 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1299 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1133 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1300 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1149,54 +1316,71 @@ __init void lguest_init(void)
1149 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1316 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1150 1317
1151#ifdef CONFIG_X86_LOCAL_APIC 1318#ifdef CONFIG_X86_LOCAL_APIC
1152 /* apic read/write intercepts */ 1319 /* APIC read/write intercepts */
1153 set_lguest_basic_apic_ops(); 1320 set_lguest_basic_apic_ops();
1154#endif 1321#endif
1155 1322
1156 /* time operations */ 1323 /* Time operations */
1157 pv_time_ops.get_wallclock = lguest_get_wallclock; 1324 pv_time_ops.get_wallclock = lguest_get_wallclock;
1158 pv_time_ops.time_init = lguest_time_init; 1325 pv_time_ops.time_init = lguest_time_init;
1159 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1160 1327
1161 /* Now is a good time to look at the implementations of these functions 1328 /*
1162 * before returning to the rest of lguest_init(). */ 1329 * Now is a good time to look at the implementations of these functions
1330 * before returning to the rest of lguest_init().
1331 */
1163 1332
1164 /*G:070 Now we've seen all the paravirt_ops, we return to 1333 /*G:070
1334 * Now we've seen all the paravirt_ops, we return to
1165 * lguest_init() where the rest of the fairly chaotic boot setup 1335 * lguest_init() where the rest of the fairly chaotic boot setup
1166 * occurs. */ 1336 * occurs.
1337 */
1167 1338
1168 /* The stack protector is a weird thing where gcc places a canary 1339 /*
1340 * The stack protector is a weird thing where gcc places a canary
1169 * value on the stack and then checks it on return. This file is 1341 * value on the stack and then checks it on return. This file is
1170 * compiled with -fno-stack-protector, so we got this far without 1342 * compiled with -fno-stack-protector, so we got this far without
1171 * problems. The value of the canary is kept at offset 20 from the 1343 * problems. The value of the canary is kept at offset 20 from the
1172 * %gs register, so we need to set that up before calling C functions 1344 * %gs register, so we need to set that up before calling C functions
1173 * in other files. */ 1345 * in other files.
1346 */
1174 setup_stack_canary_segment(0); 1347 setup_stack_canary_segment(0);
1175 /* We could just call load_stack_canary_segment(), but we might as 1348
1176 * call switch_to_new_gdt() which loads the whole table and sets up 1349 /*
1177 * the per-cpu segment descriptor register %fs as well. */ 1350 * We could just call load_stack_canary_segment(), but we might as well
1351 * call switch_to_new_gdt() which loads the whole table and sets up the
1352 * per-cpu segment descriptor register %fs as well.
1353 */
1178 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1179 1355
1180 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1181 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1182 1358
1183 /* The Host<->Guest Switcher lives at the top of our address space, and 1359 /*
1360 * The Host<->Guest Switcher lives at the top of our address space, and
1184 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1361 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1185 * it put the answer in lguest_data.reserve_mem */ 1362 * it put the answer in lguest_data.reserve_mem
1363 */
1186 reserve_top_address(lguest_data.reserve_mem); 1364 reserve_top_address(lguest_data.reserve_mem);
1187 1365
1188 /* If we don't initialize the lock dependency checker now, it crashes 1366 /*
1189 * paravirt_disable_iospace. */ 1367 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace.
1369 */
1190 lockdep_init(); 1370 lockdep_init();
1191 1371
1192 /* The IDE code spends about 3 seconds probing for disks: if we reserve 1372 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve
1193 * all the I/O ports up front it can't get them and so doesn't probe. 1374 * all the I/O ports up front it can't get them and so doesn't probe.
1194 * Other device drivers are similar (but less severe). This cuts the 1375 * Other device drivers are similar (but less severe). This cuts the
1195 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1376 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
1377 */
1196 paravirt_disable_iospace(); 1378 paravirt_disable_iospace();
1197 1379
1198 /* This is messy CPU setup stuff which the native boot code does before 1380 /*
1199 * start_kernel, so we have to do it too: 1381 * This is messy CPU setup stuff which the native boot code does before
1382 * start_kernel, so we have to do it too:
1383 */
1200 cpu_detect(&new_cpu_data); 1384 cpu_detect(&new_cpu_data);
1201 /* head.S usually sets up the first capability word, so do it here. */ 1385 /* head.S usually sets up the first capability word, so do it here. */
1202 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1386 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1213,22 +1397,28 @@ __init void lguest_init(void)
1213 acpi_ht = 0; 1397 acpi_ht = 0;
1214#endif 1398#endif
1215 1399
1216 /* We set the preferred console to "hvc". This is the "hypervisor 1400 /*
1401 * We set the preferred console to "hvc". This is the "hypervisor
1217 * virtual console" driver written by the PowerPC people, which we also 1402 * virtual console" driver written by the PowerPC people, which we also
1218 * adapted for lguest's use. */ 1403 * adapted for lguest's use.
1404 */
1219 add_preferred_console("hvc", 0, NULL); 1405 add_preferred_console("hvc", 0, NULL);
1220 1406
1221 /* Register our very early console. */ 1407 /* Register our very early console. */
1222 virtio_cons_early_init(early_put_chars); 1408 virtio_cons_early_init(early_put_chars);
1223 1409
1224 /* Last of all, we set the power management poweroff hook to point to 1410 /*
1411 * Last of all, we set the power management poweroff hook to point to
1225 * the Guest routine to power off, and the reboot hook to our restart 1412 * the Guest routine to power off, and the reboot hook to our restart
1226 * routine. */ 1413 * routine.
1414 */
1227 pm_power_off = lguest_power_off; 1415 pm_power_off = lguest_power_off;
1228 machine_ops.restart = lguest_restart; 1416 machine_ops.restart = lguest_restart;
1229 1417
1230 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1418 /*
1231 * to boot as normal. It never returns. */ 1419 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
1420 * to boot as normal. It never returns.
1421 */
1232 i386_start_kernel(); 1422 i386_start_kernel();
1233} 1423}
1234/* 1424/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 Our story starts with the kernel booting into startup_32 in 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in
9 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
10 * the bootloader (the Launcher in our case). 11 * the bootloader (the Launcher in our case).
11 * 12 *
@@ -21,11 +22,14 @@
21 * data without remembering to subtract __PAGE_OFFSET! 22 * data without remembering to subtract __PAGE_OFFSET!
22 * 23 *
23 * The .section line puts this code in .init.text so it will be discarded after 24 * The .section line puts this code in .init.text so it will be discarded after
24 * boot. */ 25 * boot.
26 */
25.section .init.text, "ax", @progbits 27.section .init.text, "ax", @progbits
26ENTRY(lguest_entry) 28ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 29 /*
28 * us, and also find out where it put our page tables. */ 30 * We make the "initialization" hypercall now to tell the Host about
31 * us, and also find out where it put our page tables.
32 */
29 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
33 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
35 39
36 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
37 * moment. */
38 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
39 42
40/*G:055 We create a macro which puts the assembler code between lgstart_ and 43/*G:055
41 * lgend_ markers. These templates are put in the .text section: they can't be 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
42 * discarded after boot as we may need to patch modules, too. */ 45 * markers. These templates are put in the .text section: they can't be
46 * discarded after boot as we may need to patch modules, too.
47 */
43.text 48.text
44#define LGUEST_PATCH(name, insns...) \ 49#define LGUEST_PATCH(name, insns...) \
45 lgstart_##name: insns; lgend_##name:; \ 50 lgstart_##name: insns; lgend_##name:; \
@@ -48,83 +53,103 @@ ENTRY(lguest_entry)
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 53LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 54LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
50 55
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't 56/*G:033
52 * matter for save_fl and irq_disable later). If we write our routines 57 * But using those wrappers is inefficient (we'll see why that doesn't matter
53 * carefully in assembler, we can avoid clobbering any registers and avoid 58 * for save_fl and irq_disable later). If we write our routines carefully in
54 * jumping through the wrapper functions. 59 * assembler, we can avoid clobbering any registers and avoid jumping through
60 * the wrapper functions.
55 * 61 *
56 * I skipped over our first piece of assembler, but this one is worth studying 62 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe it in easy stages. First, the routine 63 * in a bit more detail so I'll describe it in easy stages. First, the routine to
58 * to enable interrupts: */ 64 * enable interrupts:
65 */
59ENTRY(lg_irq_enable) 66ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 67 /*
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 68 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
69 * X86_EFLAGS_IF (ie. "Interrupts enabled").
70 */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 71 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have 72 /*
73 * But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have 74 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 75 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */ 76 * jump to send_interrupts, otherwise we're done.
77 */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending 78 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts 79 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using 80 /*
81 * One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or 82 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */ 83 * restore any registers at all!
84 */
72 ret 85 ret
73send_interrupts: 86send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number, 87 /*
88 * OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS. 89 * which is LHCALL_SEND_INTERRUPTS.
76 * 90 *
77 * We used not to bother with this pending detection at all, which was 91 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to 92 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7 93 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard 94 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */ 95 * way.
96 */
82 pushl %eax 97 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax 98 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older 99 /*
100 * This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we 101 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */ 102 * create one manually here.
103 */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */
88 popl %eax 106 popl %eax
89 ret 107 ret
90 108
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the 109/*
110 * Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 111 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */ 112 * enabling interrupts again, if it's 0 we're leaving them off.
113 */
94ENTRY(lg_restore_fl) 114ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */ 115 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 116 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and 117 /*
118 * Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can 119 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will 120 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 121 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we 122 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */ 123 * jump to send_interrupts.
124 */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax 125 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts 126 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */ 127 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret 128 ret
129/*:*/
107 130
108/* These demark the EIP range where host should never deliver interrupts. */ 131/* These demark the EIP range where host should never deliver interrupts. */
109.global lguest_noirq_start 132.global lguest_noirq_start
110.global lguest_noirq_end 133.global lguest_noirq_end
111 134
112/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 135/*M:004
113 * it sets the eflags interrupt bit on the stack based on 136 * When the Host reflects a trap or injects an interrupt into the Guest, it
114 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 137 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
115 * restoring it. However, when the Host sets the Guest up for direct traps, 138 * so the Guest iret logic does the right thing when restoring it. However,
116 * such as system calls, the processor is the one to push eflags onto the 139 * when the Host sets the Guest up for direct traps, such as system calls, the
117 * stack, and the interrupt bit will be 1 (in reality, interrupts are always 140 * processor is the one to push eflags onto the stack, and the interrupt bit
118 * enabled in the Guest). 141 * will be 1 (in reality, interrupts are always enabled in the Guest).
119 * 142 *
120 * This turns out to be harmless: the only trap which should happen under Linux 143 * This turns out to be harmless: the only trap which should happen under Linux
121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 144 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
122 * regions), which has to be reflected through the Host anyway. If another 145 * regions), which has to be reflected through the Host anyway. If another
123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 146 * trap *does* go off when interrupts are disabled, the Guest will panic, and
124 * we'll never get to this iret! :*/ 147 * we'll never get to this iret!
148:*/
125 149
126/*G:045 There is one final paravirt_op that the Guest implements, and glancing 150/*G:045
127 * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 151 * There is one final paravirt_op that the Guest implements, and glancing at it
152 * you can see why I left it to last. It's *cool*! It's in *assembler*!
128 * 153 *
129 * The "iret" instruction is used to return from an interrupt or trap. The 154 * The "iret" instruction is used to return from an interrupt or trap. The
130 * stack looks like this: 155 * stack looks like this:
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl)
148 * return to userspace or wherever. Our solution to this is to surround the 173 * return to userspace or wherever. Our solution to this is to surround the
149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 174 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 175 * Host that it is *never* to interrupt us there, even if interrupts seem to be
151 * enabled. */ 176 * enabled.
177 */
152ENTRY(lguest_iret) 178ENTRY(lguest_iret)
153 pushl %eax 179 pushl %eax
154 movl 12(%esp), %eax 180 movl 12(%esp), %eax
155lguest_noirq_start: 181lguest_noirq_start:
156 /* Note the %ss: segment prefix here. Normal data accesses use the 182 /*
183 * Note the %ss: segment prefix here. Normal data accesses use the
157 * "ds" segment, but that will have already been restored for whatever 184 * "ds" segment, but that will have already been restored for whatever
158 * we're returning to (such as userspace): we can't trust it. The %ss: 185 * we're returning to (such as userspace): we can't trust it. The %ss:
159 * prefix makes sure we use the stack segment, which is still valid. */ 186 * prefix makes sure we use the stack segment, which is still valid.
187 */
160 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 188 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
161 popl %eax 189 popl %eax
162 iret 190 iret
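
Taken together, lg_irq_enable and lg_restore_fl implement only a little logic; the C sketch below is a reading aid for that control flow (write the shared flag, then hypercall only when the Host has flagged pending interrupts), as the comments above describe it. The struct layout, field types and send_interrupts_hypercall() stub are simplified stand-ins, not the real lguest_data layout or hypercall interface.

#include <stdint.h>
#include <stdio.h>

#define X86_EFLAGS_IF 0x200u

/* Simplified stand-in for the shared lguest_data page. */
struct lguest_data_sketch {
	uint32_t irq_enabled;	/* 0 or X86_EFLAGS_IF, written by the Guest */
	uint32_t irq_pending;	/* set to X86_EFLAGS_IF by the Host */
};

/* Stand-in for the LHCALL_SEND_INTERRUPTS vmcall. */
static void send_interrupts_hypercall(void)
{
}

/* What lg_irq_enable does: enable, then ask the Host to deliver anything
 * that queued up while interrupts were off. */
static void sketch_irq_enable(struct lguest_data_sketch *d)
{
	d->irq_enabled = X86_EFLAGS_IF;
	if (d->irq_pending)
		send_interrupts_hypercall();
}

/* What lg_restore_fl does: the "testl" in the assembler ANDs the restored
 * flags with irq_pending, so the hypercall happens only when both carry
 * X86_EFLAGS_IF. */
static void sketch_restore_fl(struct lguest_data_sketch *d, uint32_t flags)
{
	d->irq_enabled = flags;
	if (flags & d->irq_pending & X86_EFLAGS_IF)
		send_interrupts_hypercall();
}

int main(void)
{
	struct lguest_data_sketch d = { 0, X86_EFLAGS_IF };

	sketch_irq_enable(&d);		/* irq_pending set -> hypercall */
	sketch_restore_fl(&d, 0);	/* interrupts stay off, no hypercall */
	printf("irq_enabled=%#x\n", (unsigned)d.irq_enabled);
	return 0;
}
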
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f9d35632666b..07c31899c9c2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
11 11
12ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
13 obj-y += atomic64_32.o
13 lib-y += checksum_32.o 14 lib-y += checksum_32.o
14 lib-y += strstr_32.o 15 lib-y += strstr_32.o
15 lib-y += semaphore_32.o string_32.o 16 lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 000000000000..824fa0be55a3
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,230 @@
1#include <linux/compiler.h>
2#include <linux/module.h>
3#include <linux/types.h>
4
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7#include <asm/atomic.h>
8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
10{
11 u32 low = new;
12 u32 high = new >> 32;
13
14 asm volatile(
15 LOCK_PREFIX "cmpxchg8b %1\n"
16 : "+A" (old), "+m" (*ptr)
17 : "b" (low), "c" (high)
18 );
19 return old;
20}
21
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
23{
24 return cmpxchg8b(&ptr->counter, old_val, new_val);
25}
26EXPORT_SYMBOL(atomic64_cmpxchg);
27
28/**
29 * atomic64_xchg - xchg atomic64 variable
30 * @ptr: pointer to type atomic64_t
31 * @new_val: value to assign
32 *
33 * Atomically xchgs the value of @ptr to @new_val and returns
34 * the old value.
35 */
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
37{
38 /*
39 * Try first with a (possibly incorrect) assumption about
40 * what we have there. We'll do two loops most likely,
41 * but we'll get an ownership MESI transaction straight away
42 * instead of a read transaction followed by a
43 * flush-for-ownership transaction:
44 */
45 u64 old_val, real_val = 0;
46
47 do {
48 old_val = real_val;
49
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
51
52 } while (real_val != old_val);
53
54 return old_val;
55}
56EXPORT_SYMBOL(atomic64_xchg);
57
58/**
59 * atomic64_set - set atomic64 variable
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
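
The add/xchg helpers above all share the same loop: guess an old value, let a failed cmpxchg8b hand back what was really there, and retry. Below is a user-space rendition of that shape, using GCC's __sync_val_compare_and_swap() in place of the kernel's LOCK cmpxchg8b wrapper; the file and function names are arbitrary.

/* Build with: gcc -O2 atomic64_demo.c */
#include <stdint.h>
#include <stdio.h>

/* Same structure as atomic64_add_return() above: start from a guessed
 * old value (0) so the very first bus transaction already requests
 * ownership of the cache line, instead of a plain read followed by a
 * separate flush-for-ownership when the store happens. */
static uint64_t add_return64(uint64_t *ptr, uint64_t delta)
{
	uint64_t old_val, new_val, real_val = 0;

	do {
		old_val = real_val;
		new_val = old_val + delta;
		/* Returns the value actually found at *ptr. */
		real_val = __sync_val_compare_and_swap(ptr, old_val, new_val);
	} while (real_val != old_val);

	return new_val;
}

int main(void)
{
	uint64_t v = 40;

	printf("%llu\n", (unsigned long long)add_return64(&v, 2));	/* 42 */
	return 0;
}
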
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a4..ebeafcce04a9 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -5,15 +5,14 @@
5 * Zero a page. 5 * Zero a page.
6 * rdi page 6 * rdi page
7 */ 7 */
8 ALIGN 8ENTRY(clear_page_c)
9clear_page_c:
10 CFI_STARTPROC 9 CFI_STARTPROC
11 movl $4096/8,%ecx 10 movl $4096/8,%ecx
12 xorl %eax,%eax 11 xorl %eax,%eax
13 rep stosq 12 rep stosq
14 ret 13 ret
15 CFI_ENDPROC 14 CFI_ENDPROC
16ENDPROC(clear_page) 15ENDPROC(clear_page_c)
17 16
18ENTRY(clear_page) 17ENTRY(clear_page)
19 CFI_STARTPROC 18 CFI_STARTPROC
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index f118c110af32..6ba0f7bb85ea 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -75,6 +75,7 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user)
78 79
79/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user) 81ENTRY(copy_from_user)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index f4568605d7d5..ff485d361182 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops)
55 55
56 preempt_disable(); 56 preempt_disable();
57 cpu = smp_processor_id(); 57 cpu = smp_processor_id();
58 rdtsc_barrier();
58 rdtscl(bclock); 59 rdtscl(bclock);
59 for (;;) { 60 for (;;) {
61 rdtsc_barrier();
60 rdtscl(now); 62 rdtscl(now);
61 if ((now - bclock) >= loops) 63 if ((now - bclock) >= loops)
62 break; 64 break;
@@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops)
78 if (unlikely(cpu != smp_processor_id())) { 80 if (unlikely(cpu != smp_processor_id())) {
79 loops -= (now - bclock); 81 loops -= (now - bclock);
80 cpu = smp_processor_id(); 82 cpu = smp_processor_id();
83 rdtsc_barrier();
81 rdtscl(bclock); 84 rdtscl(bclock);
82 } 85 }
83 } 86 }
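
The added rdtsc_barrier() calls keep the TSC reads from being reordered with surrounding loads, which could make the measured interval appear shorter than it is. Here is a user-space sketch of the same pattern (x86 with GCC/Clang intrinsics); the kernel helper expands to LFENCE or MFENCE depending on the CPU, and the real delay_tsc() additionally handles preemption and CPU migration, which this leaves out.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc(), _mm_lfence() */

/* Fence before every TSC read, mirroring the rdtsc_barrier(); rdtscl() pairs. */
static inline uint64_t fenced_rdtsc(void)
{
	_mm_lfence();
	return __rdtsc();
}

/* Busy-wait for roughly 'cycles' TSC cycles. */
static void delay_cycles(uint64_t cycles)
{
	uint64_t start = fenced_rdtsc();

	while (fenced_rdtsc() - start < cycles)
		;	/* the kernel version uses cpu_relax() here */
}

int main(void)
{
	delay_cycles(10 * 1000 * 1000);
	puts("done");
	return 0;
}
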
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 1440b9c0547e..caa24aca8115 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
89 rv.msrs = msrs; 89 rv.msrs = msrs;
90 rv.msr_no = msr_no; 90 rv.msr_no = msr_no;
91 91
92 preempt_disable(); 92 this_cpu = get_cpu();
93 /* 93
94 * FIXME: handle the CPU we're executing on separately for now until 94 if (cpumask_test_cpu(this_cpu, mask))
95 * smp_call_function_many has been fixed to not skip it. 95 __rdmsr_on_cpu(&rv);
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99 96
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); 97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable(); 98 put_cpu();
102} 99}
103EXPORT_SYMBOL(rdmsr_on_cpus); 100EXPORT_SYMBOL(rdmsr_on_cpus);
104 101
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
121 rv.msrs = msrs; 118 rv.msrs = msrs;
122 rv.msr_no = msr_no; 119 rv.msr_no = msr_no;
123 120
124 preempt_disable(); 121 this_cpu = get_cpu();
125 /* 122
126 * FIXME: handle the CPU we're executing on separately for now until 123 if (cpumask_test_cpu(this_cpu, mask))
127 * smp_call_function_many has been fixed to not skip it. 124 __wrmsr_on_cpu(&rv);
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131 125
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); 126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable(); 127 put_cpu();
134} 128}
135EXPORT_SYMBOL(wrmsr_on_cpus); 129EXPORT_SYMBOL(wrmsr_on_cpus);
136 130
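
The rewrite above leans on the fact that smp_call_function_many() never runs the callback on the calling CPU, so the caller covers itself while pinned by get_cpu(). The same shape as a stand-alone kernel-style sketch; run_on_cpus() and do_one_cpu() are made-up names standing in for rdmsr_on_cpus()/__rdmsr_on_cpu(), and this is an illustration of the pattern rather than a drop-in helper.

#include <linux/cpumask.h>
#include <linux/smp.h>

/* Hypothetical per-CPU worker; the msr.c versions read or write an MSR
 * into a caller-provided array slot. */
static void do_one_cpu(void *info)
{
}

static void run_on_cpus(const struct cpumask *mask, void *info)
{
	int this_cpu = get_cpu();	/* pins us: disables preemption */

	/* smp_call_function_many() skips the current CPU, so do the local
	 * work here if this CPU is part of the requested mask. */
	if (cpumask_test_cpu(this_cpu, mask))
		do_one_cpu(info);

	/* wait == 1: block until every other CPU in the mask has run it. */
	smp_call_function_many(mask, do_one_cpu, info, 1);

	put_cpu();
}
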
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9ec..1f118d462acc 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
751 751
752 if (retval == -ENOMEM && is_global_init(current)) { 752 if (retval == -ENOMEM && is_global_init(current)) {
753 up_read(&current->mm->mmap_sem); 753 up_read(&current->mm->mmap_sem);
754 congestion_wait(WRITE, HZ/50); 754 congestion_wait(BLK_RW_ASYNC, HZ/50);
755 goto survive; 755 goto survive;
756 } 756 }
757 757
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index ec13cb5f17ed..b7c2849ffb66 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(__strnlen_user);
127 127
128long strnlen_user(const char __user *s, long n) 128long strnlen_user(const char __user *s, long n)
129{ 129{
130 if (!access_ok(VERIFY_READ, s, n)) 130 if (!access_ok(VERIFY_READ, s, 1))
131 return 0; 131 return 0;
132 return __strnlen_user(s, n); 132 return __strnlen_user(s, n);
133} 133}
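
The strnlen_user() change shrinks the up-front access_ok() window from n bytes to a single byte. That is presumably safe because __strnlen_user() walks the string with __get_user() and returns 0 at the first byte that faults, so only the start of the string needs to be vetted; checking the whole n-byte window would reject callers that pass a deliberately generous bound. A toy user-space illustration of why a bounded, fault-tolerant walk does not care that the bound overshoots the readable area (bounded_strnlen() is a stand-in, with 'limit' playing the role of the faulting byte):

#include <stdio.h>

/* Counts the terminating NUL, like strnlen_user(); returns 0 if we "fault"
 * before finding it. */
static size_t bounded_strnlen(const char *s, size_t limit, size_t n)
{
	size_t i;

	for (i = 0; i < n && i < limit; i++)
		if (s[i] == '\0')
			return i + 1;
	return 0;
}

int main(void)
{
	char buf[8] = "hi";

	/* The bound (1000) runs far past the 8 readable bytes, but the walk
	 * stops at the NUL long before that. */
	printf("%zu\n", bounded_strnlen(buf, sizeof(buf), 1000));	/* 3 */
	return 0;
}
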
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
10 10
11obj-$(CONFIG_HIGHMEM) += highmem_32.o 11obj-$(CONFIG_HIGHMEM) += highmem_32.o
12 12
13obj-$(CONFIG_KMEMCHECK) += kmemcheck/
14
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 15obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 16mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 17obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -14,6 +14,7 @@
14 14
15#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
17 18
18/* 19/*
19 * Page fault error code bits: 20 * Page fault error code bits:
@@ -425,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
425} 426}
426 427
427static const char errata93_warning[] = 428static const char errata93_warning[] =
428KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 429KERN_ERR
429KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" 430"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
430KERN_ERR "******* Please consider a BIOS update.\n" 431"******* Working around it, but it may cause SEGVs or burn power.\n"
431KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 432"******* Please consider a BIOS update.\n"
433"******* Disabling USB legacy in the BIOS may also help.\n";
432 434
433/* 435/*
434 * No vm86 mode in 64-bit mode: 436 * No vm86 mode in 64-bit mode:
@@ -695,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
695 if (!printk_ratelimit()) 697 if (!printk_ratelimit())
696 return; 698 return;
697 699
698 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 700 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
699 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 701 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
700 tsk->comm, task_pid_nr(tsk), address, 702 tsk->comm, task_pid_nr(tsk), address,
701 (void *)regs->ip, (void *)regs->sp, error_code); 703 (void *)regs->ip, (void *)regs->sp, error_code);
@@ -951,11 +953,17 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
951 tsk = current; 953 tsk = current;
952 mm = tsk->mm; 954 mm = tsk->mm;
953 955
954 prefetchw(&mm->mmap_sem);
955
956 /* Get the faulting address: */ 956 /* Get the faulting address: */
957 address = read_cr2(); 957 address = read_cr2();
958 958
959 /*
960 * Detect and handle instructions that would cause a page fault for
961 * both a tracked kernel page and a userspace page.
962 */
963 if (kmemcheck_active(regs))
964 kmemcheck_hide(regs);
965 prefetchw(&mm->mmap_sem);
966
959 if (unlikely(kmmio_fault(regs, address))) 967 if (unlikely(kmmio_fault(regs, address)))
960 return; 968 return;
961 969
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
973 * protection error (error_code & 9) == 0. 981 * protection error (error_code & 9) == 0.
974 */ 982 */
975 if (unlikely(fault_in_kernel_space(address))) { 983 if (unlikely(fault_in_kernel_space(address))) {
976 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 984 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
977 vmalloc_fault(address) >= 0) 985 if (vmalloc_fault(address) >= 0)
978 return; 986 return;
987
988 if (kmemcheck_fault(regs, address, error_code))
989 return;
990 }
979 991
980 /* Can handle a stale RO->RW TLB: */ 992 /* Can handle a stale RO->RW TLB: */
981 if (spurious_fault(error_code, address)) 993 if (spurious_fault(error_code, address))
@@ -1102,7 +1114,7 @@ good_area:
1102 * make sure we exit gracefully rather than endlessly redo 1114 * make sure we exit gracefully rather than endlessly redo
1103 * the fault: 1115 * the fault:
1104 */ 1116 */
1105 fault = handle_mm_fault(mm, vma, address, write); 1117 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
1106 1118
1107 if (unlikely(fault & VM_FAULT_ERROR)) { 1119 if (unlikely(fault & VM_FAULT_ERROR)) {
1108 mm_fault_error(regs, error_code, address, fault); 1120 mm_fault_error(regs, error_code, address, fault);
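
Several of the tests above key off raw bits of the hardware page-fault error code: (error_code & 9), the PF_RSVD | PF_USER | PF_PROT mask, and kmemcheck_fault()'s error_code & 2 further down. As a reading aid, here is a small stand-alone decoder for those bits; the values match the PF_* flags used in fault.c, and decode() itself is only an illustration.

#include <stdio.h>

/* x86 page-fault error code bits (same values as the PF_* flags). */
#define PF_PROT		(1 << 0)	/* 0: page not present  1: protection violation */
#define PF_WRITE	(1 << 1)	/* 0: read              1: write */
#define PF_USER		(1 << 2)	/* 0: kernel mode       1: user mode */
#define PF_RSVD		(1 << 3)	/* reserved PTE bit was set */
#define PF_INSTR	(1 << 4)	/* instruction fetch (NX) */

static void decode(unsigned long error_code)
{
	printf("%#lx: %s %s from %s mode%s%s\n", error_code,
	       error_code & PF_WRITE ? "write" : "read",
	       error_code & PF_PROT ? "hit a protection fault" : "hit a non-present page",
	       error_code & PF_USER ? "user" : "kernel",
	       error_code & PF_RSVD ? ", reserved bit set" : "",
	       error_code & PF_INSTR ? ", instruction fetch" : "");
}

int main(void)
{
	decode(0x2);	/* kernel write, page not present: kmemcheck's hidden pages */
	decode(0x7);	/* user write protection fault, e.g. copy-on-write */
	return 0;
}
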
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798a..71da1bca13cb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -14,7 +14,7 @@
14static inline pte_t gup_get_pte(pte_t *ptep) 14static inline pte_t gup_get_pte(pte_t *ptep)
15{ 15{
16#ifndef CONFIG_X86_PAE 16#ifndef CONFIG_X86_PAE
17 return *ptep; 17 return ACCESS_ONCE(*ptep);
18#else 18#else
19 /* 19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking 20 * With get_user_pages_fast, we walk down the pagetables without taking
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
219 return 1; 219 return 1;
220} 220}
221 221
222/*
223 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
224 * back to the regular GUP.
225 */
226int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
227 struct page **pages)
228{
229 struct mm_struct *mm = current->mm;
230 unsigned long addr, len, end;
231 unsigned long next;
232 unsigned long flags;
233 pgd_t *pgdp;
234 int nr = 0;
235
236 start &= PAGE_MASK;
237 addr = start;
238 len = (unsigned long) nr_pages << PAGE_SHIFT;
239 end = start + len;
240 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
241 (void __user *)start, len)))
242 return 0;
243
244 /*
245 * XXX: batch / limit 'nr', to avoid large irq off latency
246 * needs some instrumenting to determine the common sizes used by
247 * important workloads (eg. DB2), and whether limiting the batch size
248 * will decrease performance.
249 *
250 * It seems like we're in the clear for the moment. Direct-IO is
251 * the main guy that batches up lots of get_user_pages, and even
252 * they are limited to 64-at-a-time which is not so many.
253 */
254 /*
255 * This doesn't prevent pagetable teardown, but does prevent
256 * the pagetables and pages from being freed on x86.
257 *
258 * So long as we atomically load page table pointers versus teardown
259 * (which we do on x86, with the above PAE exception), we can follow the
260 * address down to the the page and take a ref on it.
261 */
262 local_irq_save(flags);
263 pgdp = pgd_offset(mm, addr);
264 do {
265 pgd_t pgd = *pgdp;
266
267 next = pgd_addr_end(addr, end);
268 if (pgd_none(pgd))
269 break;
270 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
271 break;
272 } while (pgdp++, addr = next, addr != end);
273 local_irq_restore(flags);
274
275 return nr;
276}
277
222/** 278/**
223 * get_user_pages_fast() - pin user pages in memory 279 * get_user_pages_fast() - pin user pages in memory
224 * @start: starting user address 280 * @start: starting user address
@@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
247 start &= PAGE_MASK; 303 start &= PAGE_MASK;
248 addr = start; 304 addr = start;
249 len = (unsigned long) nr_pages << PAGE_SHIFT; 305 len = (unsigned long) nr_pages << PAGE_SHIFT;
306
250 end = start + len; 307 end = start + len;
251 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, 308 if (end < start)
252 (void __user *)start, len)))
253 goto slow_irqon; 309 goto slow_irqon;
254 310
311#ifdef CONFIG_X86_64
312 if (end >> __VIRTUAL_MASK_SHIFT)
313 goto slow_irqon;
314#endif
315
255 /* 316 /*
256 * XXX: batch / limit 'nr', to avoid large irq off latency 317 * XXX: batch / limit 'nr', to avoid large irq off latency
257 * needs some instrumenting to determine the common sizes used by 318 * needs some instrumenting to determine the common sizes used by
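
__get_user_pages_fast() only does the IRQs-off lockless walk and reports how many pages it managed to pin; unlike get_user_pages_fast() it never falls back to the slow path, so it can be used from contexts that cannot take mmap_sem or sleep (the perf counter callchain code is the kind of caller this targets, though that caller is not part of this diff). A hedged usage sketch; try_pin_user_page() is a made-up wrapper.

#include <linux/mm.h>

/* Made-up helper: try to pin a single user page without sleeping and
 * without taking mmap_sem. Returns NULL if the page is not mapped right
 * now rather than faulting it in. */
static struct page *try_pin_user_page(unsigned long addr, int write)
{
	struct page *page;

	if (__get_user_pages_fast(addr, 1, write, &page) != 1)
		return NULL;

	return page;	/* the caller must release it with put_page(page) */
}
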
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot);
106 107
107void __init set_highmem_pages_init(void) 108void __init set_highmem_pages_init(void)
108{ 109{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..0607119cef94 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h> 14#include <asm/tlb.h>
15#include <asm/proto.h>
15 16
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 17DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17 18
@@ -177,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
177 return nr_range; 178 return nr_range;
178} 179}
179 180
180#ifdef CONFIG_X86_64
181static void __init init_gbpages(void)
182{
183 if (direct_gbpages && cpu_has_gbpages)
184 printk(KERN_INFO "Using GB pages for direct mapping\n");
185 else
186 direct_gbpages = 0;
187}
188#else
189static inline void init_gbpages(void)
190{
191}
192#endif
193
194/* 181/*
195 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 182 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
196 * This runs before bootmem is initialized and gets pages directly from 183 * This runs before bootmem is initialized and gets pages directly from
@@ -210,10 +197,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
210 197
211 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); 198 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
212 199
213 if (!after_bootmem) 200#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
214 init_gbpages();
215
216#ifdef CONFIG_DEBUG_PAGEALLOC
217 /* 201 /*
218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 202 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
219 * This will simplify cpa(), which otherwise needs to support splitting 203 * This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ff3c0816d15..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
111 pte_t *page_table = NULL; 111 pte_t *page_table = NULL;
112 112
113 if (after_bootmem) { 113 if (after_bootmem) {
114#ifdef CONFIG_DEBUG_PAGEALLOC 114#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
116#endif 116#endif
117 if (!page_table) 117 if (!page_table)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..6176fe8f29e0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void)
104 void *ptr; 104 void *ptr;
105 105
106 if (after_bootmem) 106 if (after_bootmem)
107 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 107 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
108 else 108 else
109 ptr = alloc_bootmem_pages(PAGE_SIZE); 109 ptr = alloc_bootmem_pages(PAGE_SIZE);
110 110
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
281 void *adr; 281 void *adr;
282 282
283 if (after_bootmem) { 283 if (after_bootmem) {
284 adr = (void *)get_zeroed_page(GFP_ATOMIC); 284 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
285 *phys = __pa(adr); 285 *phys = __pa(adr);
286 286
287 return adr; 287 return adr;
@@ -527,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
527 return phys_pud_init(pud, addr, end, page_size_mask); 527 return phys_pud_init(pud, addr, end, page_size_mask);
528} 528}
529 529
530unsigned long __init 530unsigned long __meminit
531kernel_physical_mapping_init(unsigned long start, 531kernel_physical_mapping_init(unsigned long start,
532 unsigned long end, 532 unsigned long end,
533 unsigned long page_size_mask) 533 unsigned long page_size_mask)
@@ -598,6 +598,15 @@ void __init paging_init(void)
598 598
599 sparse_memory_present_with_active_regions(MAX_NUMNODES); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
600 sparse_init(); 600 sparse_init();
601
602 /*
603 * clear the default setting with node 0
604 * note: don't use nodes_clear here, that is really clearing when
605 * numa support is not compiled in, and later node_set_state
606 * will not set it back.
607 */
608 node_clear_state(0, N_NORMAL_MEMORY);
609
601 free_area_init_nodes(max_zone_pfns); 610 free_area_init_nodes(max_zone_pfns);
602} 611}
603 612
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
1obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
1#include <linux/interrupt.h>
2#include <linux/kdebug.h>
3#include <linux/kmemcheck.h>
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/ptrace.h>
7#include <linux/stacktrace.h>
8#include <linux/string.h>
9
10#include "error.h"
11#include "shadow.h"
12
13enum kmemcheck_error_type {
14 KMEMCHECK_ERROR_INVALID_ACCESS,
15 KMEMCHECK_ERROR_BUG,
16};
17
18#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
19
20struct kmemcheck_error {
21 enum kmemcheck_error_type type;
22
23 union {
24 /* KMEMCHECK_ERROR_INVALID_ACCESS */
25 struct {
26 /* Kind of access that caused the error */
27 enum kmemcheck_shadow state;
28 /* Address and size of the erroneous read */
29 unsigned long address;
30 unsigned int size;
31 };
32 };
33
34 struct pt_regs regs;
35 struct stack_trace trace;
36 unsigned long trace_entries[32];
37
38 /* We compress it to a char. */
39 unsigned char shadow_copy[SHADOW_COPY_SIZE];
40 unsigned char memory_copy[SHADOW_COPY_SIZE];
41};
42
43/*
44 * Create a ring queue of errors to output. We can't call printk() directly
45 * from the kmemcheck traps, since this may call the console drivers and
46 * result in a recursive fault.
47 */
48static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
49static unsigned int error_count;
50static unsigned int error_rd;
51static unsigned int error_wr;
52static unsigned int error_missed_count;
53
54static struct kmemcheck_error *error_next_wr(void)
55{
56 struct kmemcheck_error *e;
57
58 if (error_count == ARRAY_SIZE(error_fifo)) {
59 ++error_missed_count;
60 return NULL;
61 }
62
63 e = &error_fifo[error_wr];
64 if (++error_wr == ARRAY_SIZE(error_fifo))
65 error_wr = 0;
66 ++error_count;
67 return e;
68}
69
70static struct kmemcheck_error *error_next_rd(void)
71{
72 struct kmemcheck_error *e;
73
74 if (error_count == 0)
75 return NULL;
76
77 e = &error_fifo[error_rd];
78 if (++error_rd == ARRAY_SIZE(error_fifo))
79 error_rd = 0;
80 --error_count;
81 return e;
82}
83
84void kmemcheck_error_recall(void)
85{
86 static const char *desc[] = {
87 [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
88 [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
89 [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
90 [KMEMCHECK_SHADOW_FREED] = "freed",
91 };
92
93 static const char short_desc[] = {
94 [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
95 [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
96 [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
97 [KMEMCHECK_SHADOW_FREED] = 'f',
98 };
99
100 struct kmemcheck_error *e;
101 unsigned int i;
102
103 e = error_next_rd();
104 if (!e)
105 return;
106
107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address);
114
115 printk(KERN_INFO);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]);
118 printk("\n");
119
120 printk(KERN_INFO);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]);
124 else
125 printk(" ?");
126 }
127 printk("\n");
128 printk(KERN_INFO "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break;
131 case KMEMCHECK_ERROR_BUG:
132 printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
133 break;
134 }
135
136 __show_regs(&e->regs, 1);
137 print_stack_trace(&e->trace, 0);
138}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
154/*
155 * Save the context of an error report.
156 */
157void kmemcheck_error_save(enum kmemcheck_shadow state,
158 unsigned long address, unsigned int size, struct pt_regs *regs)
159{
160 static unsigned long prev_ip;
161
162 struct kmemcheck_error *e;
163 void *shadow_copy;
164 void *memory_copy;
165
166 /* Don't report several adjacent errors from the same EIP. */
167 if (regs->ip == prev_ip)
168 return;
169 prev_ip = regs->ip;
170
171 e = error_next_wr();
172 if (!e)
173 return;
174
175 e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
176
177 e->state = state;
178 e->address = address;
179 e->size = size;
180
181 /* Save regs */
182 memcpy(&e->regs, regs, sizeof(*regs));
183
184 /* Save stack trace */
185 e->trace.nr_entries = 0;
186 e->trace.entries = e->trace_entries;
187 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
188 e->trace.skip = 0;
189 save_stack_trace_bp(&e->trace, regs->bp);
190
191 /* Round address down to nearest 16 bytes */
192 shadow_copy = kmemcheck_shadow_lookup(address
193 & ~(SHADOW_COPY_SIZE - 1));
194 BUG_ON(!shadow_copy);
195
196 memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
197
198 kmemcheck_show_addr(address);
199 memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
200 memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
201 kmemcheck_hide_addr(address);
202
203 tasklet_hi_schedule_first(&kmemcheck_tasklet);
204}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
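
Because the fault path cannot printk() safely, reports are queued in the fixed error_fifo[] array and drained later from the tasklet; the kernel version gets away without locking because kmemcheck restricts the machine to a single CPU and fills the queue with interrupts off. The same ring-buffer bookkeeping, reduced to a stand-alone single-threaded demo (names and element type are arbitrary):

#include <stdio.h>

#define QUEUE_SIZE 4

static int fifo[QUEUE_SIZE];
static unsigned int count, rd, wr, missed;

/* Mirrors error_next_wr(): drop (and count) the report if the queue is full. */
static int enqueue(int val)
{
	if (count == QUEUE_SIZE) {
		missed++;
		return 0;
	}
	fifo[wr] = val;
	if (++wr == QUEUE_SIZE)
		wr = 0;
	count++;
	return 1;
}

/* Mirrors error_next_rd(). */
static int dequeue(int *val)
{
	if (count == 0)
		return 0;
	*val = fifo[rd];
	if (++rd == QUEUE_SIZE)
		rd = 0;
	count--;
	return 1;
}

int main(void)
{
	int i, v;

	for (i = 0; i < 6; i++)		/* the last two are dropped */
		enqueue(i);
	while (dequeue(&v))
		printf("%d ", v);	/* 0 1 2 3 */
	printf("\nmissed=%u\n", missed);	/* missed=2 */
	return 0;
}
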
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
2#define ARCH__X86__MM__KMEMCHECK__ERROR_H
3
4#include <linux/ptrace.h>
5
6#include "shadow.h"
7
8void kmemcheck_error_save(enum kmemcheck_shadow state,
9 unsigned long address, unsigned int size, struct pt_regs *regs);
10
11void kmemcheck_error_save_bug(struct pt_regs *regs);
12
13void kmemcheck_error_recall(void);
14
15#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/page-flags.h>
19#include <linux/percpu.h>
20#include <linux/ptrace.h>
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <asm/cacheflush.h>
25#include <asm/kmemcheck.h>
26#include <asm/pgtable.h>
27#include <asm/tlbflush.h>
28
29#include "error.h"
30#include "opcode.h"
31#include "pte.h"
32#include "selftest.h"
33#include "shadow.h"
34
35
36#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
37# define KMEMCHECK_ENABLED 0
38#endif
39
40#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
41# define KMEMCHECK_ENABLED 1
42#endif
43
44#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
45# define KMEMCHECK_ENABLED 2
46#endif
47
48int kmemcheck_enabled = KMEMCHECK_ENABLED;
49
50int __init kmemcheck_init(void)
51{
52#ifdef CONFIG_SMP
53 /*
54 * Limit SMP to use a single CPU. We rely on the fact that this code
55 * runs before SMP is set up.
56 */
57 if (setup_max_cpus > 1) {
58 printk(KERN_INFO
59 "kmemcheck: Limiting number of CPUs to 1.\n");
60 setup_max_cpus = 1;
61 }
62#endif
63
64 if (!kmemcheck_selftest()) {
65 printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
66 kmemcheck_enabled = 0;
67 return -EINVAL;
68 }
69
70 printk(KERN_INFO "kmemcheck: Initialized\n");
71 return 0;
72}
73
74early_initcall(kmemcheck_init);
75
76/*
77 * We need to parse the kmemcheck= option before any memory is allocated.
78 */
79static int __init param_kmemcheck(char *str)
80{
81 if (!str)
82 return -EINVAL;
83
84 sscanf(str, "%d", &kmemcheck_enabled);
85 return 0;
86}
87
88early_param("kmemcheck", param_kmemcheck);
89
90int kmemcheck_show_addr(unsigned long address)
91{
92 pte_t *pte;
93
94 pte = kmemcheck_pte_lookup(address);
95 if (!pte)
96 return 0;
97
98 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
99 __flush_tlb_one(address);
100 return 1;
101}
102
103int kmemcheck_hide_addr(unsigned long address)
104{
105 pte_t *pte;
106
107 pte = kmemcheck_pte_lookup(address);
108 if (!pte)
109 return 0;
110
111 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
112 __flush_tlb_one(address);
113 return 1;
114}
115
116struct kmemcheck_context {
117 bool busy;
118 int balance;
119
120 /*
121 * There can be at most two memory operands to an instruction, but
122 * each address can cross a page boundary -- so we may need up to
123 * four addresses that must be hidden/revealed for each fault.
124 */
125 unsigned long addr[4];
126 unsigned long n_addrs;
127 unsigned long flags;
128
129 /* Data size of the instruction that caused a fault. */
130 unsigned int size;
131};
132
133static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
134
135bool kmemcheck_active(struct pt_regs *regs)
136{
137 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
138
139 return data->balance > 0;
140}
141
142/* Save an address that needs to be shown/hidden */
143static void kmemcheck_save_addr(unsigned long addr)
144{
145 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
146
147 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
148 data->addr[data->n_addrs++] = addr;
149}
150
151static unsigned int kmemcheck_show_all(void)
152{
153 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
154 unsigned int i;
155 unsigned int n;
156
157 n = 0;
158 for (i = 0; i < data->n_addrs; ++i)
159 n += kmemcheck_show_addr(data->addr[i]);
160
161 return n;
162}
163
164static unsigned int kmemcheck_hide_all(void)
165{
166 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
167 unsigned int i;
168 unsigned int n;
169
170 n = 0;
171 for (i = 0; i < data->n_addrs; ++i)
172 n += kmemcheck_hide_addr(data->addr[i]);
173
174 return n;
175}
176
177/*
178 * Called from the #PF handler.
179 */
180void kmemcheck_show(struct pt_regs *regs)
181{
182 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
183
184 BUG_ON(!irqs_disabled());
185
186 if (unlikely(data->balance != 0)) {
187 kmemcheck_show_all();
188 kmemcheck_error_save_bug(regs);
189 data->balance = 0;
190 return;
191 }
192
193 /*
194 * None of the addresses actually belonged to kmemcheck. Note that
195 * this is not an error.
196 */
197 if (kmemcheck_show_all() == 0)
198 return;
199
200 ++data->balance;
201
202 /*
203 * The IF needs to be cleared as well, so that the faulting
204 * instruction can run "uninterrupted". Otherwise, we might take
205 * an interrupt and start executing that before we've had a chance
206 * to hide the page again.
207 *
208 * NOTE: In the rare case of multiple faults, we must not override
209 * the original flags:
210 */
211 if (!(regs->flags & X86_EFLAGS_TF))
212 data->flags = regs->flags;
213
214 regs->flags |= X86_EFLAGS_TF;
215 regs->flags &= ~X86_EFLAGS_IF;
216}
217
218/*
219 * Called from the #DB handler.
220 */
221void kmemcheck_hide(struct pt_regs *regs)
222{
223 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
224 int n;
225
226 BUG_ON(!irqs_disabled());
227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs);
234 data->n_addrs = 0;
235 data->balance = 0;
236
237 if (!(data->flags & X86_EFLAGS_TF))
238 regs->flags &= ~X86_EFLAGS_TF;
239 if (data->flags & X86_EFLAGS_IF)
240 regs->flags |= X86_EFLAGS_IF;
241 return;
242 }
243
244 if (kmemcheck_enabled)
245 n = kmemcheck_hide_all();
246 else
247 n = kmemcheck_show_all();
248
249 if (n == 0)
250 return;
251
252 --data->balance;
253
254 data->n_addrs = 0;
255
256 if (!(data->flags & X86_EFLAGS_TF))
257 regs->flags &= ~X86_EFLAGS_TF;
258 if (data->flags & X86_EFLAGS_IF)
259 regs->flags |= X86_EFLAGS_IF;
260}
261
262void kmemcheck_show_pages(struct page *p, unsigned int n)
263{
264 unsigned int i;
265
266 for (i = 0; i < n; ++i) {
267 unsigned long address;
268 pte_t *pte;
269 unsigned int level;
270
271 address = (unsigned long) page_address(&p[i]);
272 pte = lookup_address(address, &level);
273 BUG_ON(!pte);
274 BUG_ON(level != PG_LEVEL_4K);
275
276 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
277 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
278 __flush_tlb_one(address);
279 }
280}
281
282bool kmemcheck_page_is_tracked(struct page *p)
283{
284 /* This will also check the "hidden" flag of the PTE. */
285 return kmemcheck_pte_lookup((unsigned long) page_address(p));
286}
287
288void kmemcheck_hide_pages(struct page *p, unsigned int n)
289{
290 unsigned int i;
291
292 for (i = 0; i < n; ++i) {
293 unsigned long address;
294 pte_t *pte;
295 unsigned int level;
296
297 address = (unsigned long) page_address(&p[i]);
298 pte = lookup_address(address, &level);
299 BUG_ON(!pte);
300 BUG_ON(level != PG_LEVEL_4K);
301
302 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
303 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
304 __flush_tlb_one(address);
305 }
306}
307
308/* Access may NOT cross page boundary */
309static void kmemcheck_read_strict(struct pt_regs *regs,
310 unsigned long addr, unsigned int size)
311{
312 void *shadow;
313 enum kmemcheck_shadow status;
314
315 shadow = kmemcheck_shadow_lookup(addr);
316 if (!shadow)
317 return;
318
319 kmemcheck_save_addr(addr);
320 status = kmemcheck_shadow_test(shadow, size);
321 if (status == KMEMCHECK_SHADOW_INITIALIZED)
322 return;
323
324 if (kmemcheck_enabled)
325 kmemcheck_error_save(status, addr, size, regs);
326
327 if (kmemcheck_enabled == 2)
328 kmemcheck_enabled = 0;
329
330 /* Don't warn about it again. */
331 kmemcheck_shadow_set(shadow, size);
332}
333
334/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size)
337{
338 unsigned long page = addr & PAGE_MASK;
339 unsigned long next_addr = addr + size - 1;
340 unsigned long next_page = next_addr & PAGE_MASK;
341
342 if (likely(page == next_page)) {
343 kmemcheck_read_strict(regs, addr, size);
344 return;
345 }
346
347 /*
348 * What we do is basically to split the access across the
349 * two pages and handle each part separately. Yes, this means
350 * that we may now see reads that are 3 + 5 bytes, for
351 * example (and if both are uninitialized, there will be two
352 * reports), but it makes the code a lot simpler.
353 */
354 kmemcheck_read_strict(regs, addr, next_page - addr);
355 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
356}
357
358static void kmemcheck_write_strict(struct pt_regs *regs,
359 unsigned long addr, unsigned int size)
360{
361 void *shadow;
362
363 shadow = kmemcheck_shadow_lookup(addr);
364 if (!shadow)
365 return;
366
367 kmemcheck_save_addr(addr);
368 kmemcheck_shadow_set(shadow, size);
369}
370
371static void kmemcheck_write(struct pt_regs *regs,
372 unsigned long addr, unsigned int size)
373{
374 unsigned long page = addr & PAGE_MASK;
375 unsigned long next_addr = addr + size - 1;
376 unsigned long next_page = next_addr & PAGE_MASK;
377
378 if (likely(page == next_page)) {
379 kmemcheck_write_strict(regs, addr, size);
380 return;
381 }
382
383 /* See comment in kmemcheck_read(). */
384 kmemcheck_write_strict(regs, addr, next_page - addr);
385 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
386}
387
388/*
389 * Copying is hard. We have two addresses, each of which may be split across
390 * a page (and each page will have different shadow addresses).
391 */
392static void kmemcheck_copy(struct pt_regs *regs,
393 unsigned long src_addr, unsigned long dst_addr, unsigned int size)
394{
395 uint8_t shadow[8];
396 enum kmemcheck_shadow status;
397
398 unsigned long page;
399 unsigned long next_addr;
400 unsigned long next_page;
401
402 uint8_t *x;
403 unsigned int i;
404 unsigned int n;
405
406 BUG_ON(size > sizeof(shadow));
407
408 page = src_addr & PAGE_MASK;
409 next_addr = src_addr + size - 1;
410 next_page = next_addr & PAGE_MASK;
411
412 if (likely(page == next_page)) {
413 /* Same page */
414 x = kmemcheck_shadow_lookup(src_addr);
415 if (x) {
416 kmemcheck_save_addr(src_addr);
417 for (i = 0; i < size; ++i)
418 shadow[i] = x[i];
419 } else {
420 for (i = 0; i < size; ++i)
421 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
422 }
423 } else {
424 n = next_page - src_addr;
425 BUG_ON(n > sizeof(shadow));
426
427 /* First page */
428 x = kmemcheck_shadow_lookup(src_addr);
429 if (x) {
430 kmemcheck_save_addr(src_addr);
431 for (i = 0; i < n; ++i)
432 shadow[i] = x[i];
433 } else {
434 /* Not tracked */
435 for (i = 0; i < n; ++i)
436 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
437 }
438
439 /* Second page */
440 x = kmemcheck_shadow_lookup(next_page);
441 if (x) {
442 kmemcheck_save_addr(next_page);
443 for (i = n; i < size; ++i)
444 shadow[i] = x[i - n];
445 } else {
446 /* Not tracked */
447 for (i = n; i < size; ++i)
448 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
449 }
450 }
451
452 page = dst_addr & PAGE_MASK;
453 next_addr = dst_addr + size - 1;
454 next_page = next_addr & PAGE_MASK;
455
456 if (likely(page == next_page)) {
457 /* Same page */
458 x = kmemcheck_shadow_lookup(dst_addr);
459 if (x) {
460 kmemcheck_save_addr(dst_addr);
461 for (i = 0; i < size; ++i) {
462 x[i] = shadow[i];
463 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
464 }
465 }
466 } else {
467 n = next_page - dst_addr;
468 BUG_ON(n > sizeof(shadow));
469
470 /* First page */
471 x = kmemcheck_shadow_lookup(dst_addr);
472 if (x) {
473 kmemcheck_save_addr(dst_addr);
474 for (i = 0; i < n; ++i) {
475 x[i] = shadow[i];
476 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
477 }
478 }
479
480 /* Second page */
481 x = kmemcheck_shadow_lookup(next_page);
482 if (x) {
483 kmemcheck_save_addr(next_page);
484 for (i = n; i < size; ++i) {
485 x[i - n] = shadow[i];
486 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
487 }
488 }
489 }
490
491 status = kmemcheck_shadow_test(shadow, size);
492 if (status == KMEMCHECK_SHADOW_INITIALIZED)
493 return;
494
495 if (kmemcheck_enabled)
496 kmemcheck_error_save(status, src_addr, size, regs);
497
498 if (kmemcheck_enabled == 2)
499 kmemcheck_enabled = 0;
500}
501
502enum kmemcheck_method {
503 KMEMCHECK_READ,
504 KMEMCHECK_WRITE,
505};
506
507static void kmemcheck_access(struct pt_regs *regs,
508 unsigned long fallback_address, enum kmemcheck_method fallback_method)
509{
510 const uint8_t *insn;
511 const uint8_t *insn_primary;
512 unsigned int size;
513
514 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
515
516 /* Recursive fault -- ouch. */
517 if (data->busy) {
518 kmemcheck_show_addr(fallback_address);
519 kmemcheck_error_save_bug(regs);
520 return;
521 }
522
523 data->busy = true;
524
525 insn = (const uint8_t *) regs->ip;
526 insn_primary = kmemcheck_opcode_get_primary(insn);
527
528 kmemcheck_opcode_decode(insn, &size);
529
530 switch (insn_primary[0]) {
531#ifdef CONFIG_KMEMCHECK_BITOPS_OK
532 /* AND, OR, XOR */
533 /*
534 * Unfortunately, these instructions have to be excluded from
535 * our regular checking since they access only some (and not
536 * all) bits. This clears out "bogus" bitfield-access warnings.
537 */
538 case 0x80:
539 case 0x81:
540 case 0x82:
541 case 0x83:
542 switch ((insn_primary[1] >> 3) & 7) {
543 /* OR */
544 case 1:
545 /* AND */
546 case 4:
547 /* XOR */
548 case 6:
549 kmemcheck_write(regs, fallback_address, size);
550 goto out;
551
552 /* ADD */
553 case 0:
554 /* ADC */
555 case 2:
556 /* SBB */
557 case 3:
558 /* SUB */
559 case 5:
560 /* CMP */
561 case 7:
562 break;
563 }
564 break;
565#endif
566
567 /* MOVS, MOVSB, MOVSW, MOVSD */
568 case 0xa4:
569 case 0xa5:
570 /*
571 * These instructions are special because they take two
572 * addresses, but we only get one page fault.
573 */
574 kmemcheck_copy(regs, regs->si, regs->di, size);
575 goto out;
576
577 /* CMPS, CMPSB, CMPSW, CMPSD */
578 case 0xa6:
579 case 0xa7:
580 kmemcheck_read(regs, regs->si, size);
581 kmemcheck_read(regs, regs->di, size);
582 goto out;
583 }
584
585 /*
586 * If the opcode isn't special in any way, we use the data from the
587 * page fault handler to determine the address and type of memory
588 * access.
589 */
590 switch (fallback_method) {
591 case KMEMCHECK_READ:
592 kmemcheck_read(regs, fallback_address, size);
593 goto out;
594 case KMEMCHECK_WRITE:
595 kmemcheck_write(regs, fallback_address, size);
596 goto out;
597 }
598
599out:
600 data->busy = false;
601}
602
603bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
604 unsigned long error_code)
605{
606 pte_t *pte;
607
608 /*
609 * XXX: Is it safe to assume that memory accesses from virtual 86
610 * mode or non-kernel code segments will _never_ access kernel
611 * memory (e.g. tracked pages)? For now, we need this to avoid
612 * invoking kmemcheck for PnP BIOS calls.
613 */
614 if (regs->flags & X86_VM_MASK)
615 return false;
616 if (regs->cs != __KERNEL_CS)
617 return false;
618
619 pte = kmemcheck_pte_lookup(address);
620 if (!pte)
621 return false;
622
623 if (error_code & 2)
624 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
625 else
626 kmemcheck_access(regs, address, KMEMCHECK_READ);
627
628 kmemcheck_show(regs);
629 return true;
630}
631
632bool kmemcheck_trap(struct pt_regs *regs)
633{
634 if (!kmemcheck_active(regs))
635 return false;
636
637 /* We're done. */
638 kmemcheck_hide(regs);
639 return true;
640}
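
The page-straddling logic in kmemcheck_read()/kmemcheck_write()/kmemcheck_copy() is easier to see as plain arithmetic. This is the "3 + 5 bytes" split that the comment in kmemcheck_read() mentions, pulled out into a tiny stand-alone program; the addresses are made up.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x1ffd;	/* 3 bytes short of a page boundary */
	unsigned int size  = 8;		/* an 8-byte access */

	unsigned long page      = addr & PAGE_MASK;
	unsigned long next_addr = addr + size - 1;	/* last byte touched */
	unsigned long next_page = next_addr & PAGE_MASK;

	if (page == next_page) {
		printf("one access: %#lx, %u bytes\n", addr, size);
	} else {
		/* Handle each page separately, as the helpers do. */
		printf("split: %#lx (%lu bytes) + %#lx (%lu bytes)\n",
		       addr, next_page - addr,
		       next_page, addr + size - next_page);
	}
	return 0;
}
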
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
1#include <linux/types.h>
2
3#include "opcode.h"
4
5static bool opcode_is_prefix(uint8_t b)
6{
7 return
8 /* Group 1 */
9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
13 /* Group 3 */
14 || b == 0x66
15 /* Group 4 */
16 || b == 0x67;
17}
18
19#ifdef CONFIG_X86_64
20static bool opcode_is_rex_prefix(uint8_t b)
21{
22 return (b & 0xf0) == 0x40;
23}
24#else
25static bool opcode_is_rex_prefix(uint8_t b)
26{
27 return false;
28}
29#endif
30
31#define REX_W (1 << 3)
32
33/*
34 * This is a VERY crude opcode decoder. We only need to find the size of the
35 * load/store that caused our #PF and this should work for all the opcodes
36 * that we care about. Moreover, the ones who invented this instruction set
37 * should be shot.
38 */
39void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
40{
41 /* Default operand size */
42 int operand_size_override = 4;
43
44 /* prefixes */
45 for (; opcode_is_prefix(*op); ++op) {
46 if (*op == 0x66)
47 operand_size_override = 2;
48 }
49
50 /* REX prefix */
51 if (opcode_is_rex_prefix(*op)) {
52 uint8_t rex = *op;
53
54 ++op;
55 if (rex & REX_W) {
56 switch (*op) {
57 case 0x63:
58 *size = 4;
59 return;
60 case 0x0f:
61 ++op;
62
63 switch (*op) {
64 case 0xb6:
65 case 0xbe:
66 *size = 1;
67 return;
68 case 0xb7:
69 case 0xbf:
70 *size = 2;
71 return;
72 }
73
74 break;
75 }
76
77 *size = 8;
78 return;
79 }
80 }
81
82 /* escape opcode */
83 if (*op == 0x0f) {
84 ++op;
85
86 /*
87 * This is move with zero-extend and sign-extend, respectively;
88 * we don't have to think about 0xb6/0xbe, because this is
89 * already handled in the conditional below.
90 */
91 if (*op == 0xb7 || *op == 0xbf)
92 operand_size_override = 2;
93 }
94
95 *size = (*op & 1) ? operand_size_override : 1;
96}
97
98const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
99{
100 /* skip prefixes */
101 while (opcode_is_prefix(*op))
102 ++op;
103 if (opcode_is_rex_prefix(*op))
104 ++op;
105 return op;
106}
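
A sketch of how the decoder is driven (selftest.c below exercises it over a whole table of opcodes): hand it the bytes of the faulting instruction and get back the width of the memory operand. Compiled together with opcode.c (with <linux/types.h> swapped for a user-space equivalent) this would print 2; the instruction bytes and the disassembly in the comment are just an example.

#include <stdint.h>
#include <stdio.h>

/* Prototype from opcode.h. */
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);

int main(void)
{
	/* 66 89 48 f8 = mov %cx,-0x8(%eax): the 0x66 prefix turns the
	 * otherwise full-width store (opcode 0x89, low bit set) into a
	 * 16-bit one. */
	static const uint8_t insn[] = { 0x66, 0x89, 0x48, 0xf8 };
	unsigned int size;

	kmemcheck_opcode_decode(insn, &size);
	printf("memory operand: %u bytes\n", size);	/* 2 */
	return 0;
}
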
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
2#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
3
4#include <linux/types.h>
5
6void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
7const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
8
9#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
1#include <linux/mm.h>
2
3#include <asm/pgtable.h>
4
5#include "pte.h"
6
7pte_t *kmemcheck_pte_lookup(unsigned long address)
8{
9 pte_t *pte;
10 unsigned int level;
11
12 pte = lookup_address(address, &level);
13 if (!pte)
14 return NULL;
15 if (level != PG_LEVEL_4K)
16 return NULL;
17 if (!pte_hidden(*pte))
18 return NULL;
19
20 return pte;
21}
22
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
2#define ARCH__X86__MM__KMEMCHECK__PTE_H
3
4#include <linux/mm.h>
5
6#include <asm/pgtable.h>
7
8pte_t *kmemcheck_pte_lookup(unsigned long address);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
1#include <linux/kernel.h>
2
3#include "opcode.h"
4#include "selftest.h"
5
6struct selftest_opcode {
7 unsigned int expected_size;
8 const uint8_t *insn;
9 const char *desc;
10};
11
12static const struct selftest_opcode selftest_opcodes[] = {
13 /* REP MOVS */
14 {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
15 {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
16
17 /* MOVZX / MOVZXD */
18 {1, "\x66\x0f\xb6\x51\xf8", "movzbw <mem8>, <reg16>"},
19 {1, "\x0f\xb6\x51\xf8", "movzbl <mem8>, <reg32>"},
20
21 /* MOVSX / MOVSXD */
22 {1, "\x66\x0f\xbe\x51\xf8", "movsbw <mem8>, <reg16>"},
23 {1, "\x0f\xbe\x51\xf8", "movsbl <mem8>, <reg32>"},
24
25#ifdef CONFIG_X86_64
26 /* MOVZX / MOVZXD */
27 {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
28 {2, "\x49\x0f\xb7\x51\xf8", "movzwq <mem16>, <reg64>"},
29
30 /* MOVSX / MOVSXD */
31 {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
32 {2, "\x49\x0f\xbf\x51\xf8", "movswq <mem16>, <reg64>"},
33 {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
34#endif
35};
36
37static bool selftest_opcode_one(const struct selftest_opcode *op)
38{
39 unsigned size;
40
41 kmemcheck_opcode_decode(op->insn, &size);
42
43 if (size == op->expected_size)
44 return true;
45
46 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
47 op->desc, op->expected_size, size);
48 return false;
49}
50
51static bool selftest_opcodes_all(void)
52{
53 bool pass = true;
54 unsigned int i;
55
56 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
57 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
58
59 return pass;
60}
61
62bool kmemcheck_selftest(void)
63{
64 bool pass = true;
65
66 pass = pass && selftest_opcodes_all();
67
68 return pass;
69}
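
One property of the accumulation style used in selftest_opcodes_all() and kmemcheck_selftest() above is that && short-circuits: once pass has gone false, the remaining selftest_opcode_one() calls are skipped entirely, so at most one failing opcode is reported per run. A standalone illustration of the difference between the two accumulation styles (check() is a made-up stand-in, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

static bool check(int i)
{
        printf("running check %d\n", i);
        return i != 1;          /* pretend check 1 fails */
}

int main(void)
{
        bool pass = true;
        int i;

        /* Short-circuit accumulation: checks after the first failure are skipped. */
        for (i = 0; i < 3; ++i)
                pass = pass && check(i);
        printf("short-circuit result: %d\n\n", pass);

        /* Non-short-circuit accumulation: every check still runs and reports. */
        pass = true;
        for (i = 0; i < 3; ++i)
                if (!check(i))
                        pass = false;
        printf("accumulated result: %d\n", pass);
        return 0;
}
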
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
1#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
2#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3
4bool kmemcheck_selftest(void);
5
6#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
1#include <linux/kmemcheck.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4#include <linux/module.h>
5
6#include <asm/page.h>
7#include <asm/pgtable.h>
8
9#include "pte.h"
10#include "shadow.h"
11
12/*
13 * Return the shadow address for the given address. Returns NULL if the
14 * address is not tracked.
15 *
16 * We need to be extremely careful not to follow any invalid pointers,
17 * because this function can be called for *any* possible address.
18 */
19void *kmemcheck_shadow_lookup(unsigned long address)
20{
21 pte_t *pte;
22 struct page *page;
23
24 if (!virt_addr_valid(address))
25 return NULL;
26
27 pte = kmemcheck_pte_lookup(address);
28 if (!pte)
29 return NULL;
30
31 page = virt_to_page(address);
32 if (!page->shadow)
33 return NULL;
34 return page->shadow + (address & (PAGE_SIZE - 1));
35}
36
37static void mark_shadow(void *address, unsigned int n,
38 enum kmemcheck_shadow status)
39{
40 unsigned long addr = (unsigned long) address;
41 unsigned long last_addr = addr + n - 1;
42 unsigned long page = addr & PAGE_MASK;
43 unsigned long last_page = last_addr & PAGE_MASK;
44 unsigned int first_n;
45 void *shadow;
46
47 /* If the memory range crosses a page boundary, stop there. */
48 if (page == last_page)
49 first_n = n;
50 else
51 first_n = page + PAGE_SIZE - addr;
52
53 shadow = kmemcheck_shadow_lookup(addr);
54 if (shadow)
55 memset(shadow, status, first_n);
56
57 addr += first_n;
58 n -= first_n;
59
60 /* Do full-page memset()s. */
61 while (n >= PAGE_SIZE) {
62 shadow = kmemcheck_shadow_lookup(addr);
63 if (shadow)
64 memset(shadow, status, PAGE_SIZE);
65
66 addr += PAGE_SIZE;
67 n -= PAGE_SIZE;
68 }
69
70 /* Do the remaining page, if any. */
71 if (n > 0) {
72 shadow = kmemcheck_shadow_lookup(addr);
73 if (shadow)
74 memset(shadow, status, n);
75 }
76}
77
78void kmemcheck_mark_unallocated(void *address, unsigned int n)
79{
80 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
81}
82
83void kmemcheck_mark_uninitialized(void *address, unsigned int n)
84{
85 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
86}
87
88/*
89 * Fill the shadow memory of the given address such that the memory at that
90 * address is marked as being initialized.
91 */
92void kmemcheck_mark_initialized(void *address, unsigned int n)
93{
94 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
95}
96EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
97
98void kmemcheck_mark_freed(void *address, unsigned int n)
99{
100 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
101}
102
103void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
104{
105 unsigned int i;
106
107 for (i = 0; i < n; ++i)
108 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
109}
110
111void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
112{
113 unsigned int i;
114
115 for (i = 0; i < n; ++i)
116 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
117}
118
119void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
120{
121 unsigned int i;
122
123 for (i = 0; i < n; ++i)
124 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
125}
126
127enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
128{
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
135 /*
136 * Make sure _some_ bytes are initialized. Gcc frequently generates
137 * code to access neighboring bytes.
138 */
139 for (i = 0; i < size; ++i) {
140 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
141 return x[i];
142 }
143#else
144 /* All bytes must be initialized. */
145 for (i = 0; i < size; ++i) {
146 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
147 return x[i];
148 }
149#endif
150
151 return x[0];
152}
153
154void kmemcheck_shadow_set(void *shadow, unsigned int size)
155{
156 uint8_t *x;
157 unsigned int i;
158
159 x = shadow;
160 for (i = 0; i < size; ++i)
161 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
162}
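
The page-boundary arithmetic in mark_shadow() above is worth seeing in isolation: the first chunk runs only to the end of the starting page, whole pages are then handled PAGE_SIZE at a time, and a short tail finishes the range, so each chunk can be looked up in the shadow independently. The sketch below reproduces only that arithmetic, printing chunks instead of calling memset() on shadow bytes (show_chunks is a made-up name):

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static void show_chunks(unsigned long addr, unsigned long n)
{
        unsigned long last_addr = addr + n - 1;
        unsigned long page = addr & PAGE_MASK;
        unsigned long last_page = last_addr & PAGE_MASK;
        unsigned long first_n = (page == last_page) ? n : page + PAGE_SIZE - addr;

        printf("chunk at %#lx, %lu bytes\n", addr, first_n);
        addr += first_n;
        n -= first_n;

        while (n >= PAGE_SIZE) {                /* full-page chunks */
                printf("chunk at %#lx, %lu bytes\n", addr, PAGE_SIZE);
                addr += PAGE_SIZE;
                n -= PAGE_SIZE;
        }
        if (n > 0)                              /* remaining tail */
                printf("chunk at %#lx, %lu bytes\n", addr, n);
}

int main(void)
{
        show_chunks(0x1ff0, 0x40);    /* straddles one page boundary */
        show_chunks(0x2fe0, 0x2100);  /* head chunk, two full pages, and a tail */
        return 0;
}
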
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
2#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
3
4enum kmemcheck_shadow {
5 KMEMCHECK_SHADOW_UNALLOCATED,
6 KMEMCHECK_SHADOW_UNINITIALIZED,
7 KMEMCHECK_SHADOW_INITIALIZED,
8 KMEMCHECK_SHADOW_FREED,
9};
10
11void *kmemcheck_shadow_lookup(unsigned long address);
12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size);
15
16#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..7e600c1962db 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -11,6 +11,7 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h>
14 15
15#include <asm/e820.h> 16#include <asm/e820.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
@@ -470,7 +471,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
470 471
471 if (!debug_pagealloc) 472 if (!debug_pagealloc)
472 spin_unlock(&cpa_lock); 473 spin_unlock(&cpa_lock);
473 base = alloc_pages(GFP_KERNEL, 0); 474 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
474 if (!debug_pagealloc) 475 if (!debug_pagealloc)
475 spin_lock(&cpa_lock); 476 spin_lock(&cpa_lock);
476 if (!base) 477 if (!base)
@@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
590 unsigned int level; 591 unsigned int level;
591 pte_t *kpte, old_pte; 592 pte_t *kpte, old_pte;
592 593
593 if (cpa->flags & CPA_PAGES_ARRAY) 594 if (cpa->flags & CPA_PAGES_ARRAY) {
594 address = (unsigned long)page_address(cpa->pages[cpa->curpage]); 595 struct page *page = cpa->pages[cpa->curpage];
595 else if (cpa->flags & CPA_ARRAY) 596 if (unlikely(PageHighMem(page)))
597 return 0;
598 address = (unsigned long)page_address(page);
599 } else if (cpa->flags & CPA_ARRAY)
596 address = cpa->vaddr[cpa->curpage]; 600 address = cpa->vaddr[cpa->curpage];
597 else 601 else
598 address = *cpa->vaddr; 602 address = *cpa->vaddr;
@@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
681static int cpa_process_alias(struct cpa_data *cpa) 685static int cpa_process_alias(struct cpa_data *cpa)
682{ 686{
683 struct cpa_data alias_cpa; 687 struct cpa_data alias_cpa;
684 int ret = 0; 688 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
685 unsigned long temp_cpa_vaddr, vaddr; 689 unsigned long vaddr, remapped;
690 int ret;
686 691
687 if (cpa->pfn >= max_pfn_mapped) 692 if (cpa->pfn >= max_pfn_mapped)
688 return 0; 693 return 0;
@@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
695 * No need to redo, when the primary call touched the direct 700 * No need to redo, when the primary call touched the direct
696 * mapping already: 701 * mapping already:
697 */ 702 */
698 if (cpa->flags & CPA_PAGES_ARRAY) 703 if (cpa->flags & CPA_PAGES_ARRAY) {
699 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); 704 struct page *page = cpa->pages[cpa->curpage];
700 else if (cpa->flags & CPA_ARRAY) 705 if (unlikely(PageHighMem(page)))
706 return 0;
707 vaddr = (unsigned long)page_address(page);
708 } else if (cpa->flags & CPA_ARRAY)
701 vaddr = cpa->vaddr[cpa->curpage]; 709 vaddr = cpa->vaddr[cpa->curpage];
702 else 710 else
703 vaddr = *cpa->vaddr; 711 vaddr = *cpa->vaddr;
@@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa)
706 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 714 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
707 715
708 alias_cpa = *cpa; 716 alias_cpa = *cpa;
709 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 717 alias_cpa.vaddr = &laddr;
710 alias_cpa.vaddr = &temp_cpa_vaddr;
711 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 718 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
712 719
713
714 ret = __change_page_attr_set_clr(&alias_cpa, 0); 720 ret = __change_page_attr_set_clr(&alias_cpa, 0);
721 if (ret)
722 return ret;
715 } 723 }
716 724
717#ifdef CONFIG_X86_64 725#ifdef CONFIG_X86_64
718 if (ret)
719 return ret;
720 /*
721 * No need to redo, when the primary call touched the high
722 * mapping already:
723 */
724 if (within(vaddr, (unsigned long) _text, _brk_end))
725 return 0;
726
727 /* 726 /*
728 * If the physical address is inside the kernel map, we need 727 * If the primary call didn't touch the high mapping already
728 * and the physical address is inside the kernel map, we need
729 * to touch the high mapped kernel as well: 729 * to touch the high mapped kernel as well:
730 */ 730 */
731 if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) 731 if (!within(vaddr, (unsigned long)_text, _brk_end) &&
732 return 0; 732 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
733 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
734 __START_KERNEL_map - phys_base;
735 alias_cpa = *cpa;
736 alias_cpa.vaddr = &temp_cpa_vaddr;
737 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
733 738
734 alias_cpa = *cpa; 739 /*
735 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 740 * The high mapping range is imprecise, so ignore the
736 alias_cpa.vaddr = &temp_cpa_vaddr; 741 * return value.
737 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 742 */
743 __change_page_attr_set_clr(&alias_cpa, 0);
744 }
745#endif
738 746
739 /* 747 /*
740 * The high mapping range is imprecise, so ignore the return value. 748 * If the PMD page was partially used for per-cpu remapping,
749 * the recycled area needs to be split and modified. Because
750 * the area is always proper subset of a PMD page
751 * cpa->numpages is guaranteed to be 1 for these areas, so
752 * there's no need to loop over and check for further remaps.
741 */ 753 */
742 __change_page_attr_set_clr(&alias_cpa, 0); 754 remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
743#endif 755 if (remapped) {
744 return ret; 756 WARN_ON(cpa->numpages > 1);
757 alias_cpa = *cpa;
758 alias_cpa.vaddr = &remapped;
759 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
760 ret = __change_page_attr_set_clr(&alias_cpa, 0);
761 if (ret)
762 return ret;
763 }
764
765 return 0;
745} 766}
746 767
747static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 768static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
@@ -982,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
982int _set_memory_wc(unsigned long addr, int numpages) 1003int _set_memory_wc(unsigned long addr, int numpages)
983{ 1004{
984 int ret; 1005 int ret;
1006 unsigned long addr_copy = addr;
1007
985 ret = change_page_attr_set(&addr, numpages, 1008 ret = change_page_attr_set(&addr, numpages,
986 __pgprot(_PAGE_CACHE_UC_MINUS), 0); 1009 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
987
988 if (!ret) { 1010 if (!ret) {
989 ret = change_page_attr_set(&addr, numpages, 1011 ret = change_page_attr_set_clr(&addr_copy, numpages,
990 __pgprot(_PAGE_CACHE_WC), 0); 1012 __pgprot(_PAGE_CACHE_WC),
1013 __pgprot(_PAGE_CACHE_MASK),
1014 0, 0, NULL);
991 } 1015 }
992 return ret; 1016 return ret;
993} 1017}
@@ -1104,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1104 int free_idx; 1128 int free_idx;
1105 1129
1106 for (i = 0; i < addrinarray; i++) { 1130 for (i = 0; i < addrinarray; i++) {
1107 start = (unsigned long)page_address(pages[i]); 1131 if (PageHighMem(pages[i]))
1132 continue;
1133 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1108 end = start + PAGE_SIZE; 1134 end = start + PAGE_SIZE;
1109 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) 1135 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1110 goto err_out; 1136 goto err_out;
@@ -1117,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1117err_out: 1143err_out:
1118 free_idx = i; 1144 free_idx = i;
1119 for (i = 0; i < free_idx; i++) { 1145 for (i = 0; i < free_idx; i++) {
1120 start = (unsigned long)page_address(pages[i]); 1146 if (PageHighMem(pages[i]))
1147 continue;
1148 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1121 end = start + PAGE_SIZE; 1149 end = start + PAGE_SIZE;
1122 free_memtype(start, end); 1150 free_memtype(start, end);
1123 } 1151 }
@@ -1146,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
1146 return retval; 1174 return retval;
1147 1175
1148 for (i = 0; i < addrinarray; i++) { 1176 for (i = 0; i < addrinarray; i++) {
1149 start = (unsigned long)page_address(pages[i]); 1177 if (PageHighMem(pages[i]))
1178 continue;
1179 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1150 end = start + PAGE_SIZE; 1180 end = start + PAGE_SIZE;
1151 free_memtype(start, end); 1181 free_memtype(start, end);
1152 } 1182 }
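
cpa_process_alias() above operates on aliases of the same physical page: the direct-map ("lowmem") address computed via __va(cpa->pfn << PAGE_SHIFT), and on 64-bit the high kernel text mapping; within() (defined earlier in pageattr.c) is a half-open range test. Highmem pages are skipped in the new hunks because they have no permanent direct-map address. A userspace sketch of the address arithmetic, with an illustrative PAGE_OFFSET value standing in for the real direct-map base:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT   12
/* Illustrative 64-bit direct-map base; the real value depends on the config. */
#define PAGE_OFFSET  0xffff880000000000ULL

/* Same half-open range test as within() in pageattr.c. */
static bool within(unsigned long long addr, unsigned long long start,
                   unsigned long long end)
{
        return addr >= start && addr < end;
}

int main(void)
{
        unsigned long long pfn = 0xfff;
        /* Direct-map alias of the page, cf. laddr = (unsigned long)__va(pfn << PAGE_SHIFT). */
        unsigned long long laddr = PAGE_OFFSET + (pfn << PAGE_SHIFT);

        printf("pfn %#llx -> direct-map alias %#llx\n", pfn, laddr);
        printf("within the first 16MB of the direct map: %d\n",
               within(laddr, PAGE_OFFSET, PAGE_OFFSET + (16ULL << 20)));
        return 0;
}
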
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h> 5#include <asm/fixmap.h>
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8
7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
8{ 10{
9 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 11 return (pte_t *)__get_free_page(PGALLOC_GFP);
10} 12}
11 13
12pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 14pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,16 +16,16 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14 struct page *pte; 16 struct page *pte;
15 17
16#ifdef CONFIG_HIGHPTE 18#ifdef CONFIG_HIGHPTE
17 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
18#else 20#else
19 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 21 pte = alloc_pages(PGALLOC_GFP, 0);
20#endif 22#endif
21 if (pte) 23 if (pte)
22 pgtable_page_ctor(pte); 24 pgtable_page_ctor(pte);
23 return pte; 25 return pte;
24} 26}
25 27
26void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
27{ 29{
28 pgtable_page_dtor(pte); 30 pgtable_page_dtor(pte);
29 paravirt_release_pte(page_to_pfn(pte)); 31 paravirt_release_pte(page_to_pfn(pte));
@@ -31,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
31} 33}
32 34
33#if PAGETABLE_LEVELS > 2 35#if PAGETABLE_LEVELS > 2
34void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 36void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
35{ 37{
36 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
37 tlb_remove_page(tlb, virt_to_page(pmd)); 39 tlb_remove_page(tlb, virt_to_page(pmd));
38} 40}
39 41
40#if PAGETABLE_LEVELS > 3 42#if PAGETABLE_LEVELS > 3
41void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 43void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
42{ 44{
43 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
44 tlb_remove_page(tlb, virt_to_page(pud)); 46 tlb_remove_page(tlb, virt_to_page(pud));
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
161 bool failed = false; 163 bool failed = false;
162 164
163 for(i = 0; i < PREALLOCATED_PMDS; i++) { 165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); 166 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
165 if (pmd == NULL) 167 if (pmd == NULL)
166 failed = true; 168 failed = true;
167 pmds[i] = pmd; 169 pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
228 pmd_t *pmds[PREALLOCATED_PMDS]; 230 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags; 231 unsigned long flags;
230 232
231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 233 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
232 234
233 if (pgd == NULL) 235 if (pgd == NULL)
234 goto out; 236 goto out;
@@ -327,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
327 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
328 (int)-reserve); 330 (int)-reserve);
329 __FIXADDR_TOP = -reserve - PAGE_SIZE; 331 __FIXADDR_TOP = -reserve - PAGE_SIZE;
330 __VMALLOC_RESERVE += reserve;
331#endif 332#endif
332} 333}
333 334
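
A small side note on the PGALLOC_GFP definition above: it is an unparenthesized OR of flags. That is harmless at the call sites in this file (plain arguments and one further OR), but the usual reason flag macros are wrapped in parentheses is operator precedence. A two-line demonstration of the pitfall, with made-up flag values:

#include <stdio.h>

#define FLAGS_NOPAREN 1 | 2
#define FLAGS_PAREN   (1 | 2)

int main(void)
{
        /* & binds tighter than |, so the unparenthesized macro expands to 1 | (2 & 2). */
        printf("%d %d\n", FLAGS_NOPAREN & 2, FLAGS_PAREN & 2);   /* prints "3 2" */
        return 0;
}
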
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
79 acpi_numa = -1; 79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes_add[i].start = nodes[i].end = 0; 83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
84 remove_all_active_ranges(); 86 remove_all_active_ranges();
85} 87}
86 88
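
The srat_64.c hunk replaces a single chained assignment that mixed the two arrays: it cleared nodes_add[i].start and nodes[i].end, but left nodes[i].start and nodes_add[i].end untouched. A standalone illustration of what the old statement actually did (struct range here is just a stand-in for the node descriptor):

#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
        struct range nodes[1]     = { { 5, 6 } };
        struct range nodes_add[1] = { { 7, 8 } };

        /* The old statement: only nodes_add[0].start and nodes[0].end are cleared. */
        nodes_add[0].start = nodes[0].end = 0;

        printf("nodes: %lu-%lu, nodes_add: %lu-%lu\n",
               nodes[0].start, nodes[0].end,
               nodes_add[0].start, nodes_add[0].end);   /* 5-0, 0-8 */
        return 0;
}
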
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 821e97017e95..c814e144a3f0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
183 183
184 f->flush_mm = mm; 184 f->flush_mm = mm;
185 f->flush_va = va; 185 f->flush_va = va;
186 cpumask_andnot(to_cpumask(f->flush_cpumask), 186 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
187 cpumask, cpumask_of(smp_processor_id())); 187 /*
188 188 * We have to send the IPI only to
189 /* 189 * CPUs affected.
190 * We have to send the IPI only to 190 */
191 * CPUs affected. 191 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
192 */ 192 INVALIDATE_TLB_VECTOR_START + sender);
193 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
194 INVALIDATE_TLB_VECTOR_START + sender);
195 193
196 while (!cpumask_empty(to_cpumask(f->flush_cpumask))) 194 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
197 cpu_relax(); 195 cpu_relax();
196 }
198 197
199 f->flush_mm = NULL; 198 f->flush_mm = NULL;
200 f->flush_va = 0; 199 f->flush_va = 0;
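
The tlb.c change relies on cpumask_andnot() reporting whether any bit remains set in the destination mask, which is what lets the hunk skip the IPI and the busy-wait entirely when the calling CPU was the only one in the mask. A userspace sketch of that pattern with a plain bitmask (mask_andnot is a stand-in, not the kernel API):

#include <stdio.h>
#include <stdbool.h>

/* Tiny stand-in for cpumask_andnot(): dst = a & ~b, returns true if non-empty. */
static bool mask_andnot(unsigned long *dst, unsigned long a, unsigned long b)
{
        *dst = a & ~b;
        return *dst != 0;
}

int main(void)
{
        unsigned long dst;
        unsigned long others = 0x0e;   /* CPUs 1-3 want a flush */
        unsigned long self   = 0x02;   /* we are CPU 1 */

        if (mask_andnot(&dst, others, self))
                printf("send IPI to mask %#lx\n", dst);   /* 0xc: CPUs 2 and 3 */

        if (!mask_andnot(&dst, self, self))
                printf("no other CPUs, skip the IPI\n");
        return 0;
}
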
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b07dd8d0b321..89b9a5cd63da 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type)
390static int force_arch_perfmon; 390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp) 391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{ 392{
393 if (!strcmp(str, "archperfmon")) { 393 if (!strcmp(str, "arch_perfmon")) {
394 force_arch_perfmon = 1; 394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 } 396 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index c0ecf250fe51..1014eb4bfc37 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -38,15 +38,26 @@ count_resource(struct acpi_resource *acpi_res, void *data)
38 struct acpi_resource_address64 addr; 38 struct acpi_resource_address64 addr;
39 acpi_status status; 39 acpi_status status;
40 40
41 if (info->res_num >= PCI_BUS_NUM_RESOURCES)
42 return AE_OK;
43
44 status = resource_to_addr(acpi_res, &addr); 41 status = resource_to_addr(acpi_res, &addr);
45 if (ACPI_SUCCESS(status)) 42 if (ACPI_SUCCESS(status))
46 info->res_num++; 43 info->res_num++;
47 return AE_OK; 44 return AE_OK;
48} 45}
49 46
47static int
48bus_has_transparent_bridge(struct pci_bus *bus)
49{
50 struct pci_dev *dev;
51
52 list_for_each_entry(dev, &bus->devices, bus_list) {
53 u16 class = dev->class >> 8;
54
55 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
56 return true;
57 }
58 return false;
59}
60
50static acpi_status 61static acpi_status
51setup_resource(struct acpi_resource *acpi_res, void *data) 62setup_resource(struct acpi_resource *acpi_res, void *data)
52{ 63{
@@ -56,9 +67,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
56 acpi_status status; 67 acpi_status status;
57 unsigned long flags; 68 unsigned long flags;
58 struct resource *root; 69 struct resource *root;
70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
71 u64 start, end;
59 72
60 if (info->res_num >= PCI_BUS_NUM_RESOURCES) 73 if (bus_has_transparent_bridge(info->bus))
61 return AE_OK; 74 max_root_bus_resources -= 3;
62 75
63 status = resource_to_addr(acpi_res, &addr); 76 status = resource_to_addr(acpi_res, &addr);
64 if (!ACPI_SUCCESS(status)) 77 if (!ACPI_SUCCESS(status))
@@ -75,11 +88,22 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
75 } else 88 } else
76 return AE_OK; 89 return AE_OK;
77 90
91 start = addr.minimum + addr.translation_offset;
92 end = start + addr.address_length - 1;
93 if (info->res_num >= max_root_bus_resources) {
94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
95 "from %s for %s due to _CRS returning more than "
96 "%d resource descriptors\n", (unsigned long) start,
97 (unsigned long) end, root->name, info->name,
98 max_root_bus_resources);
99 return AE_OK;
100 }
101
78 res = &info->res[info->res_num]; 102 res = &info->res[info->res_num];
79 res->name = info->name; 103 res->name = info->name;
80 res->flags = flags; 104 res->flags = flags;
81 res->start = addr.minimum + addr.translation_offset; 105 res->start = start;
82 res->end = res->start + addr.address_length - 1; 106 res->end = end;
83 res->child = NULL; 107 res->child = NULL;
84 108
85 if (insert_resource(root, res)) { 109 if (insert_resource(root, res)) {
@@ -94,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
94} 118}
95 119
96static void 120static void
97adjust_transparent_bridge_resources(struct pci_bus *bus)
98{
99 struct pci_dev *dev;
100
101 list_for_each_entry(dev, &bus->devices, bus_list) {
102 int i;
103 u16 class = dev->class >> 8;
104
105 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) {
106 for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++)
107 dev->subordinate->resource[i] =
108 dev->bus->resource[i - 3];
109 }
110 }
111}
112
113static void
114get_current_resources(struct acpi_device *device, int busnum, 121get_current_resources(struct acpi_device *device, int busnum,
115 int domain, struct pci_bus *bus) 122 int domain, struct pci_bus *bus)
116{ 123{
@@ -137,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum,
137 info.res_num = 0; 144 info.res_num = 0;
138 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 145 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
139 &info); 146 &info);
140 if (info.res_num)
141 adjust_transparent_bridge_resources(bus);
142 147
143 return; 148 return;
144 149
@@ -201,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
201 */ 206 */
202 memcpy(bus->sysdata, sd, sizeof(*sd)); 207 memcpy(bus->sysdata, sd, sizeof(*sd));
203 kfree(sd); 208 kfree(sd);
204 } else 209 } else {
205 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) {
212 if (pci_probe & PCI_USE__CRS)
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus);
216 }
217 }
206 218
207 if (!bus) 219 if (!bus)
208 kfree(sd); 220 kfree(sd);
@@ -217,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
217#endif 229#endif
218 } 230 }
219 231
220 if (bus && (pci_probe & PCI_USE__CRS))
221 get_current_resources(device, busnum, domain, bus);
222 return bus; 232 return bus;
223} 233}
224 234
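
The rewritten setup_resource() above enforces a per-root-bus descriptor limit and lowers it by three when the bus hosts a transparent bridge: as the removed adjust_transparent_bridge_resources() shows, a transparent bridge's secondary bus forwards its parent's resources starting at slot 3, so only PCI_BUS_NUM_RESOURCES - 3 parent entries can be mirrored. A small arithmetic sketch of that limit (the constant value here is illustrative):

#include <stdio.h>

#define PCI_BUS_NUM_RESOURCES 16   /* illustrative value */

int main(void)
{
        int has_transparent_bridge = 1;
        int crs_descriptors = 15;          /* what _CRS reported */
        int limit = PCI_BUS_NUM_RESOURCES;
        int i;

        if (has_transparent_bridge)
                limit -= 3;                /* keep room for forwarding, as above */

        for (i = 0; i < crs_descriptors; i++)
                if (i >= limit)
                        printf("descriptor %d dropped (limit %d)\n", i, limit);
        return 0;
}
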
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index f893d6a6e803..3ffa10df20b9 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
100 int j; 100 int j;
101 struct pci_root_info *info; 101 struct pci_root_info *info;
102 102
103 /* don't go for it if _CRS is used */ 103 /* don't go for it if _CRS is used already */
104 if (pci_probe & PCI_USE__CRS) 104 if (b->resource[0] != &ioport_resource ||
105 b->resource[1] != &iomem_resource)
105 return; 106 return;
106 107
107 /* if only one root bus, don't need to do anything */ 108 /* if only one root bus, don't need to do anything */
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
116 if (i == pci_root_num) 117 if (i == pci_root_num)
117 return; 118 return;
118 119
120 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
121 b->number);
122
119 info = &pci_root_info[i]; 123 info = &pci_root_info[i];
120 for (j = 0; j < info->res_num; j++) { 124 for (j = 0; j < info->res_num; j++) {
121 struct resource *res; 125 struct resource *res;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a85bef20a3b9..52e62e57fedd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h> 36#include <asm/e820.h>
37#include <asm/pci_x86.h> 37#include <asm/pci_x86.h>
38#include <asm/io_apic.h>
38 39
39 40
40static int 41static int
@@ -116,7 +117,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
116 struct pci_bus *bus; 117 struct pci_bus *bus;
117 struct pci_dev *dev; 118 struct pci_dev *dev;
118 int idx; 119 int idx;
119 struct resource *r, *pr; 120 struct resource *r;
120 121
121 /* Depth-First Search on bus tree */ 122 /* Depth-First Search on bus tree */
122 list_for_each_entry(bus, bus_list, node) { 123 list_for_each_entry(bus, bus_list, node) {
@@ -126,9 +127,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
126 r = &dev->resource[idx]; 127 r = &dev->resource[idx];
127 if (!r->flags) 128 if (!r->flags)
128 continue; 129 continue;
129 pr = pci_find_parent_resource(dev, r); 130 if (!r->start ||
130 if (!r->start || !pr || 131 pci_claim_resource(dev, idx) < 0) {
131 request_resource(pr, r) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
133 /* 133 /*
134 * Something is wrong with the region. 134 * Something is wrong with the region.
@@ -149,7 +149,7 @@ static void __init pcibios_allocate_resources(int pass)
149 struct pci_dev *dev = NULL; 149 struct pci_dev *dev = NULL;
150 int idx, disabled; 150 int idx, disabled;
151 u16 command; 151 u16 command;
152 struct resource *r, *pr; 152 struct resource *r;
153 153
154 for_each_pci_dev(dev) { 154 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 155 pci_read_config_word(dev, PCI_COMMAND, &command);
@@ -168,8 +168,7 @@ static void __init pcibios_allocate_resources(int pass)
168 (unsigned long long) r->start, 168 (unsigned long long) r->start,
169 (unsigned long long) r->end, 169 (unsigned long long) r->end,
170 r->flags, disabled, pass); 170 r->flags, disabled, pass);
171 pr = pci_find_parent_resource(dev, r); 171 if (pci_claim_resource(dev, idx) < 0) {
172 if (!pr || request_resource(pr, r) < 0) {
173 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 172 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
174 /* We'll assign a new address later */ 173 /* We'll assign a new address later */
175 r->end -= r->start; 174 r->end -= r->start;
@@ -197,7 +196,7 @@ static void __init pcibios_allocate_resources(int pass)
197static int __init pcibios_assign_resources(void) 196static int __init pcibios_assign_resources(void)
198{ 197{
199 struct pci_dev *dev = NULL; 198 struct pci_dev *dev = NULL;
200 struct resource *r, *pr; 199 struct resource *r;
201 200
202 if (!(pci_probe & PCI_ASSIGN_ROMS)) { 201 if (!(pci_probe & PCI_ASSIGN_ROMS)) {
203 /* 202 /*
@@ -209,8 +208,7 @@ static int __init pcibios_assign_resources(void)
209 r = &dev->resource[PCI_ROM_RESOURCE]; 208 r = &dev->resource[PCI_ROM_RESOURCE];
210 if (!r->flags || !r->start) 209 if (!r->flags || !r->start)
211 continue; 210 continue;
212 pr = pci_find_parent_resource(dev, r); 211 if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
213 if (!pr || request_resource(pr, r) < 0) {
214 r->end -= r->start; 212 r->end -= r->start;
215 r->start = 0; 213 r->start = 0;
216 } 214 }
@@ -230,6 +228,12 @@ void __init pcibios_resource_survey(void)
230 pcibios_allocate_resources(1); 228 pcibios_allocate_resources(1);
231 229
232 e820_reserve_resources_late(); 230 e820_reserve_resources_late();
231 /*
232 * Insert the IO APIC resources after PCI initialization has
233 * occurred to handle IO APICs that are mapped in on a BAR in
234 * PCI space, but before trying to assign unassigned PCI resources.
235 */
236 ioapic_insert_resources();
233} 237}
234 238
235/** 239/**
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 8766b0e216c5..712443ec6d43 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -523,6 +523,69 @@ reject:
523 523
524static int __initdata known_bridge; 524static int __initdata known_bridge;
525 525
526static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
527
528/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
529struct acpi_mcfg_allocation *pci_mmcfg_config;
530int pci_mmcfg_config_num;
531
532static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
533{
534 if (!strcmp(mcfg->header.oem_id, "SGI"))
535 acpi_mcfg_64bit_base_addr = TRUE;
536
537 return 0;
538}
539
540static int __init pci_parse_mcfg(struct acpi_table_header *header)
541{
542 struct acpi_table_mcfg *mcfg;
543 unsigned long i;
544 int config_size;
545
546 if (!header)
547 return -EINVAL;
548
549 mcfg = (struct acpi_table_mcfg *)header;
550
551 /* how many config structures do we have */
552 pci_mmcfg_config_num = 0;
553 i = header->length - sizeof(struct acpi_table_mcfg);
554 while (i >= sizeof(struct acpi_mcfg_allocation)) {
555 ++pci_mmcfg_config_num;
556 i -= sizeof(struct acpi_mcfg_allocation);
557 };
558 if (pci_mmcfg_config_num == 0) {
559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
560 return -ENODEV;
561 }
562
563 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
564 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
565 if (!pci_mmcfg_config) {
566 printk(KERN_WARNING PREFIX
567 "No memory for MCFG config tables\n");
568 return -ENOMEM;
569 }
570
571 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
572
573 acpi_mcfg_oem_check(mcfg);
574
575 for (i = 0; i < pci_mmcfg_config_num; ++i) {
576 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
577 !acpi_mcfg_64bit_base_addr) {
578 printk(KERN_ERR PREFIX
579 "MMCONFIG not in low 4GB of memory\n");
580 kfree(pci_mmcfg_config);
581 pci_mmcfg_config_num = 0;
582 return -ENODEV;
583 }
584 }
585
586 return 0;
587}
588
526static void __init __pci_mmcfg_init(int early) 589static void __init __pci_mmcfg_init(int early)
527{ 590{
528 /* MMCONFIG disabled */ 591 /* MMCONFIG disabled */
@@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early)
543 } 606 }
544 607
545 if (!known_bridge) 608 if (!known_bridge)
546 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); 609 acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
547 610
548 pci_mmcfg_reject_broken(early); 611 pci_mmcfg_reject_broken(early);
549 612
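
pci_parse_mcfg() above counts MMCONFIG entries by subtracting the fixed MCFG table header from the table length and dividing out one allocation structure at a time. A standalone sketch of that counting, using illustrative structure sizes (the real layouts come from the ACPI headers):

#include <stdio.h>

/* Illustrative sizes, not taken from the kernel headers. */
#define MCFG_HEADER_SIZE      44   /* ACPI table header plus reserved bytes */
#define MCFG_ALLOCATION_SIZE  16   /* one MMCONFIG allocation entry */

int main(void)
{
        unsigned long table_length = MCFG_HEADER_SIZE + 2 * MCFG_ALLOCATION_SIZE;
        unsigned long i = table_length - MCFG_HEADER_SIZE;
        int entries = 0;

        /* Same counting loop as pci_parse_mcfg() above. */
        while (i >= MCFG_ALLOCATION_SIZE) {
                ++entries;
                i -= MCFG_ALLOCATION_SIZE;
        }
        printf("%d MMCONFIG entries\n", entries);   /* prints 2 */
        return 0;
}
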
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index de2abbd07544..a6a198c33623 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,7 +1,7 @@
1# __restore_processor_state() restores %gs after S3 resume and so should not 1# __restore_processor_state() restores %gs after S3 resume and so should not
2# itself be stack-protected 2# itself be stack-protected
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu.o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu.o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index d277ef1eea51..b3d20b9cac63 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
244 do_fpu_end(); 244 do_fpu_end();
245 mtrr_ap_init(); 245 mtrr_ap_init();
246 246
247#ifdef CONFIG_X86_32 247#ifdef CONFIG_X86_OLD_MCE
248 mcheck_init(&boot_cpu_data); 248 mcheck_init(&boot_cpu_data);
249#endif 249#endif
250} 250}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 16a9020c8f11..88112b49f02c 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO $@
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
124 124
125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) 125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n
126 127
127# 128#
128# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). 129# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).