path: root/arch/x86
author    Ingo Molnar <mingo@elte.hu>  2011-01-07 08:14:15 -0500
committer Ingo Molnar <mingo@elte.hu>  2011-01-07 08:14:15 -0500
commit    1c2a48cf65580a276552151eb8f78d78c55b828e (patch)
tree      68ed0628a276b33cb5aa0ad4899c1afe0a33a69d /arch/x86
parent    0aa002fe602939370e9476e5ec32b562000a0425 (diff)
parent    cb600d2f83c854ec3d6660063e4466431999489b (diff)
Merge branch 'linus' into x86/apic-cleanups
Conflicts:
	arch/x86/include/asm/io_apic.h

Merge reason: Resolve the conflict, update to a more recent -rc base

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
 arch/x86/Kconfig                         |    2
 arch/x86/Kconfig.debug                   |   11
 arch/x86/boot/compressed/head_64.S       |    2
 arch/x86/boot/compressed/misc.c          |    2
 arch/x86/crypto/ghash-clmulni-intel_glue.c |  1
 arch/x86/ia32/sys_ia32.c                 |    1
 arch/x86/include/asm/alternative.h       |    8
 arch/x86/include/asm/apicdef.h           |    1
 arch/x86/include/asm/e820.h              |    3
 arch/x86/include/asm/fixmap.h            |    4
 arch/x86/include/asm/i387.h              |   24
 arch/x86/include/asm/irq.h               |    4
 arch/x86/include/asm/kdebug.h            |    2
 arch/x86/include/asm/kvm_host.h          |    2
 arch/x86/include/asm/mce.h               |    3
 arch/x86/include/asm/microcode.h         |    6
 arch/x86/include/asm/mpspec.h            |   31
 arch/x86/include/asm/mpspec_def.h        |    7
 arch/x86/include/asm/msr-index.h         |   18
 arch/x86/include/asm/nmi.h               |   53
 arch/x86/include/asm/paravirt.h          |   12
 arch/x86/include/asm/pci.h               |    1
 arch/x86/include/asm/perf_event.h        |    2
 arch/x86/include/asm/perf_event_p4.h     |   63
 arch/x86/include/asm/pvclock.h           |    1
 arch/x86/include/asm/smpboot_hooks.h     |    1
 arch/x86/include/asm/stacktrace.h        |   33
 arch/x86/include/asm/timer.h             |    6
 arch/x86/include/asm/uv/uv_bau.h         |    9
 arch/x86/include/asm/uv/uv_hub.h         |    4
 arch/x86/include/asm/uv/uv_mmrs.h        |   19
 arch/x86/include/asm/xen/interface.h     |    6
 arch/x86/include/asm/xen/interface_32.h  |    5
 arch/x86/include/asm/xen/interface_64.h  |   13
 arch/x86/include/asm/xen/page.h          |    7
 arch/x86/kernel/Makefile                 |    1
 arch/x86/kernel/acpi/boot.c              |   11
 arch/x86/kernel/alternative.c            |   52
 arch/x86/kernel/apic/Makefile            |    5
 arch/x86/kernel/apic/apic.c              |   40
 arch/x86/kernel/apic/hw_nmi.c            |   41
 arch/x86/kernel/apic/io_apic.c           |   53
 arch/x86/kernel/apic/nmi.c               |  567
 arch/x86/kernel/apic/probe_64.c          |    7
 arch/x86/kernel/apic/x2apic_uv_x.c       |   76
 arch/x86/kernel/cpu/common.c             |    1
 arch/x86/kernel/cpu/mcheck/mce_amd.c     |  135
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   40
 arch/x86/kernel/cpu/perf_event.c         |   80
 arch/x86/kernel/cpu/perf_event_amd.c     |   16
 arch/x86/kernel/cpu/perf_event_intel.c   |   26
 arch/x86/kernel/cpu/perfctr-watchdog.c   |  644
 arch/x86/kernel/cpuid.c                  |    1
 arch/x86/kernel/dumpstack.c              |   12
 arch/x86/kernel/dumpstack_32.c           |   25
 arch/x86/kernel/dumpstack_64.c           |   24
 arch/x86/kernel/entry_32.S               |    2
 arch/x86/kernel/entry_64.S               |    2
 arch/x86/kernel/ftrace.c                 |    3
 arch/x86/kernel/head_32.S                |   91
 arch/x86/kernel/hpet.c                   |   26
 arch/x86/kernel/hw_breakpoint.c          |    4
 arch/x86/kernel/kgdb.c                   |   12
 arch/x86/kernel/kprobes.c                |  113
 arch/x86/kernel/microcode_amd.c          |   34
 arch/x86/kernel/microcode_intel.c        |   16
 arch/x86/kernel/mmconf-fam10h_64.c       |   64
 arch/x86/kernel/msr.c                    |    1
 arch/x86/kernel/process.c                |   10
 arch/x86/kernel/process_32.c             |    2
 arch/x86/kernel/process_64.c             |    2
 arch/x86/kernel/pvclock.c                |    5
 arch/x86/kernel/resource.c               |   48
 arch/x86/kernel/setup.c                  |   18
 arch/x86/kernel/smpboot.c                |   36
 arch/x86/kernel/stacktrace.c             |    8
 arch/x86/kernel/time.c                   |   18
 arch/x86/kernel/trampoline_64.S          |    2
 arch/x86/kernel/traps.c                  |   35
 arch/x86/kernel/tsc.c                    |   96
 arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
 arch/x86/kernel/vmlinux.lds.S            |    8
 arch/x86/kernel/xsave.c                  |    3
 arch/x86/kvm/i8259.c                     |    2
 arch/x86/kvm/mmu.c                       |    3
 arch/x86/kvm/svm.c                       |    6
 arch/x86/kvm/vmx.c                       |   24
 arch/x86/kvm/x86.c                       |   11
 arch/x86/kvm/x86.h                       |    5
 arch/x86/lguest/boot.c                   |   16
 arch/x86/mm/init.c                       |    3
 arch/x86/mm/init_32.c                    |   20
 arch/x86/mm/kmemcheck/error.c            |    2
 arch/x86/mm/pageattr.c                   |   33
 arch/x86/mm/setup_nx.c                   |    2
 arch/x86/mm/srat_32.c                    |    1
 arch/x86/mm/srat_64.c                    |   10
 arch/x86/mm/tlb.c                        |    5
 arch/x86/oprofile/backtrace.c            |    2
 arch/x86/oprofile/nmi_int.c              |    3
 arch/x86/oprofile/nmi_timer_int.c        |    3
 arch/x86/oprofile/op_model_amd.c         |   79
 arch/x86/oprofile/op_model_p4.c          |    2
 arch/x86/pci/i386.c                      |   18
 arch/x86/pci/pcbios.c                    |   23
 arch/x86/pci/xen.c                       |   27
 arch/x86/platform/sfi/sfi.c              |    4
 arch/x86/platform/uv/tlb_uv.c            |   24
 arch/x86/platform/uv/uv_time.c           |    4
 arch/x86/platform/visws/visws_quirks.c   |    2
 arch/x86/vdso/Makefile                   |    4
 arch/x86/xen/enlighten.c                 |   25
 arch/x86/xen/mmu.c                       |   88
 arch/x86/xen/platform-pci-unplug.c       |    2
 arch/x86/xen/setup.c                     |   53
 arch/x86/xen/suspend.c                   |    1
 arch/x86/xen/time.c                      |    2
 arch/x86/xen/xen-ops.h                   |    2
 118 files changed, 1437 insertions(+), 2006 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58ecad57aad1..b6fccb07123e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
-	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_PERF_EVENTS
 	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..45143bbcfe5e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
 	  feature as well as for the change_page_attr() infrastructure.
 	  If in doubt, say "N"
 
+config DEBUG_SET_MODULE_RONX
+	bool "Set loadable kernel module data as NX and text as RO"
+	depends on MODULES
+	---help---
+	  This option helps catch unintended modifications to loadable
+	  kernel module's text and read-only data. It also prevents execution
+	  of module data. Such protection may interfere with run-time code
+	  patching and dynamic kernel tracing - and they might also protect
+	  against certain classes of kernel exploits.
+	  If in doubt, say "N".
+
 config DEBUG_NX_TEST
 	tristate "Testcase for the NX non-executable stack feature"
 	depends on DEBUG_KERNEL && m
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
 	hlt
 	jmp	1b
 
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
 
 	/*
 	 * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f215..325c05294fc4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -355,7 +355,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
 #else
-	if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
 		error("Destination address too large");
 #endif
 #ifndef CONFIG_RELOCATABLE
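For concreteness, here is how the two heap ceilings work out, as a sketch assuming the default 32-bit __PAGE_OFFSET of 0xC0000000 (so -__PAGE_OFFSET evaluates to 0x40000000 in 32-bit arithmetic):

	/* old: 0x40000000 - (512 << 20) - 1 = 0x1fffffff  (~512 MiB ceiling) */
	/* new: 0x40000000 - (128 << 20) - 1 = 0x37ffffff  (~896 MiB ceiling) */
	/* both results already fit in 31 bits, so & 0x7fffffff is a no-op here */

The smaller headroom therefore raises the highest address the decompression heap may occupy.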
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
  * by the Free Software Foundation.
  */
 
+#include <linux/err.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e7..5852519b2d0f 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/smp_lock.h>
 #include <linux/mm.h>
 #include <linux/uio.h>
 #include <linux/poll.h>
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 76561d20ea2f..13009d1af99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
 extern void alternatives_smp_module_del(struct module *mod);
 extern void alternatives_smp_switch(int smp);
 extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
 #else
 static inline void alternatives_smp_module_add(struct module *mod, char *name,
 					       void *locks, void *locks_end,
@@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
  * On the local CPU you need to be protected again NMI or MCE handlers seeing an
  * inconsistent instruction while you patch.
  */
+struct text_poke_param {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
 
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 #define IDEAL_NOP_SIZE_5 5
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@
 
 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
 #else
 # define MAX_IO_APICS 128
 # define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 5be1542fbfaf..e99d55d74df5 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
 #define BIOS_BEGIN		0x000a0000
 #define BIOS_END		0x00100000
 
+#define BIOS_ROM_BASE		0xffe00000
+#define BIOS_ROM_END		0xffffffff
+
 #ifdef __KERNEL__
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 139591a933f6..0141b234406f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -220,8 +220,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 }
 
 /* Return an pointer with offset calculated */
-static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
-				phys_addr_t phys, pgprot_t flags)
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
 {
 	__set_fixmap(idx, phys, flags);
 	return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 4aa2bb3b242a..ef328901c802 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 	int err;
 
 	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxrstorq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : [fx] "m" (*fx), "0" (0));
+#else
 	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err)
 		     : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
 	return err;
 }
 
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 		return -EFAULT;
 
 	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxsaveq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err), [fx] "=m" (*fx)
+		     : "0" (0));
+#else
 	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err), "=m" (*fx)
 		     : [fx] "R" (fx), "0" (0));
+#endif
 	if (unlikely(err) &&
 		__clear_user(fx, sizeof(struct i387_fxsave_struct)))
 		err = -EFAULT;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 13b0ebaa512f..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -15,10 +15,6 @@ static inline int irq_canonicalize(int irq)
 	return ((irq == 2) ? 9 : irq);
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
 #ifdef CONFIG_X86_32
 extern void irq_ctx_init(int cpu);
 #else
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..f23eb2528464 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-		       unsigned long *sp, unsigned long bp);
+		       unsigned long *sp);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e6fe391094e..f702f82aa1eb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,7 @@
 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_CPUID_ENTRIES 80
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..eb16e94ae04f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 extern void mcheck_intel_therm_init(void);
 #else
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
 
 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+	memcpy(to, from, n);
+}
+
 #else
 static inline struct microcode_ops * __init init_amd_microcode(void)
 {
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
 
 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
+#include <asm/apicdef.h>
 
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				int active_high_low);
 #endif /* CONFIG_ACPI */
 
-#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)
 
 struct physid_mask {
 	unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
 	test_and_set_bit(physid, (map).mask)
 
 #define physids_and(dst, src1, src2) \
-	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_or(dst, src1, src2) \
-	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_clear(map) \
-	bitmap_zero((map).mask, MAX_APICS)
+	bitmap_zero((map).mask, MAX_LOCAL_APIC)
 
 #define physids_complement(dst, src) \
-	bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
 
 #define physids_empty(map) \
-	bitmap_empty((map).mask, MAX_APICS)
+	bitmap_empty((map).mask, MAX_LOCAL_APIC)
 
 #define physids_equal(map1, map2) \
-	bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
 
 #define physids_weight(map) \
-	bitmap_weight((map).mask, MAX_APICS)
+	bitmap_weight((map).mask, MAX_LOCAL_APIC)
 
 #define physids_shift_right(d, s, n) \
-	bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 #define physids_shift_left(d, s, n) \
-	bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
 	map->mask[0] = physids;
 }
 
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)					\
-	({								\
-		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
-		physid_set(physid, __physid_mask);			\
-		__physid_mask;						\
-	})
-
 static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 {
 	physids_clear(*map);
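The removed physid_mask_of_physid() returned a whole physid_mask_t by value, which (as its own comment warned) creates very large stack frames now that the mask is sized by MAX_LOCAL_APIC (up to 32768 bits). Callers are expected to move to the pointer-based helper; a minimal sketch of the conversion, with illustrative variable names:

	physid_mask_t phys_cpu;

	/* before: phys_cpu = physid_mask_of_physid(apicid);  -- copies the mask */
	physid_set_mask_of_physid(apicid, &phys_cpu);	/* after: fills it in place */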
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
 
 #ifdef CONFIG_X86_32
 # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
 #endif
 
 /* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3ea3dc487047..4d0dfa0d998e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -123,12 +123,16 @@
 #define MSR_AMD64_IBSCTL		0xc001103a
 #define MSR_AMD64_IBSBRTARGET		0xc001103b
 
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL		0xc0010200
+#define MSR_F15H_PERF_CTR		0xc0010201
+
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE	0xc0010058
 #define FAM10H_MMIO_CONF_ENABLE		(1<<0)
 #define FAM10H_MMIO_CONF_BUSRANGE_MASK	0xf
 #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
-#define FAM10H_MMIO_CONF_BASE_MASK	0xfffffff
+#define FAM10H_MMIO_CONF_BASE_MASK	0xfffffffULL
 #define FAM10H_MMIO_CONF_BASE_SHIFT	20
 #define MSR_FAM10H_NODE_ID		0xc001100c
 
@@ -253,6 +257,18 @@
 #define PACKAGE_THERM_INT_LOW_ENABLE		(1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE		(1 << 24)
 
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE	(1 << 15)
+#define THERM_SHIFT_THRESHOLD0		8
+#define THERM_MASK_THRESHOLD0		(0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE	(1 << 23)
+#define THERM_SHIFT_THRESHOLD1		16
+#define THERM_MASK_THRESHOLD1		(0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0		(1 << 6)
+#define THERM_LOG_THRESHOLD0		(1 << 7)
+#define THERM_STATUS_THRESHOLD1		(1 << 8)
+#define THERM_LOG_THRESHOLD1		(1 << 9)
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
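These threshold bits live in MSR_IA32_THERM_INTERRUPT (enable and trip-point fields) and MSR_IA32_THERM_STATUS (status/log bits). A minimal sketch of programming threshold 0, in the spirit of what a consumer such as therm_throt.c would do ('trip' is an illustrative value, not a name from this patch):

	u64 val;

	rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
	val &= ~THERM_MASK_THRESHOLD0;
	val |= (trip << THERM_SHIFT_THRESHOLD0) & THERM_MASK_THRESHOLD0;
	val |= THERM_INT_THRESHOLD0_ENABLE;
	wrmsrl(MSR_IA32_THERM_INTERRUPT, val);

	/* in the interrupt path: check and clear the sticky log bit */
	rdmsrl(MSR_IA32_THERM_STATUS, val);
	if (val & THERM_LOG_THRESHOLD0)
		wrmsrl(MSR_IA32_THERM_STATUS, val & ~THERM_LOG_THRESHOLD0);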
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..c4021b953510 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,15 @@
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
 extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE	0
-#define NMI_IO_APIC	1
-#define NMI_LOCAL_APIC	2
-#define NMI_INVALID	3
-
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
@@ -47,33 +21,8 @@ extern int unknown_nmi_panic;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
-	if (nmi_watchdog == NMI_IO_APIC)
-		nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
-	/*
-	 * actually it should be:
-	 *	return (nmi_watchdog == NMI_LOCAL_APIC ||
-	 *		nmi_watchdog == NMI_IO_APIC)
-	 * but since they are power of two we could use a
-	 * cheaper way --cvg
-	 */
-	return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
 #endif
 
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
 void stop_nmi(void);
 void restart_nmi(void);
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 18e3b8a8709f..7709c12431b8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)
 
 static inline void halt(void)
 {
-	PVOP_VCALL0(pv_irq_ops.safe_halt);
+	PVOP_VCALL0(pv_irq_ops.halt);
 }
 
 static inline void wbinvd(void)
@@ -824,27 +824,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
 #define __PV_IS_CALLEE_SAVE(func)			\
 	((struct paravirt_callee_save) { func })
 
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
 {
 	return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
 }
 
-static inline void arch_local_irq_restore(unsigned long f)
+static inline notrace void arch_local_irq_restore(unsigned long f)
 {
 	PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
 }
 
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
 {
 	PVOP_VCALLEE0(pv_irq_ops.irq_disable);
 }
 
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
 {
 	PVOP_VCALLEE0(pv_irq_ops.irq_enable);
 }
 
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
 {
 	unsigned long f;
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c714b2..676129229630 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
 
 #define PCIBIOS_MIN_CARDBUS_IO	0x4000
 
+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b1dbb3..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
 #define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
 
 #ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
 extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 }
 
 #else
-static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
 #endif
 
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index a70cd216be5d..295e2ff18a6a 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS {
 };
 
 /*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- *   0-6: metric from P4_PEBS_METRIC enum
- *    7 : reserved
- *    8 : reserved
- * 9-11 : reserved
- *
  * Note we have UOP and PEBS bits reserved for now
  * just in case if we will need them once
  */
@@ -788,5 +780,60 @@ enum P4_PEBS_METRIC {
 	P4_PEBS_METRIC__max
 };
 
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ *    32 bits valuable, we pack them into a single 64 bit
+ *    configuration. Low 32 bits of such config correspond
+ *    to low 32 bits of CCCR register and high 32 bits
+ *    correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ *    be found in Intel SDM but it should be noted that
+ *    we "borrow" some reserved bits for own usage and
+ *    clean them or set to a proper value when we do
+ *    a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ *    and should be either 0 or set to some predefined
+ *    values:
+ *
+ *    Low 32 bits
+ *    -----------
+ *      0-6: P4_PEBS_METRIC enum
+ *     7-11: reserved
+ *       12: reserved (Enable)
+ *    13-15: reserved (ESCR select)
+ *    16-17: Active Thread
+ *       18: Compare
+ *       19: Complement
+ *    20-23: Threshold
+ *       24: Edge
+ *       25: reserved (FORCE_OVF)
+ *       26: reserved (OVF_PMI_T0)
+ *       27: reserved (OVF_PMI_T1)
+ *    28-29: reserved
+ *       30: reserved (Cascade)
+ *       31: reserved (OVF)
+ *
+ *    High 32 bits
+ *    ------------
+ *        0: reserved (T1_USR)
+ *        1: reserved (T1_OS)
+ *        2: reserved (T0_USR)
+ *        3: reserved (T0_OS)
+ *        4: Tag Enable
+ *      5-8: Tag Value
+ *     9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ *    25-30: enum P4_EVENTS
+ *       31: reserved (HT thread)
+ */
+
 #endif /* PERF_EVENT_P4_H */
 
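A sketch of the packing scheme the comment describes (the header's real helpers follow this shape; the function names here are illustrative):

	/* low 32 bits of the config <- CCCR, high 32 bits <- ESCR */
	static inline u64 p4_config_pack(u32 escr, u32 cccr)
	{
		return ((u64)escr << 32) | cccr;
	}

	static inline u32 p4_config_unpack_escr(u64 config)
	{
		return (u32)(config >> 32);
	}

	static inline u32 p4_config_unpack_cccr(u64 config)
	{
		return (u32)(config & 0xffffffffULL);
	}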
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7f7e577a0e39..31d84acc1512 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
+void pvclock_resume(void);
 
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..6c22bf353f26 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
 		setup_IO_APIC();
 	else {
 		nr_ioapics = 0;
-		localise_nmi_watchdog();
 	}
 #endif
 }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..52b5c7ed3608 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
 #define _ASM_X86_STACKTRACE_H
 
 #include <linux/uaccess.h>
+#include <linux/ptrace.h>
 
 extern int kstack_depth_to_print;
 
@@ -46,7 +47,7 @@ struct stacktrace_ops {
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp,
+		unsigned long *stack,
 		const struct stacktrace_ops *ops, void *data);
 
 #ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	unsigned long bp;
+
+	if (regs)
+		return regs->bp;
+
+	if (task == current) {
+		/* Grab bp right from our regs */
+		get_bp(bp);
+		return bp;
+	}
+
+	/* bp is the last reg pushed by switch_to */
+	return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	return 0;
+}
+#endif
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *stack, unsigned long bp, char *log_lvl);
+		   unsigned long *stack, char *log_lvl);
 
 extern void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, unsigned long bp, char *log_lvl);
+		   unsigned long *sp, char *log_lvl);
 
 extern unsigned int code_bytes;
 
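With stack_frame() in the header, the stack walkers derive the frame pointer themselves, so every caller of dump_trace()/show_trace_log_lvl() drops its bp argument. A minimal before/after sketch of a hypothetical call site:

	/* before: dump_trace(task, regs, stack, 0, &backtrace_ops, data); */
	dump_trace(task, regs, stack, &backtrace_ops, data);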
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
 unsigned long long native_sched_clock(void);
 extern int recalibrate_cpu_khz(void);
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
 extern int no_timer_check;
 
 /* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..ce1d54c8a433 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
  * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
  * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
+#define MAX_CPUS_PER_UVHUB		64
+#define MAX_CPUS_PER_SOCKET		32
+#define UV_ADP_SIZE			64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS		32 /* hardware-provided max. */
 #define UV_ITEMS_PER_DESCRIPTOR		8
 /* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
-#define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
-#define UV_ADP_SIZE			32
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
 #define UV_NET_ENDPOINT_INTD		0x38
@@ -100,7 +102,6 @@
  * number of destination side software ack resources
  */
 #define DEST_NUM_RESOURCES		8
-#define MAX_CPUS_PER_NODE		32
 /*
  * completion statuses for sending a TLB flush message
  */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index e969f691cbfd..a501741c2335 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,8 @@ union uvh_apicid {
 #define UVH_APICID		0x002D0E00L
 #define UV_APIC_PNODE_SHIFT	6
 
+#define UV_APICID_HIBIT_MASK	0xffff0000
+
 /* Local Bus from cpu's perspective */
 #define LOCAL_BUS_BASE		0x1c00000
 #define LOCAL_BUS_SIZE		(4 * 1024 * 1024)
@@ -491,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
 	}
 }
 
+extern unsigned int uv_apicid_hibits;
 static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
 {
+	apicid |= uv_apicid_hibits;
 	return (1UL << UVH_IPI_INT_SEND_SHFT) |
 		((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
 		(mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 6d90adf4428a..20cafeac7455 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
  *
  * SGI UV MMR definitions
  *
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_X86_UV_UV_MMRS_H
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u {
 };
 
 /* ========================================================================= */
+/*                   UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK                     */
+/* ========================================================================= */
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
+
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uvh_lb_target_physical_apic_id_mask_u {
+	unsigned long v;
+	struct uvh_lb_target_physical_apic_id_mask_s {
+		unsigned long bit_enables : 32;  /* RW */
+		unsigned long rsvd_32_63  : 32;  /*    */
+	} s;
+};
+
+/* ========================================================================= */
 /*                                UVH_NODE_ID                                */
 /* ========================================================================= */
 #define UVH_NODE_ID 0x0UL
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..1c10c88ee4e1 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
 #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
 #endif
 
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START	mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END	mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES	((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
 
 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..8413688b2571 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
 /* And the trap vector is... */
 #define TRAP_INSTR "int $0x82"
 
+#define __MACH2PHYS_VIRT_START	0xF5800000
+#define __MACH2PHYS_VIRT_END	0xF6800000
+
+#define __MACH2PHYS_SHIFT	2
+
 /*
  * Virtual addresses beyond this are not modifiable by guest OSes. The
  * machine->physical mapping table starts at this address, read-only.
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..839a4811cf98 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
 #define __HYPERVISOR_VIRT_END	0xFFFF880000000000
 #define __MACH2PHYS_VIRT_START	0xFFFF800000000000
 #define __MACH2PHYS_VIRT_END	0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT	3
 
 /*
  * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
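With __MACH2PHYS_SHIFT defined per width (3 on 64-bit, 2 on 32-bit), the single MACH2PHYS_NR_ENTRIES definition in interface.h evaluates correctly on both sides; a worked check:

	/*
	 * 64-bit: (0xFFFF804000000000 - 0xFFFF800000000000) >> 3
	 *           = 0x4000000000 >> 3 = 0x800000000 eight-byte entries
	 * 32-bit: (0xF6800000 - 0xF5800000) >> 2
	 *           = 0x01000000 >> 2 = 0x400000 four-byte entries
	 */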
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index dd8c1414b3d5..8760cc60a21c 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/pfn.h>
+#include <linux/mm.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -35,6 +36,8 @@ typedef struct xpaddr {
 #define MAX_DOMAIN_PAGES						\
     ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
 
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int   machine_to_phys_order;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -69,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
 
-#if 0
 	if (unlikely((mfn >> machine_to_phys_order) != 0))
-		return max_mapnr;
-#endif
+		return ~0;
 
 	pfn = 0;
 	/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f60153d5de57..34244b2cd880 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -45,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
+obj-y			+= resource.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 1a5b9a8e6c4f..ec881c6bfee0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 {
 	unsigned int ver = 0;
 
+	if (id >= (MAX_LOCAL_APIC-1)) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return;
+	}
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
@@ -898,13 +903,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	register_lapic_address(acpi_lapic_addr);
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-				      acpi_parse_sapic, MAX_APICS);
+				      acpi_parse_sapic, MAX_LOCAL_APIC);
 
 	if (!count) {
 		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-					acpi_parse_x2apic, MAX_APICS);
+					acpi_parse_x2apic, MAX_LOCAL_APIC);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-					acpi_parse_lapic, MAX_APICS);
+					acpi_parse_lapic, MAX_LOCAL_APIC);
 	}
 	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5079f24c955a..123608531c8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 	mutex_unlock(&smp_alt);
 }
 
+bool skip_smp_alternatives;
 void alternatives_smp_switch(int smp)
 {
 	struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
368 printk("lockdep: fixing up alternatives.\n"); 369 printk("lockdep: fixing up alternatives.\n");
369#endif 370#endif
370 371
371 if (noreplace_smp || smp_alt_once) 372 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
372 return; 373 return;
373 BUG_ON(!smp && (num_online_cpus() > 1)); 374 BUG_ON(!smp && (num_online_cpus() > 1));
374 375
@@ -591,17 +592,21 @@ static atomic_t stop_machine_first;
 static int wrote_text;
 
 struct text_poke_params {
-	void *addr;
-	const void *opcode;
-	size_t len;
+	struct text_poke_param *params;
+	int nparams;
 };
 
 static int __kprobes stop_machine_text_poke(void *data)
 {
 	struct text_poke_params *tpp = data;
+	struct text_poke_param *p;
+	int i;
 
 	if (atomic_dec_and_test(&stop_machine_first)) {
-		text_poke(tpp->addr, tpp->opcode, tpp->len);
+		for (i = 0; i < tpp->nparams; i++) {
+			p = &tpp->params[i];
+			text_poke(p->addr, p->opcode, p->len);
+		}
 		smp_wmb();	/* Make sure other cpus see that this has run */
 		wrote_text = 1;
 	} else {
@@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data)
 		smp_mb();	/* Load wrote_text before following execution */
 	}
 
-	flush_icache_range((unsigned long)tpp->addr,
-			   (unsigned long)tpp->addr + tpp->len);
+	for (i = 0; i < tpp->nparams; i++) {
+		p = &tpp->params[i];
+		flush_icache_range((unsigned long)p->addr,
+				   (unsigned long)p->addr + p->len);
+	}
+
 	return 0;
 }
617 626
@@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data)
 void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 {
 	struct text_poke_params tpp;
+	struct text_poke_param p;
 
-	tpp.addr = addr;
-	tpp.opcode = opcode;
-	tpp.len = len;
+	p.addr = addr;
+	p.opcode = opcode;
+	p.len = len;
+	tpp.params = &p;
+	tpp.nparams = 1;
 	atomic_set(&stop_machine_first, 1);
 	wrote_text = 0;
 	/* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 	return addr;
 }
 
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. Since the
+ * stop_machine() is heavy task, it is better to aggregate text_poke requests
+ * and do it once if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+	struct text_poke_params tpp = {.params = params, .nparams = n};
+
+	atomic_set(&stop_machine_first, 1);
+	wrote_text = 0;
+	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 
 #ifdef CONFIG_X86_64
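A minimal usage sketch for the new batch API (hypothetical call site; the addresses, opcode buffers and lengths are assumptions). As the kernel-doc above notes, the caller must hold get_online_cpus() and text_mutex:

	struct text_poke_param params[] = {
		{ .addr = addr1, .opcode = insn1, .len = len1 },
		{ .addr = addr2, .opcode = insn2, .len = len2 },
	};

	get_online_cpus();
	mutex_lock(&text_mutex);
	text_poke_smp_batch(params, ARRAY_SIZE(params));
	mutex_unlock(&text_mutex);
	put_online_cpus();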
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..3966b564ea47 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,7 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+obj-y				+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c0f6426cd337..ce65d449b750 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -31,7 +31,6 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/dmi.h>
-#include <linux/nmi.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
 
@@ -432,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
 	reserved = reserve_eilvt_offset(offset, new);
 
 	if (reserved != new) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
-		       "vector 0x%x was already reserved by another core, "
-		       "APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reserved, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on another cpu\n",
+		       smp_processor_id(), reg, offset, new, reserved);
 		return -EINVAL;
 	}
 
 	if (!eilvt_entry_is_changeable(old, new)) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
-		       "register already in use, APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on this cpu\n",
+		       smp_processor_id(), reg, offset, new, old);
 		return -EBUSY;
 	}
 
@@ -799,11 +799,7 @@ void __init setup_boot_APIC_clock(void)
799 * PIT/HPET going. Otherwise register lapic as a dummy 799 * PIT/HPET going. Otherwise register lapic as a dummy
800 * device. 800 * device.
801 */ 801 */
802 if (nmi_watchdog != NMI_IO_APIC) 802 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
803 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
804 else
805 pr_warning("APIC timer registered as dummy,"
806 " due to nmi_watchdog=%d!\n", nmi_watchdog);
807 803
808 /* Setup the lapic or request the broadcast */ 804 /* Setup the lapic or request the broadcast */
809 setup_APIC_timer(); 805 setup_APIC_timer();
@@ -1384,8 +1380,15 @@ void __cpuinit end_local_APIC_setup(void)
1384 } 1380 }
1385#endif 1381#endif
1386 1382
1387 setup_apic_nmi_watchdog(NULL);
1388 apic_pm_activate(); 1383 apic_pm_activate();
1384
1385 /*
1386 * Now that local APIC setup is completed for BP, configure the fault
1387 * handling for interrupt remapping.
1388 */
1389 if (!smp_processor_id() && intr_remapping_enabled)
1390 enable_drhd_fault_handling();
1391
1389} 1392}
1390 1393
1391#ifdef CONFIG_X86_X2APIC 1394#ifdef CONFIG_X86_X2APIC
@@ -1694,7 +1697,7 @@ void __init register_lapic_address(unsigned long address)
1694 * This initializes the IO-APIC and APIC hardware if this is 1697 * This initializes the IO-APIC and APIC hardware if this is
1695 * a UP kernel. 1698 * a UP kernel.
1696 */ 1699 */
1697int apic_version[MAX_APICS]; 1700int apic_version[MAX_LOCAL_APIC];
1698 1701
1699int __init APIC_init_uniprocessor(void) 1702int __init APIC_init_uniprocessor(void)
1700{ 1703{
@@ -1759,17 +1762,10 @@ int __init APIC_init_uniprocessor(void)
1759 setup_IO_APIC(); 1762 setup_IO_APIC();
1760 else { 1763 else {
1761 nr_ioapics = 0; 1764 nr_ioapics = 0;
1762 localise_nmi_watchdog();
1763 } 1765 }
1764#else
1765 localise_nmi_watchdog();
1766#endif 1766#endif
1767 1767
1768 x86_init.timers.setup_percpu_clockev(); 1768 x86_init.timers.setup_percpu_clockev();
1769#ifdef CONFIG_X86_64
1770 check_nmi_watchdog();
1771#endif
1772
1773 return 0; 1769 return 0;
1774} 1770}
1775 1771
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..72ec29e1ae06 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,19 +17,31 @@
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19 19
20/* For reliability, we're prepared to waste bits here. */ 20#ifdef CONFIG_HARDLOCKUP_DETECTOR
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22
23u64 hw_nmi_get_sample_period(void) 21u64 hw_nmi_get_sample_period(void)
24{ 22{
25 return (u64)(cpu_khz) * 1000 * 60; 23 return (u64)(cpu_khz) * 1000 * 60;
26} 24}
25#endif
26
27#ifdef arch_trigger_all_cpu_backtrace
28/* For reliability, we're prepared to waste bits here. */
29static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
30
31/* "in progress" flag of arch_trigger_all_cpu_backtrace */
32static unsigned long backtrace_flag;
27 33
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void) 34void arch_trigger_all_cpu_backtrace(void)
30{ 35{
31 int i; 36 int i;
32 37
38 if (test_and_set_bit(0, &backtrace_flag))
39 /*
40 * If there is already a trigger_all_cpu_backtrace() in progress
41 * (backtrace_flag == 1), don't output duplicate cpu dump info.
42 */
43 return;
44
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); 45 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34 46
35 printk(KERN_INFO "sending NMI to all CPUs:\n"); 47 printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void)
41 break; 53 break;
42 mdelay(1); 54 mdelay(1);
43 } 55 }
56
57 clear_bit(0, &backtrace_flag);
58 smp_mb__after_clear_bit();
44} 59}
45 60
46static int __kprobes 61static int __kprobes
@@ -49,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
49{ 64{
50 struct die_args *args = __args; 65 struct die_args *args = __args;
51 struct pt_regs *regs; 66 struct pt_regs *regs;
52 int cpu = smp_processor_id(); 67 int cpu;
53 68
54 switch (cmd) { 69 switch (cmd) {
55 case DIE_NMI: 70 case DIE_NMI:
@@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
61 } 76 }
62 77
63 regs = args->regs; 78 regs = args->regs;
79 cpu = smp_processor_id();
64 80
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 81 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; 82 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -90,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
90} 106}
91early_initcall(register_trigger_all_cpu_backtrace); 107early_initcall(register_trigger_all_cpu_backtrace);
92#endif 108#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
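The backtrace_flag introduced above is a standard single-owner guard: an atomic test-and-set elects one caller and everyone else bails out immediately. A minimal sketch of the same pattern, with the one-shot work elided:

    static unsigned long op_in_progress;

    void do_once_systemwide(void)
    {
            /* first caller wins; concurrent callers return at once */
            if (test_and_set_bit(0, &op_in_progress))
                    return;

            /* ... the one-shot work (send NMIs, wait for dumps) ... */

            clear_bit(0, &op_in_progress);
            smp_mb__after_clear_bit();      /* 2.6-era release barrier */
    }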
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index bb61a552d8c6..52735a710c30 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -54,7 +54,6 @@
54#include <asm/dma.h> 54#include <asm/dma.h>
55#include <asm/timer.h> 55#include <asm/timer.h>
56#include <asm/i8259.h> 56#include <asm/i8259.h>
57#include <asm/nmi.h>
58#include <asm/msidef.h> 57#include <asm/msidef.h>
59#include <asm/hypertransport.h> 58#include <asm/hypertransport.h>
60#include <asm/setup.h> 59#include <asm/setup.h>
@@ -2458,13 +2457,12 @@ static void ack_apic_level(struct irq_data *data)
2458{ 2457{
2459 struct irq_cfg *cfg = data->chip_data; 2458 struct irq_cfg *cfg = data->chip_data;
2460 int i, do_unmask_irq = 0, irq = data->irq; 2459 int i, do_unmask_irq = 0, irq = data->irq;
2461 struct irq_desc *desc = irq_to_desc(irq);
2462 unsigned long v; 2460 unsigned long v;
2463 2461
2464 irq_complete_move(cfg); 2462 irq_complete_move(cfg);
2465#ifdef CONFIG_GENERIC_PENDING_IRQ 2463#ifdef CONFIG_GENERIC_PENDING_IRQ
2466 /* If we are moving the irq we need to mask it */ 2464 /* If we are moving the irq we need to mask it */
2467 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2465 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
2468 do_unmask_irq = 1; 2466 do_unmask_irq = 1;
2469 mask_ioapic(cfg); 2467 mask_ioapic(cfg);
2470 } 2468 }
@@ -2671,24 +2669,6 @@ static void lapic_register_intr(int irq)
2671 "edge"); 2669 "edge");
2672} 2670}
2673 2671
2674static void __init setup_nmi(void)
2675{
2676 /*
2677 * Dirty trick to enable the NMI watchdog ...
2678 * We put the 8259A master into AEOI mode and
2679 * unmask on all local APICs LVT0 as NMI.
2680 *
2681 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2682 * is from Maciej W. Rozycki - so we do not have to EOI from
2683 * the NMI handler or the timer interrupt.
2684 */
2685 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2686
2687 enable_NMI_through_LVT0();
2688
2689 apic_printk(APIC_VERBOSE, " done.\n");
2690}
2691
2692/* 2672/*
2693 * This looks a bit hackish but it's about the only one way of sending 2673 * This looks a bit hackish but it's about the only one way of sending
2694 * a few INTA cycles to 8259As and any associated glue logic. ICR does 2674 * a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2794,15 +2774,6 @@ static inline void __init check_timer(void)
2794 */ 2774 */
2795 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2775 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2796 legacy_pic->init(1); 2776 legacy_pic->init(1);
2797#ifdef CONFIG_X86_32
2798 {
2799 unsigned int ver;
2800
2801 ver = apic_read(APIC_LVR);
2802 ver = GET_APIC_VERSION(ver);
2803 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2804 }
2805#endif
2806 2777
2807 pin1 = find_isa_irq_pin(0, mp_INT); 2778 pin1 = find_isa_irq_pin(0, mp_INT);
2808 apic1 = find_isa_irq_apic(0, mp_INT); 2779 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2850,10 +2821,6 @@ static inline void __init check_timer(void)
2850 unmask_ioapic(cfg); 2821 unmask_ioapic(cfg);
2851 } 2822 }
2852 if (timer_irq_works()) { 2823 if (timer_irq_works()) {
2853 if (nmi_watchdog == NMI_IO_APIC) {
2854 setup_nmi();
2855 legacy_pic->unmask(0);
2856 }
2857 if (disable_timer_pin_1 > 0) 2824 if (disable_timer_pin_1 > 0)
2858 clear_IO_APIC_pin(0, pin1); 2825 clear_IO_APIC_pin(0, pin1);
2859 goto out; 2826 goto out;
@@ -2879,11 +2846,6 @@ static inline void __init check_timer(void)
2879 if (timer_irq_works()) { 2846 if (timer_irq_works()) {
2880 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2847 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2881 timer_through_8259 = 1; 2848 timer_through_8259 = 1;
2882 if (nmi_watchdog == NMI_IO_APIC) {
2883 legacy_pic->mask(0);
2884 setup_nmi();
2885 legacy_pic->unmask(0);
2886 }
2887 goto out; 2849 goto out;
2888 } 2850 }
2889 /* 2851 /*
@@ -2895,15 +2857,6 @@ static inline void __init check_timer(void)
2895 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2857 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2896 } 2858 }
2897 2859
2898 if (nmi_watchdog == NMI_IO_APIC) {
2899 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2900 "through the IO-APIC - disabling NMI Watchdog!\n");
2901 nmi_watchdog = NMI_NONE;
2902 }
2903#ifdef CONFIG_X86_32
2904 timer_ack = 0;
2905#endif
2906
2907 apic_printk(APIC_QUIET, KERN_INFO 2860 apic_printk(APIC_QUIET, KERN_INFO
2908 "...trying to set up timer as Virtual Wire IRQ...\n"); 2861 "...trying to set up timer as Virtual Wire IRQ...\n");
2909 2862
@@ -3441,6 +3394,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3441 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3394 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3442 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3395 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3443 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3396 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3397 msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
3444 3398
3445 dmar_msi_write(irq, &msg); 3399 dmar_msi_write(irq, &msg);
3446 3400
@@ -4133,7 +4087,8 @@ void __init pre_init_apic_IRQ0(void)
4133 4087
4134 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4088 printk(KERN_INFO "Early APIC setup for system timer0\n");
4135#ifndef CONFIG_SMP 4089#ifndef CONFIG_SMP
4136 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 4090 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4091 &phys_cpu_present_map);
4137#endif 4092#endif
4138 /* Make sure the irq descriptor is set up */ 4093 /* Make sure the irq descriptor is set up */
4139 cfg = alloc_irq_and_cfg_at(0, 0); 4094 cfg = alloc_irq_and_cfg_at(0, 0);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index c90041ccb742..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <asm/apic.h>
15
16#include <linux/nmi.h>
17#include <linux/mm.h>
18#include <linux/delay.h>
19#include <linux/interrupt.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/sysdev.h>
23#include <linux/sysctl.h>
24#include <linux/percpu.h>
25#include <linux/kprobes.h>
26#include <linux/cpumask.h>
27#include <linux/kernel_stat.h>
28#include <linux/kdebug.h>
29#include <linux/smp.h>
30
31#include <asm/i8259.h>
32#include <asm/io_apic.h>
33#include <asm/proto.h>
34#include <asm/timer.h>
35
36#include <asm/mce.h>
37
38#include <asm/mach_traps.h>
39
40int unknown_nmi_panic;
41int nmi_watchdog_enabled;
42
43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
45
46/* nmi_active:
47 * >0: the lapic NMI watchdog is active, but can be disabled
48 * <0: the lapic NMI watchdog has not been set up, and cannot
49 * be enabled
50 * 0: the lapic NMI watchdog is disabled, but can be enabled
51 */
52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
58static int panic_on_timeout;
59
60static unsigned int nmi_hz = HZ;
61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
63
64static inline unsigned int get_nmi_count(int cpu)
65{
66 return per_cpu(irq_stat, cpu).__nmi_count;
67}
68
69static inline int mce_in_progress(void)
70{
71#if defined(CONFIG_X86_MCE)
72 return atomic_read(&mce_entry) > 0;
73#endif
74 return 0;
75}
76
77/*
78 * Take the local apic timer and PIT/HPET into account. We don't
79 * know which one is active, when we have highres/dyntick on
80 */
81static inline unsigned int get_timer_irqs(int cpu)
82{
83 return per_cpu(irq_stat, cpu).apic_timer_irqs +
84 per_cpu(irq_stat, cpu).irq0_irqs;
85}
86
87#ifdef CONFIG_SMP
88/*
89 * The performance counters used by NMI_LOCAL_APIC don't trigger when
90 * the CPU is idle. To make sure the NMI watchdog really ticks on all
91 * CPUs during the test make them busy.
92 */
93static __init void nmi_cpu_busy(void *data)
94{
95 local_irq_enable_in_hardirq();
96 /*
97 * Intentionally don't use cpu_relax here. This is
98 * to make sure that the performance counter really ticks,
99 * even if there is a simulator or similar that catches the
100 * pause instruction. On a real HT machine this is fine because
101 * all other CPUs are busy with "useless" delay loops and don't
102 * care if they get somewhat less cycles.
103 */
104 while (endflag == 0)
105 mb();
106}
107#endif
108
109static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
110{
111 printk(KERN_CONT "\n");
112
113 printk(KERN_WARNING
114 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
116
117 printk(KERN_WARNING
118 "Please report this to bugzilla.kernel.org,\n");
119 printk(KERN_WARNING
120 "and attach the output of the 'dmesg' command.\n");
121
122 per_cpu(wd_enabled, cpu) = 0;
123 atomic_dec(&nmi_active);
124}
125
126static void __acpi_nmi_disable(void *__unused)
127{
128 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
129}
130
131int __init check_nmi_watchdog(void)
132{
133 unsigned int *prev_nmi_count;
134 int cpu;
135
136 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
137 return 0;
138
139 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
140 if (!prev_nmi_count)
141 goto error;
142
143 printk(KERN_INFO "Testing NMI watchdog ... ");
144
145#ifdef CONFIG_SMP
146 if (nmi_watchdog == NMI_LOCAL_APIC)
147 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
148#endif
149
150 for_each_possible_cpu(cpu)
151 prev_nmi_count[cpu] = get_nmi_count(cpu);
152 local_irq_enable();
153 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
154
155 for_each_online_cpu(cpu) {
156 if (!per_cpu(wd_enabled, cpu))
157 continue;
158 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
159 report_broken_nmi(cpu, prev_nmi_count);
160 }
161 endflag = 1;
162 if (!atomic_read(&nmi_active)) {
163 kfree(prev_nmi_count);
164 atomic_set(&nmi_active, -1);
165 goto error;
166 }
167 printk("OK.\n");
168
169 /*
170 * now that we know it works we can reduce NMI frequency to
171 * something more reasonable; makes a difference in some configs
172 */
173 if (nmi_watchdog == NMI_LOCAL_APIC)
174 nmi_hz = lapic_adjust_nmi_hz(1);
175
176 kfree(prev_nmi_count);
177 return 0;
178error:
179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259)
181 legacy_pic->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 }
184
185#ifdef CONFIG_X86_32
186 timer_ack = 0;
187#endif
188 return -1;
189}
190
191static int __init setup_nmi_watchdog(char *str)
192{
193 unsigned int nmi;
194
195 if (!strncmp(str, "panic", 5)) {
196 panic_on_timeout = 1;
197 str = strchr(str, ',');
198 if (!str)
199 return 1;
200 ++str;
201 }
202
203 if (!strncmp(str, "lapic", 5))
204 nmi_watchdog = NMI_LOCAL_APIC;
205 else if (!strncmp(str, "ioapic", 6))
206 nmi_watchdog = NMI_IO_APIC;
207 else {
208 get_option(&str, &nmi);
209 if (nmi >= NMI_INVALID)
210 return 0;
211 nmi_watchdog = nmi;
212 }
213
214 return 1;
215}
216__setup("nmi_watchdog=", setup_nmi_watchdog);
217
218/*
219 * Suspend/resume support
220 */
221#ifdef CONFIG_PM
222
223static int nmi_pm_active; /* nmi_active before suspend */
224
225static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
226{
227 /* only CPU0 goes here, other CPUs should be offline */
228 nmi_pm_active = atomic_read(&nmi_active);
229 stop_apic_nmi_watchdog(NULL);
230 BUG_ON(atomic_read(&nmi_active) != 0);
231 return 0;
232}
233
234static int lapic_nmi_resume(struct sys_device *dev)
235{
236 /* only CPU0 goes here, other CPUs should be offline */
237 if (nmi_pm_active > 0) {
238 setup_apic_nmi_watchdog(NULL);
239 touch_nmi_watchdog();
240 }
241 return 0;
242}
243
244static struct sysdev_class nmi_sysclass = {
245 .name = "lapic_nmi",
246 .resume = lapic_nmi_resume,
247 .suspend = lapic_nmi_suspend,
248};
249
250static struct sys_device device_lapic_nmi = {
251 .id = 0,
252 .cls = &nmi_sysclass,
253};
254
255static int __init init_lapic_nmi_sysfs(void)
256{
257 int error;
258
259 /*
260 * should really be a BUG_ON but b/c this is an
261 * init call, it just doesn't work. -dcz
262 */
263 if (nmi_watchdog != NMI_LOCAL_APIC)
264 return 0;
265
266 if (atomic_read(&nmi_active) < 0)
267 return 0;
268
269 error = sysdev_class_register(&nmi_sysclass);
270 if (!error)
271 error = sysdev_register(&device_lapic_nmi);
272 return error;
273}
274
275/* must come after the local APIC's device_initcall() */
276late_initcall(init_lapic_nmi_sysfs);
277
278#endif /* CONFIG_PM */
279
280static void __acpi_nmi_enable(void *__unused)
281{
282 apic_write(APIC_LVT0, APIC_DM_NMI);
283}
284
285/*
286 * Enable timer based NMIs on all CPUs:
287 */
288void acpi_nmi_enable(void)
289{
290 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
291 on_each_cpu(__acpi_nmi_enable, NULL, 1);
292}
293
294/*
295 * Disable timer based NMIs on all CPUs:
296 */
297void acpi_nmi_disable(void)
298{
299 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
300 on_each_cpu(__acpi_nmi_disable, NULL, 1);
301}
302
303/*
304 * This function is called as soon the LAPIC NMI watchdog driver has everything
305 * in place and it's ready to check if the NMIs belong to the NMI watchdog
306 */
307void cpu_nmi_set_wd_enabled(void)
308{
309 __get_cpu_var(wd_enabled) = 1;
310}
311
312void setup_apic_nmi_watchdog(void *unused)
313{
314 if (__get_cpu_var(wd_enabled))
315 return;
316
317 /* cheap hack to support suspend/resume */
318 /* if cpu0 is not active neither should the other cpus */
319 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
320 return;
321
322 switch (nmi_watchdog) {
323 case NMI_LOCAL_APIC:
324 if (lapic_watchdog_init(nmi_hz) < 0) {
325 __get_cpu_var(wd_enabled) = 0;
326 return;
327 }
328 /* FALL THROUGH */
329 case NMI_IO_APIC:
330 __get_cpu_var(wd_enabled) = 1;
331 atomic_inc(&nmi_active);
332 }
333}
334
335void stop_apic_nmi_watchdog(void *unused)
336{
337 /* only support LOCAL and IO APICs for now */
338 if (!nmi_watchdog_active())
339 return;
340 if (__get_cpu_var(wd_enabled) == 0)
341 return;
342 if (nmi_watchdog == NMI_LOCAL_APIC)
343 lapic_watchdog_stop();
344 else
345 __acpi_nmi_disable(NULL);
346 __get_cpu_var(wd_enabled) = 0;
347 atomic_dec(&nmi_active);
348}
349
350/*
351 * the best way to detect whether a CPU has a 'hard lockup' problem
352 * is to check its local APIC timer IRQ counts. If they are not
353 * changing then that CPU has some problem.
354 *
355 * as these watchdog NMI IRQs are generated on every CPU, we only
356 * have to check the current processor.
357 *
358 * since NMIs don't listen to _any_ locks, we have to be extremely
359 * careful not to rely on unsafe variables. The printk might lock
360 * up though, so we have to break up any console locks first ...
361 * [when there will be more tty-related locks, break them up here too!]
362 */
363
364static DEFINE_PER_CPU(unsigned, last_irq_sum);
365static DEFINE_PER_CPU(long, alert_counter);
366static DEFINE_PER_CPU(int, nmi_touch);
367
368void touch_nmi_watchdog(void)
369{
370 if (nmi_watchdog_active()) {
371 unsigned cpu;
372
373 /*
374 * Tell other CPUs to reset their alert counters. We cannot
375 * do it ourselves because the alert count increase is not
376 * atomic.
377 */
378 for_each_present_cpu(cpu) {
379 if (per_cpu(nmi_touch, cpu) != 1)
380 per_cpu(nmi_touch, cpu) = 1;
381 }
382 }
383
384 /*
385 * Tickle the softlockup detector too:
386 */
387 touch_softlockup_watchdog();
388}
389EXPORT_SYMBOL(touch_nmi_watchdog);
390
391notrace __kprobes int
392nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
393{
394 /*
395 * Since current_thread_info() is always on the stack, and we
396 * always switch the stack NMI-atomically, it's safe to use
397 * smp_processor_id().
398 */
399 unsigned int sum;
400 int touched = 0;
401 int cpu = smp_processor_id();
402 int rc = 0;
403
404 sum = get_timer_irqs(cpu);
405
406 if (__get_cpu_var(nmi_touch)) {
407 __get_cpu_var(nmi_touch) = 0;
408 touched = 1;
409 }
410
411 /* We can be called before check_nmi_watchdog, hence NULL check. */
412 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
413 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
414
415 raw_spin_lock(&lock);
416 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
417 show_regs(regs);
418 dump_stack();
419 raw_spin_unlock(&lock);
420 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
421
422 rc = 1;
423 }
424
425 /* Could check oops_in_progress here too, but it's safer not to */
426 if (mce_in_progress())
427 touched = 1;
428
429 /* if none of the timers is firing, this cpu isn't doing much */
430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
431 /*
432 * Ayiee, looks like this CPU is stuck ...
433 * wait a few IRQs (5 seconds) before doing the oops ...
434 */
435 __this_cpu_inc(alert_counter);
436 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
437 /*
438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
442 } else {
443 __get_cpu_var(last_irq_sum) = sum;
444 __this_cpu_write(alert_counter, 0);
445 }
446
447 /* see if the nmi watchdog went off */
448 if (!__get_cpu_var(wd_enabled))
449 return rc;
450 switch (nmi_watchdog) {
451 case NMI_LOCAL_APIC:
452 rc |= lapic_wd_event(nmi_hz);
453 break;
454 case NMI_IO_APIC:
455 /*
456 * We don't know how to accurately check for this,
457 * so just assume it was a watchdog timer interrupt.
458 * This matches the old behaviour.
459 */
460 rc = 1;
461 break;
462 }
463 return rc;
464}
465
466#ifdef CONFIG_SYSCTL
467
468static void enable_ioapic_nmi_watchdog_single(void *unused)
469{
470 __get_cpu_var(wd_enabled) = 1;
471 atomic_inc(&nmi_active);
472 __acpi_nmi_enable(NULL);
473}
474
475static void enable_ioapic_nmi_watchdog(void)
476{
477 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
478 touch_nmi_watchdog();
479}
480
481static void disable_ioapic_nmi_watchdog(void)
482{
483 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
484}
485
486static int __init setup_unknown_nmi_panic(char *str)
487{
488 unknown_nmi_panic = 1;
489 return 1;
490}
491__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
492
493static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
494{
495 unsigned char reason = get_nmi_reason();
496 char buf[64];
497
498 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
499 die_nmi(buf, regs, 1); /* Always panic here */
500 return 0;
501}
502
503/*
504 * proc handler for /proc/sys/kernel/nmi
505 */
506int proc_nmi_enabled(struct ctl_table *table, int write,
507 void __user *buffer, size_t *length, loff_t *ppos)
508{
509 int old_state;
510
511 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
512 old_state = nmi_watchdog_enabled;
513 proc_dointvec(table, write, buffer, length, ppos);
514 if (!!old_state == !!nmi_watchdog_enabled)
515 return 0;
516
517 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
518 printk(KERN_WARNING
519 "NMI watchdog is permanently disabled\n");
520 return -EIO;
521 }
522
523 if (nmi_watchdog == NMI_LOCAL_APIC) {
524 if (nmi_watchdog_enabled)
525 enable_lapic_nmi_watchdog();
526 else
527 disable_lapic_nmi_watchdog();
528 } else if (nmi_watchdog == NMI_IO_APIC) {
529 if (nmi_watchdog_enabled)
530 enable_ioapic_nmi_watchdog();
531 else
532 disable_ioapic_nmi_watchdog();
533 } else {
534 printk(KERN_WARNING
535 "NMI watchdog doesn't know what hardware to touch\n");
536 return -EIO;
537 }
538 return 0;
539}
540
541#endif /* CONFIG_SYSCTL */
542
543int do_nmi_callback(struct pt_regs *regs, int cpu)
544{
545#ifdef CONFIG_SYSCTL
546 if (unknown_nmi_panic)
547 return unknown_nmi_panic_callback(regs, cpu);
548#endif
549 return 0;
550}
551
552void arch_trigger_all_cpu_backtrace(void)
553{
554 int i;
555
556 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
557
558 printk(KERN_INFO "sending NMI to all CPUs:\n");
559 apic->send_IPI_all(NMI_VECTOR);
560
561 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
562 for (i = 0; i < 10 * 1000; i++) {
563 if (cpumask_empty(to_cpumask(backtrace_mask)))
564 break;
565 mdelay(1);
566 }
567}
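For reference, the heart of the just-deleted watchdog is the nmi_watchdog_tick() heuristic: if a CPU's timer-interrupt count has not moved between watchdog NMIs for about five seconds, declare a lockup. A stripped-down standalone sketch of that logic (per-CPU storage and the touch/MCE escape hatches elided):

    static unsigned int last_irq_sum;
    static long alert_counter;

    /* sum = timer irqs seen so far; nmi_hz = watchdog NMIs per second */
    int lockup_suspected(unsigned int sum, unsigned int nmi_hz)
    {
            if (sum == last_irq_sum) {
                    /* no timer activity since the previous NMI */
                    if (++alert_counter == 5 * nmi_hz)
                            return 1;       /* stuck for ~5 seconds */
            } else {
                    last_irq_sum = sum;
                    alert_counter = 0;
            }
            return 0;
    }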
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index f9e4e6a54073..d8c4a6feb286 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void)
79 /* need to update phys_pkg_id */ 79 /* need to update phys_pkg_id */
80 apic->phys_pkg_id = apicid_phys_pkg_id; 80 apic->phys_pkg_id = apicid_phys_pkg_id;
81 } 81 }
82
83 /*
84 * Now that apic routing model is selected, configure the
85 * fault handling for intr remapping.
86 */
87 if (intr_remapping_enabled)
88 enable_drhd_fault_handling();
89} 82}
90 83
91/* Same for both flat and physical. */ 84/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 194539aea175..2a3f2a7db243 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -44,8 +44,20 @@ static u64 gru_start_paddr, gru_end_paddr;
44static union uvh_apicid uvh_apicid; 44static union uvh_apicid uvh_apicid;
45int uv_min_hub_revision_id; 45int uv_min_hub_revision_id;
46EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 46EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
47unsigned int uv_apicid_hibits;
48EXPORT_SYMBOL_GPL(uv_apicid_hibits);
47static DEFINE_SPINLOCK(uv_nmi_lock); 49static DEFINE_SPINLOCK(uv_nmi_lock);
48 50
51static unsigned long __init uv_early_read_mmr(unsigned long addr)
52{
53 unsigned long val, *mmr;
54
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
56 val = *mmr;
57 early_iounmap(mmr, sizeof(*mmr));
58 return val;
59}
60
49static inline bool is_GRU_range(u64 start, u64 end) 61static inline bool is_GRU_range(u64 start, u64 end)
50{ 62{
51 return start >= gru_start_paddr && end <= gru_end_paddr; 63 return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -56,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
56 return is_ISA_range(start, end) || is_GRU_range(start, end); 68 return is_ISA_range(start, end) || is_GRU_range(start, end);
57} 69}
58 70
59static int early_get_nodeid(void) 71static int __init early_get_pnodeid(void)
60{ 72{
61 union uvh_node_id_u node_id; 73 union uvh_node_id_u node_id;
62 unsigned long *mmr; 74 union uvh_rh_gam_config_mmr_u m_n_config;
63 75 int pnode;
64 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
65 node_id.v = *mmr;
66 early_iounmap(mmr, sizeof(*mmr));
67 76
68 /* Currently, all blades have same revision number */ 77 /* Currently, all blades have same revision number */
78 node_id.v = uv_early_read_mmr(UVH_NODE_ID);
79 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
69 uv_min_hub_revision_id = node_id.s.revision; 80 uv_min_hub_revision_id = node_id.s.revision;
70 81
71 return node_id.s.node_id; 82 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
83 return pnode;
72} 84}
73 85
74static void __init early_get_apic_pnode_shift(void) 86static void __init early_get_apic_pnode_shift(void)
75{ 87{
76 unsigned long *mmr; 88 uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
77
78 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
79 uvh_apicid.v = *mmr;
80 early_iounmap(mmr, sizeof(*mmr));
81 if (!uvh_apicid.v) 89 if (!uvh_apicid.v)
82 /* 90 /*
83 * Old bios, use default value 91 * Old bios, use default value
@@ -85,12 +93,25 @@ static void __init early_get_apic_pnode_shift(void)
85 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; 93 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
86} 94}
87 95
96/*
97 * Add an extra bit as dictated by bios to the destination apicid of
98 * interrupts potentially passing through the UV HUB. This prevents
99 * a deadlock between interrupts and IO port operations.
100 */
101static void __init uv_set_apicid_hibit(void)
102{
103 union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
104
105 apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
106 uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
107}
108
88static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 109static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
89{ 110{
90 int nodeid; 111 int pnodeid;
91 112
92 if (!strcmp(oem_id, "SGI")) { 113 if (!strcmp(oem_id, "SGI")) {
93 nodeid = early_get_nodeid(); 114 pnodeid = early_get_pnodeid();
94 early_get_apic_pnode_shift(); 115 early_get_apic_pnode_shift();
95 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 116 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
96 x86_platform.nmi_init = uv_nmi_init; 117 x86_platform.nmi_init = uv_nmi_init;
@@ -100,8 +121,9 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
100 uv_system_type = UV_X2APIC; 121 uv_system_type = UV_X2APIC;
101 else if (!strcmp(oem_table_id, "UVH")) { 122 else if (!strcmp(oem_table_id, "UVH")) {
102 __get_cpu_var(x2apic_extra_bits) = 123 __get_cpu_var(x2apic_extra_bits) =
103 nodeid << (uvh_apicid.s.pnode_shift - 1); 124 pnodeid << uvh_apicid.s.pnode_shift;
104 uv_system_type = UV_NON_UNIQUE_APIC; 125 uv_system_type = UV_NON_UNIQUE_APIC;
126 uv_set_apicid_hibit();
105 return 1; 127 return 1;
106 } 128 }
107 } 129 }
@@ -155,6 +177,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
155 int pnode; 177 int pnode;
156 178
157 pnode = uv_apicid_to_pnode(phys_apicid); 179 pnode = uv_apicid_to_pnode(phys_apicid);
180 phys_apicid |= uv_apicid_hibits;
158 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 181 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
159 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 182 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
160 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 183 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -236,7 +259,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
236 int cpu = cpumask_first(cpumask); 259 int cpu = cpumask_first(cpumask);
237 260
238 if ((unsigned)cpu < nr_cpu_ids) 261 if ((unsigned)cpu < nr_cpu_ids)
239 return per_cpu(x86_cpu_to_apicid, cpu); 262 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
240 else 263 else
241 return BAD_APICID; 264 return BAD_APICID;
242} 265}
@@ -255,7 +278,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
255 if (cpumask_test_cpu(cpu, cpu_online_mask)) 278 if (cpumask_test_cpu(cpu, cpu_online_mask))
256 break; 279 break;
257 } 280 }
258 return per_cpu(x86_cpu_to_apicid, cpu); 281 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
259} 282}
260 283
261static unsigned int x2apic_get_apic_id(unsigned long x) 284static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -661,27 +684,32 @@ void uv_nmi_init(void)
661void __init uv_system_init(void) 684void __init uv_system_init(void)
662{ 685{
663 union uvh_rh_gam_config_mmr_u m_n_config; 686 union uvh_rh_gam_config_mmr_u m_n_config;
687 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
664 union uvh_node_id_u node_id; 688 union uvh_node_id_u node_id;
665 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 689 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
666 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 690 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
667 int gnode_extra, max_pnode = 0; 691 int gnode_extra, max_pnode = 0;
668 unsigned long mmr_base, present, paddr; 692 unsigned long mmr_base, present, paddr;
669 unsigned short pnode_mask; 693 unsigned short pnode_mask, pnode_io_mask;
670 694
671 map_low_mmrs(); 695 map_low_mmrs();
672 696
673 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); 697 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
674 m_val = m_n_config.s.m_skt; 698 m_val = m_n_config.s.m_skt;
675 n_val = m_n_config.s.n_skt; 699 n_val = m_n_config.s.n_skt;
700 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
701 n_io = mmioh.s.n_io;
676 mmr_base = 702 mmr_base =
677 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 703 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
678 ~UV_MMR_ENABLE; 704 ~UV_MMR_ENABLE;
679 pnode_mask = (1 << n_val) - 1; 705 pnode_mask = (1 << n_val) - 1;
706 pnode_io_mask = (1 << n_io) - 1;
707
680 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 708 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
681 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 709 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
682 gnode_upper = ((unsigned long)gnode_extra << m_val); 710 gnode_upper = ((unsigned long)gnode_extra << m_val);
683 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", 711 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
684 n_val, m_val, gnode_upper, gnode_extra); 712 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
685 713
686 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 714 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
687 715
@@ -714,7 +742,7 @@ void __init uv_system_init(void)
714 for (j = 0; j < 64; j++) { 742 for (j = 0; j < 64; j++) {
715 if (!test_bit(j, &present)) 743 if (!test_bit(j, &present))
716 continue; 744 continue;
717 pnode = (i * 64 + j); 745 pnode = (i * 64 + j) & pnode_mask;
718 uv_blade_info[blade].pnode = pnode; 746 uv_blade_info[blade].pnode = pnode;
719 uv_blade_info[blade].nr_possible_cpus = 0; 747 uv_blade_info[blade].nr_possible_cpus = 0;
720 uv_blade_info[blade].nr_online_cpus = 0; 748 uv_blade_info[blade].nr_online_cpus = 0;
@@ -735,6 +763,7 @@ void __init uv_system_init(void)
735 /* 763 /*
736 * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); 764 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
737 */ 765 */
766 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
738 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; 767 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
739 pnode = uv_apicid_to_pnode(apicid); 768 pnode = uv_apicid_to_pnode(apicid);
740 blade = boot_pnode_to_blade(pnode); 769 blade = boot_pnode_to_blade(pnode);
@@ -751,7 +780,6 @@ void __init uv_system_init(void)
751 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 780 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
752 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 781 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
753 uv_cpu_hub_info(cpu)->pnode = pnode; 782 uv_cpu_hub_info(cpu)->pnode = pnode;
754 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
755 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; 783 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
756 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 784 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
757 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 785 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -775,7 +803,7 @@ void __init uv_system_init(void)
775 803
776 map_gru_high(max_pnode); 804 map_gru_high(max_pnode);
777 map_mmr_high(max_pnode); 805 map_mmr_high(max_pnode);
778 map_mmioh_high(max_pnode); 806 map_mmioh_high(max_pnode & pnode_io_mask);
779 807
780 uv_cpu_init(); 808 uv_cpu_init();
781 uv_scir_register_cpu_notifier(); 809 uv_scir_register_cpu_notifier();
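The recurring "| uv_apicid_hibits" above is the whole mechanism: a BIOS-dictated high bit is OR-ed into every destination APIC id that might route through the UV hub. A tiny sketch of the scheme, where the mask value is a placeholder rather than the real UV_APICID_HIBIT_MASK:

    #define HIBIT_MASK 0xffff0000u          /* placeholder mask */

    static unsigned int hibits;             /* computed once at boot */

    static void set_hibits(unsigned int bit_enables)
    {
            hibits = bit_enables & HIBIT_MASK;
    }

    static unsigned int dest_apicid(unsigned int apicid)
    {
            return apicid | hibits;         /* applied to every IPI/MSI target */
    }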
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b68bda30938..1d59834396bd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
894#else 894#else
895 vgetcpu_set_mode(); 895 vgetcpu_set_mode();
896#endif 896#endif
897 init_hw_perf_events();
898} 897}
899 898
900void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 899void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 80c482382d5c..5bf2fac52aca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
31#include <asm/mce.h> 31#include <asm/mce.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33 33
34#define PFX "mce_threshold: "
35#define VERSION "version 1.1.1"
36#define NR_BANKS 6 34#define NR_BANKS 6
37#define NR_BLOCKS 9 35#define NR_BLOCKS 9
38#define THRESHOLD_MAX 0xFFF 36#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
59 struct list_head miscj; 57 struct list_head miscj;
60}; 58};
61 59
62/* defaults used early on boot */
63static struct threshold_block threshold_defaults = {
64 .interrupt_enable = 0,
65 .threshold_limit = THRESHOLD_MAX,
66};
67
68struct threshold_bank { 60struct threshold_bank {
69 struct kobject *kobj; 61 struct kobject *kobj;
70 struct threshold_block *blocks; 62 struct threshold_block *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
89struct thresh_restart { 81struct thresh_restart {
90 struct threshold_block *b; 82 struct threshold_block *b;
91 int reset; 83 int reset;
84 int set_lvt_off;
85 int lvt_off;
92 u16 old_limit; 86 u16 old_limit;
93}; 87};
94 88
89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
90{
91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
92
93 if (apic < 0) {
94 pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
95 "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
96 b->bank, b->block, b->address, hi, lo);
97 return 0;
98 }
99
100 if (apic != msr) {
101 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
102 "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
103 b->cpu, apic, b->bank, b->block, b->address, hi, lo);
104 return 0;
105 }
106
107 return 1;
108};
109
95/* must be called with correct cpu affinity */ 110/* must be called with correct cpu affinity */
96/* Called via smp_call_function_single() */ 111/* Called via smp_call_function_single() */
97static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
98{ 113{
99 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
100 u32 mci_misc_hi, mci_misc_lo; 115 u32 hi, lo;
101 116
102 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 117 rdmsr(tr->b->address, lo, hi);
103 118
104 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 119 if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
105 tr->reset = 1; /* limit cannot be lower than err count */ 120 tr->reset = 1; /* limit cannot be lower than err count */
106 121
107 if (tr->reset) { /* reset err count and overflow bit */ 122 if (tr->reset) { /* reset err count and overflow bit */
108 mci_misc_hi = 123 hi =
109 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 124 (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
110 (THRESHOLD_MAX - tr->b->threshold_limit); 125 (THRESHOLD_MAX - tr->b->threshold_limit);
111 } else if (tr->old_limit) { /* change limit w/o reset */ 126 } else if (tr->old_limit) { /* change limit w/o reset */
112 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 127 int new_count = (hi & THRESHOLD_MAX) +
113 (tr->old_limit - tr->b->threshold_limit); 128 (tr->old_limit - tr->b->threshold_limit);
114 129
115 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 130 hi = (hi & ~MASK_ERR_COUNT_HI) |
116 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
117 } 132 }
118 133
134 if (tr->set_lvt_off) {
135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
136 /* set new lvt offset */
137 hi &= ~MASK_LVTOFF_HI;
138 hi |= tr->lvt_off << 20;
139 }
140 }
141
119 tr->b->interrupt_enable ? 142 tr->b->interrupt_enable ?
120 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
121 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 144 (hi &= ~MASK_INT_TYPE_HI);
122 145
123 mci_misc_hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
124 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 147 wrmsr(tr->b->address, lo, hi);
148}
149
150static void mce_threshold_block_init(struct threshold_block *b, int offset)
151{
152 struct thresh_restart tr = {
153 .b = b,
154 .set_lvt_off = 1,
155 .lvt_off = offset,
156 };
157
158 b->threshold_limit = THRESHOLD_MAX;
159 threshold_restart_bank(&tr);
160}
161
162static int setup_APIC_mce(int reserved, int new)
163{
164 if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
165 APIC_EILVT_MSG_FIX, 0))
166 return new;
167
168 return reserved;
125} 169}
126 170
127/* cpu init entry point, called from mce.c with preempt off */ 171/* cpu init entry point, called from mce.c with preempt off */
128void mce_amd_feature_init(struct cpuinfo_x86 *c) 172void mce_amd_feature_init(struct cpuinfo_x86 *c)
129{ 173{
174 struct threshold_block b;
130 unsigned int cpu = smp_processor_id(); 175 unsigned int cpu = smp_processor_id();
131 u32 low = 0, high = 0, address = 0; 176 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 177 unsigned int bank, block;
133 struct thresh_restart tr; 178 int offset = -1;
134 int lvt_off = -1;
135 u8 offset;
136 179
137 for (bank = 0; bank < NR_BANKS; ++bank) { 180 for (bank = 0; bank < NR_BANKS; ++bank) {
138 for (block = 0; block < NR_BLOCKS; ++block) { 181 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
163 if (shared_bank[bank] && c->cpu_core_id) 206 if (shared_bank[bank] && c->cpu_core_id)
164 break; 207 break;
165#endif 208#endif
166 offset = (high & MASK_LVTOFF_HI) >> 20; 209 offset = setup_APIC_mce(offset,
167 if (lvt_off < 0) { 210 (high & MASK_LVTOFF_HI) >> 20);
168 if (setup_APIC_eilvt(offset,
169 THRESHOLD_APIC_VECTOR,
170 APIC_EILVT_MSG_FIX, 0)) {
171 pr_err(FW_BUG "cpu %d, failed to "
172 "setup threshold interrupt "
173 "for bank %d, block %d "
174 "(MSR%08X=0x%x%08x)",
175 smp_processor_id(), bank, block,
176 address, high, low);
177 continue;
178 }
179 lvt_off = offset;
180 } else if (lvt_off != offset) {
181 pr_err(FW_BUG "cpu %d, invalid threshold "
182 "interrupt offset %d for bank %d,"
183 "block %d (MSR%08X=0x%x%08x)",
184 smp_processor_id(), lvt_off, bank,
185 block, address, high, low);
186 continue;
187 }
188
189 high &= ~MASK_LVTOFF_HI;
190 high |= lvt_off << 20;
191 wrmsr(address, low, high);
192 211
193 threshold_defaults.address = address; 212 memset(&b, 0, sizeof(b));
194 tr.b = &threshold_defaults; 213 b.cpu = cpu;
195 tr.reset = 0; 214 b.bank = bank;
196 tr.old_limit = 0; 215 b.block = block;
197 threshold_restart_bank(&tr); 216 b.address = address;
198 217
218 mce_threshold_block_init(&b, offset);
199 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
200 } 220 }
201 } 221 }
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
298 318
299 b->interrupt_enable = !!new; 319 b->interrupt_enable = !!new;
300 320
321 memset(&tr, 0, sizeof(tr));
301 tr.b = b; 322 tr.b = b;
302 tr.reset = 0;
303 tr.old_limit = 0;
304 323
305 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 324 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
306 325
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
321 if (new < 1) 340 if (new < 1)
322 new = 1; 341 new = 1;
323 342
343 memset(&tr, 0, sizeof(tr));
324 tr.old_limit = b->threshold_limit; 344 tr.old_limit = b->threshold_limit;
325 b->threshold_limit = new; 345 b->threshold_limit = new;
326 tr.b = b; 346 tr.b = b;
327 tr.reset = 0;
328 347
329 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 348 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
330 349
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
603 continue; 622 continue;
604 err = threshold_create_bank(cpu, bank); 623 err = threshold_create_bank(cpu, bank);
605 if (err) 624 if (err)
606 goto out; 625 return err;
607 } 626 }
608out: 627
609 return err; 628 return err;
610} 629}
611 630
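threshold_restart_bank() above works entirely in the high word of the bank's MISC MSR: the hardware counts up toward THRESHOLD_MAX, so a limit of N is programmed as THRESHOLD_MAX - N, and the LVT offset lives at bits 20-23. A sketch of just that packing; the mask values are assumed to match this file's definitions:

    #define MASK_ERR_COUNT_HI 0x00000FFFu   /* assumed per mce_amd.c */
    #define MASK_LVTOFF_HI    0x00F00000u
    #define THRESHOLD_MAX     0xFFF

    static unsigned int pack_hi(unsigned int hi, unsigned int limit,
                                int lvt_off)
    {
            /* counter counts up: a limit of N is stored as MAX - N */
            hi = (hi & ~MASK_ERR_COUNT_HI) | (THRESHOLD_MAX - limit);
            /* new LVT offset, bits 20-23 */
            hi = (hi & ~MASK_LVTOFF_HI) | ((unsigned int)lvt_off << 20);
            return hi;
    }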
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 4b683267eca5..e12246ff5aa6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,13 @@ struct thermal_state {
53 struct _thermal_state core_power_limit; 53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle; 54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit; 55 struct _thermal_state package_power_limit;
56 struct _thermal_state core_thresh0;
57 struct _thermal_state core_thresh1;
56}; 58};
57 59
60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val);
62
58static DEFINE_PER_CPU(struct thermal_state, thermal_state); 63static DEFINE_PER_CPU(struct thermal_state, thermal_state);
59 64
60static atomic_t therm_throt_en = ATOMIC_INIT(0); 65static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level)
200 return 0; 205 return 0;
201} 206}
202 207
208static int thresh_event_valid(int event)
209{
210 struct _thermal_state *state;
211 unsigned int this_cpu = smp_processor_id();
212 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
213 u64 now = get_jiffies_64();
214
215 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
216
217 if (time_before64(now, state->next_check))
218 return 0;
219
220 state->next_check = now + CHECK_INTERVAL;
221 return 1;
222}
223
203#ifdef CONFIG_SYSFS 224#ifdef CONFIG_SYSFS
204/* Add/Remove thermal_throttle interface for CPU device: */ 225/* Add/Remove thermal_throttle interface for CPU device: */
205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 226static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device);
313#define PACKAGE_THROTTLED ((__u64)2 << 62) 334#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) 335#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315 336
337static void notify_thresholds(__u64 msr_val)
338{
339 /* check whether the interrupt handler is defined;
340 * otherwise simply return
341 */
342 if (!platform_thermal_notify)
343 return;
344
345 /* lower threshold reached */
346 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
347 platform_thermal_notify(msr_val);
348 /* higher threshold reached */
349 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
350 platform_thermal_notify(msr_val);
351}
352
316/* Thermal transition interrupt handler */ 353/* Thermal transition interrupt handler */
317static void intel_thermal_interrupt(void) 354static void intel_thermal_interrupt(void)
318{ 355{
@@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void)
321 358
322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 359 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
323 360
361 /* Check for violation of core thermal thresholds */ 
362 notify_thresholds(msr_val);
363
324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 364 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT, 365 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0) 366 CORE_LEVEL) != 0)
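thresh_event_valid() above is a per-event rate limiter: once a threshold event fires, further events of the same kind are ignored until next_check. A standalone sketch of the idea, using a plain counter and a placeholder interval in place of get_jiffies_64()/time_before64() and CHECK_INTERVAL:

    #define CHECK_INTERVAL 5000             /* placeholder tick count */

    static unsigned long long next_check;

    /* returns 1 if the event should be delivered, 0 if rate-limited */
    int thresh_ratelimit(unsigned long long now)
    {
            if (now < next_check)
                    return 0;
            next_check = now + CHECK_INTERVAL;
            return 1;
    }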
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed6310183efb..0a360d146596 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
330{ 330{
331 int i; 331 int i;
332 332
333 if (nmi_watchdog == NMI_LOCAL_APIC)
334 disable_lapic_nmi_watchdog();
335
336 for (i = 0; i < x86_pmu.num_counters; i++) { 333 for (i = 0; i < x86_pmu.num_counters; i++) {
337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
338 goto perfctr_fail; 335 goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
355 for (i--; i >= 0; i--) 352 for (i--; i >= 0; i--)
356 release_perfctr_nmi(x86_pmu.perfctr + i); 353 release_perfctr_nmi(x86_pmu.perfctr + i);
357 354
358 if (nmi_watchdog == NMI_LOCAL_APIC)
359 enable_lapic_nmi_watchdog();
360
361 return false; 355 return false;
362} 356}
363 357
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
369 release_perfctr_nmi(x86_pmu.perfctr + i); 363 release_perfctr_nmi(x86_pmu.perfctr + i);
370 release_evntsel_nmi(x86_pmu.eventsel + i); 364 release_evntsel_nmi(x86_pmu.eventsel + i);
371 } 365 }
372
373 if (nmi_watchdog == NMI_LOCAL_APIC)
374 enable_lapic_nmi_watchdog();
375} 366}
376 367
377#else 368#else
@@ -381,6 +372,58 @@ static void release_pmc_hardware(void) {}
381 372
382#endif 373#endif
383 374
375static bool check_hw_exists(void)
376{
377 u64 val, val_new = 0;
378 int i, reg, ret = 0;
379
380 /*
381 * Check to see if the BIOS enabled any of the counters; if so,
382 * complain and bail.
383 */
384 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i;
386 ret = rdmsrl_safe(reg, &val);
387 if (ret)
388 goto msr_fail;
389 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
390 goto bios_fail;
391 }
392
393 if (x86_pmu.num_counters_fixed) {
394 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
395 ret = rdmsrl_safe(reg, &val);
396 if (ret)
397 goto msr_fail;
398 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
399 if (val & (0x03 << i*4))
400 goto bios_fail;
401 }
402 }
403
404 /*
405 * Now write a value and read it back to see if it matches;
406 * this is needed to detect certain hardware emulators (qemu/kvm)
407 * that don't trap on the MSR access and always return 0s.
408 */
409 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
412 if (ret || val != val_new)
413 goto msr_fail;
414
415 return true;
416
417bios_fail:
418 printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
419 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
420 return false;
421
422msr_fail:
423 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
424 return false;
425}
426
384static void reserve_ds_buffers(void); 427static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 428static void release_ds_buffers(void);
386 429
@@ -437,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event)
437 struct hw_perf_event *hwc = &event->hw; 480 struct hw_perf_event *hwc = &event->hw;
438 u64 config; 481 u64 config;
439 482
440 if (!hwc->sample_period) { 483 if (!is_sampling_event(event)) {
441 hwc->sample_period = x86_pmu.max_period; 484 hwc->sample_period = x86_pmu.max_period;
442 hwc->last_period = hwc->sample_period; 485 hwc->last_period = hwc->sample_period;
443 local64_set(&hwc->period_left, hwc->sample_period); 486 local64_set(&hwc->period_left, hwc->sample_period);
@@ -1348,7 +1391,7 @@ static void __init pmu_check_apic(void)
1348 pr_info("no hardware sampling interrupt available.\n"); 1391 pr_info("no hardware sampling interrupt available.\n");
1349} 1392}
1350 1393
1351void __init init_hw_perf_events(void) 1394int __init init_hw_perf_events(void)
1352{ 1395{
1353 struct event_constraint *c; 1396 struct event_constraint *c;
1354 int err; 1397 int err;
@@ -1363,15 +1406,19 @@ void __init init_hw_perf_events(void)
1363 err = amd_pmu_init(); 1406 err = amd_pmu_init();
1364 break; 1407 break;
1365 default: 1408 default:
1366 return; 1409 return 0;
1367 } 1410 }
1368 if (err != 0) { 1411 if (err != 0) {
1369 pr_cont("no PMU driver, software events only.\n"); 1412 pr_cont("no PMU driver, software events only.\n");
1370 return; 1413 return 0;
1371 } 1414 }
1372 1415
1373 pmu_check_apic(); 1416 pmu_check_apic();
1374 1417
1418 /* sanity check that the hardware exists or is emulated */
1419 if (!check_hw_exists())
1420 return 0;
1421
1375 pr_cont("%s PMU driver.\n", x86_pmu.name); 1422 pr_cont("%s PMU driver.\n", x86_pmu.name);
1376 1423
1377 if (x86_pmu.quirks) 1424 if (x86_pmu.quirks)
@@ -1418,9 +1465,12 @@ void __init init_hw_perf_events(void)
1418 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1465 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1419 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1466 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1420 1467
1421 perf_pmu_register(&pmu); 1468 perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1422 perf_cpu_notifier(x86_pmu_notifier); 1469 perf_cpu_notifier(x86_pmu_notifier);
1470
1471 return 0;
1423} 1472}
1473early_initcall(init_hw_perf_events);
1424 1474
1425static inline void x86_pmu_read(struct perf_event *event) 1475static inline void x86_pmu_read(struct perf_event *event)
1426{ 1476{
@@ -1666,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1666 1716
1667 perf_callchain_store(entry, regs->ip); 1717 perf_callchain_store(entry, regs->ip);
1668 1718
1669 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1719 dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
1670} 1720}
1671 1721
1672#ifdef CONFIG_COMPAT 1722#ifdef CONFIG_COMPAT
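The decisive part of check_hw_exists() is the final write/read-back: emulators that silently ignore PMU MSR writes return zeros, so the value read back will not match what was written. A condensed sketch of that probe, using the same safe MSR helpers the patch itself uses:

    static int pmu_responds(unsigned int ctr_msr)
    {
            u64 val = 0xabcdULL, val_new = 0;

            if (checking_wrmsrl(ctr_msr, val))      /* write faulted */
                    return 0;
            if (rdmsrl_safe(ctr_msr, &val_new))     /* read faulted */
                    return 0;
            return val == val_new;                  /* 0 on emulated PMUs */
    }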
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index e421b8cd6944..67e2202a6039 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
1#ifdef CONFIG_CPU_SUP_AMD 1#ifdef CONFIG_CPU_SUP_AMD
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 5 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -275,7 +273,7 @@ done:
275 return &emptyconstraint; 273 return &emptyconstraint;
276} 274}
277 275
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) 276static struct amd_nb *amd_alloc_nb(int cpu)
279{ 277{
280 struct amd_nb *nb; 278 struct amd_nb *nb;
281 int i; 279 int i;
@@ -285,7 +283,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
285 if (!nb) 283 if (!nb)
286 return NULL; 284 return NULL;
287 285
288 nb->nb_id = nb_id; 286 nb->nb_id = -1;
289 287
290 /* 288 /*
291 * initialize all possible NB constraints 289 * initialize all possible NB constraints
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu)
306 if (boot_cpu_data.x86_max_cores < 2) 304 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK; 305 return NOTIFY_OK;
308 306
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1); 307 cpuc->amd_nb = amd_alloc_nb(cpu);
310 if (!cpuc->amd_nb) 308 if (!cpuc->amd_nb)
311 return NOTIFY_BAD; 309 return NOTIFY_BAD;
312 310
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu)
325 nb_id = amd_get_nb_id(cpu); 323 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID); 324 WARN_ON_ONCE(nb_id == BAD_APICID);
327 325
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) { 326 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb; 327 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb)) 328 if (WARN_ON_ONCE(!nb))
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu)
341 337
342 cpuc->amd_nb->nb_id = nb_id; 338 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++; 339 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346} 340}
347 341
348static void amd_pmu_cpu_dead(int cpu) 342static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu)
354 348
355 cpuhw = &per_cpu(cpu_hw_events, cpu); 349 cpuhw = &per_cpu(cpu_hw_events, cpu);
356 350
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) { 351 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb; 352 struct amd_nb *nb = cpuhw->amd_nb;
361 353
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu)
364 356
365 cpuhw->amd_nb = NULL; 357 cpuhw->amd_nb = NULL;
366 } 358 }
367
368 raw_spin_unlock(&amd_nb_lock);
369} 359}
370 360
371static __initconst const struct x86_pmu amd_pmu = { 361static __initconst const struct x86_pmu amd_pmu = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c8f5c088cad1..24e390e40f2e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
816 if (ret) 816 if (ret)
817 return ret; 817 return ret;
818 818
819 if (event->attr.precise_ip &&
820 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
821 /*
822 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
823 * (0x003c) so that we can use it with PEBS.
824 *
825 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
826 * PEBS capable. However we can use INST_RETIRED.ANY_P
827 * (0x00c0), which is a PEBS capable event, to get the same
828 * count.
829 *
 830 * INST_RETIRED.ANY_P counts the number of cycles that retire
 831 * at least CNTMASK instructions. By setting CNTMASK to a value (16)
832 * larger than the maximum number of instructions that can be
833 * retired per cycle (4) and then inverting the condition, we
834 * count all cycles that retire 16 or less instructions, which
835 * is every cycle.
836 *
837 * Thereby we gain a PEBS capable cycle counter.
838 */
839 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
840
841 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
842 event->hw.config = alt_config;
843 }
844
819 if (event->attr.type != PERF_TYPE_RAW) 845 if (event->attr.type != PERF_TYPE_RAW)
820 return 0; 846 return 0;
821 847
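The alternative encoding added above is easier to trust once decoded. A userspace sketch using the architectural event-select bit layout (event[7:0], umask[15:8], INV at bit 23, CMASK[31:24]) shows that 0x108000c0 really is INST_RETIRED.ANY_P with an inverted count-mask of 16:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t alt = 0x108000c0ULL;        /* INST_RETIRED.TOTAL_CYCLES from the hunk */

    unsigned event = alt & 0xff;         /* 0xc0 = INST_RETIRED.ANY_P */
    unsigned umask = (alt >> 8) & 0xff;  /* 0x00 */
    unsigned inv   = (alt >> 23) & 1;    /* invert the count-mask comparison */
    unsigned cmask = (alt >> 24) & 0xff; /* threshold of 16 */

    printf("event=%#x umask=%#x inv=%u cmask=%u\n", event, umask, inv, cmask);
    return 0;
}

With INV=1 and CMASK=16 the counter ticks on every cycle that retires fewer than 16 instructions, and since at most 4 can retire per cycle, that is every cycle.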
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d9f4ff8fcd69..d5a236615501 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <asm/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 150 clear_bit(counter, evntsel_nmi_owner);
173} 151}
174EXPORT_SYMBOL(release_evntsel_nmi); 152EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
 221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
 222 * are writable, with higher bits sign-extending from bit 31.
 223 * So, we can only program the counter with 31-bit values, and
 224 * the 32nd bit must be 1 so that bits 33 and above sign-extend to 1.
 225 * Find the appropriate nmi_hz:
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
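The two removed helpers above seed the counter with a negated period so that the overflow NMI fires nmi_hz times per second, clamping to 31 bits where only that much of the counter is writable. A standalone sketch of the arithmetic (the cpu_khz value is hypothetical):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t cpu_khz = 3000000;     /* hypothetical 3 GHz CPU */
    unsigned nmi_hz  = 1;

    uint64_t count = cpu_khz * 1000 / nmi_hz;

    /* adjust_for_32bit_ctr(): on P6/ARCH_PERFMON only 31 bits are
     * programmable, so raise nmi_hz until the period fits. */
    if (count > 0x7fffffffULL) {
        nmi_hz = (unsigned)(cpu_khz * 1000 / 0x7fffffffULL) + 1;
        count  = cpu_khz * 1000 / nmi_hz;
    }

    /* write_watchdog_counter32() then writes the two's complement */
    printf("nmi_hz=%u seed=-0x%08x\n", nmi_hz, (unsigned)count);
    return 0;
}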
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
 399 * P6-based Pentium M needs to re-unmask
 400 * the apic vector, but it doesn't hurt
 401 * other P6 variants.
 402 * ArchPerfmon/Core Duo also needs this
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
 503 * If we're on the kdump kernel or in a similar situation, we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
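The complemented-threshold trick described before setup_p4_watchdog() is easiest to see by composing the CCCR value on its own; a small standalone sketch using the macros from this file:

#include <stdio.h>

/* With COMPARE enabled and the threshold COMPLEMENTed at its maximum
 * (15), the "event count <= 15" condition holds on every cycle, so
 * IQ_COUNTER0 simply ticks once per cycle. */
#define P4_CCCR_OVF_PMI0        (1 << 26)
#define P4_CCCR_THRESHOLD(N)    ((N) << 20)
#define P4_CCCR_COMPLEMENT      (1 << 19)
#define P4_CCCR_COMPARE         (1 << 18)
#define P4_CCCR_REQUIRED        (3 << 16)
#define P4_CCCR_ESCR_SELECT(N)  ((N) << 13)

int main(void)
{
    unsigned cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4)
                      | P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT
                      | P4_CCCR_COMPARE | P4_CCCR_REQUIRED;

    printf("IQ_CCCR0 = %#010x\n", cccr_val);    /* 0x04ff8000 */
    return 0;
}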
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflown perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event or not.
651 * NOTE: Corresponding bit = 0 in ebx indicates event present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 == 6 ||
704 (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
705 wd_ops = &k7_wd_ops;
706 return;
707 case X86_VENDOR_INTEL:
 708 /* Work around CPUs where perfctr1 doesn't have a working enable
709 * bit as described in the following errata:
710 * AE49 Core Duo and Intel Core Solo 65 nm
711 * AN49 Intel Pentium Dual-Core
712 * AF49 Dual-Core Intel Xeon Processor LV
713 */
714 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
715 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
716 boot_cpu_data.x86_mask == 4))) {
717 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
718 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
719 }
720 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
721 wd_ops = &intel_arch_wd_ops;
722 break;
723 }
724 switch (boot_cpu_data.x86) {
725 case 6:
726 if (boot_cpu_data.x86_model > 13)
727 return;
728
729 wd_ops = &p6_wd_ops;
730 break;
731 case 15:
732 wd_ops = &p4_wd_ops;
733 break;
734 default:
735 return;
736 }
737 break;
738 }
739}
740
741/* Interface to nmi.c */
742
743int lapic_watchdog_init(unsigned nmi_hz)
744{
745 if (!wd_ops) {
746 probe_nmi_watchdog();
747 if (!wd_ops) {
748 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
749 return -1;
750 }
751
752 if (!wd_ops->reserve()) {
753 printk(KERN_ERR
754 "NMI watchdog: cannot reserve perfctrs\n");
755 return -1;
756 }
757 }
758
759 if (!(wd_ops->setup(nmi_hz))) {
760 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
761 raw_smp_processor_id());
762 return -1;
763 }
764
765 return 0;
766}
767
768void lapic_watchdog_stop(void)
769{
770 if (wd_ops)
771 wd_ops->stop();
772}
773
774unsigned lapic_adjust_nmi_hz(unsigned hz)
775{
776 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
777 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
778 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
779 hz = adjust_for_32bit_ctr(hz);
780 return hz;
781}
782
783int __kprobes lapic_wd_event(unsigned nmi_hz)
784{
785 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
786 u64 ctr;
787
788 rdmsrl(wd->perfctr_msr, ctr);
789 if (ctr & wd_ops->checkbit) /* perfctr still running? */
790 return 0;
791
792 wd_ops->rearm(wd, nmi_hz);
793 return 1;
794}
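lapic_wd_event() decides whether the NMI belongs to the watchdog by testing the counter's check bit. A standalone sketch of why that works for a counter seeded negative (a 48-bit K7-style counter is assumed here):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const uint64_t checkbit   = 1ULL << 47;
    const uint64_t width_mask = (1ULL << 48) - 1;

    /* Seeded with -period, the counter counts up toward zero: the top
     * implemented bit stays set until it wraps, then clears. */
    uint64_t running = (0 - 120000000ULL) & width_mask; /* mid-period */
    uint64_t wrapped = 5;                               /* just overflowed */

    printf("mid-period: %s\n", (running & checkbit) ? "still counting" : "fired");
    printf("wrapped:    %s\n", (wrapped & checkbit) ? "still counting" : "fired");
    return 0;
}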
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
37#include <linux/major.h> 36#include <linux/major.h>
38#include <linux/fs.h> 37#include <linux/fs.h>
39#include <linux/device.h> 38#include <linux/device.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..8474c998cbd4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, unsigned long bp, char *log_lvl) 178 unsigned long *stack, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack, unsigned long bp) 185 unsigned long *stack)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, bp, ""); 187 show_trace_log_lvl(task, regs, stack, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, 0, ""); 192 show_stack_log_lvl(task, NULL, sp, "");
193} 193}
194 194
195/* 195/*
@@ -210,7 +210,7 @@ void dump_stack(void)
210 init_utsname()->release, 210 init_utsname()->release,
211 (int)strcspn(init_utsname()->version, " "), 211 (int)strcspn(init_utsname()->version, " "),
212 init_utsname()->version); 212 init_utsname()->version);
213 show_trace(NULL, NULL, &stack, bp); 213 show_trace(NULL, NULL, &stack);
214} 214}
215EXPORT_SYMBOL(dump_stack); 215EXPORT_SYMBOL(dump_stack);
216 216
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 1bc7f75a5bda..74cc1eda384b 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,11 +17,12 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task,
21 unsigned long *stack, unsigned long bp, 21 struct pt_regs *regs, unsigned long *stack,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
25 26
26 if (!task) 27 if (!task)
27 task = current; 28 task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 35 stack = (unsigned long *)task->thread.sp;
35 } 36 }
36 37
37#ifdef CONFIG_FRAME_POINTER 38 bp = stack_frame(task, regs);
38 if (!bp) {
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48
49 for (;;) { 39 for (;;) {
50 struct thread_info *context; 40 struct thread_info *context;
51 41
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
65 55
66void 56void
67show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
68 unsigned long *sp, unsigned long bp, char *log_lvl) 58 unsigned long *sp, char *log_lvl)
69{ 59{
70 unsigned long *stack; 60 unsigned long *stack;
71 int i; 61 int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
87 touch_nmi_watchdog(); 77 touch_nmi_watchdog();
88 } 78 }
89 printk(KERN_CONT "\n"); 79 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 80 show_trace_log_lvl(task, regs, sp, log_lvl);
91} 81}
92 82
93 83
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 102 u8 *ip;
113 103
114 printk(KERN_EMERG "Stack:\n"); 104 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
116 0, KERN_EMERG);
117 106
118 printk(KERN_EMERG "Code: "); 107 printk(KERN_EMERG "Code: ");
119 108
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6a340485249a..64101335de19 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, struct pt_regs *regs, 142void dump_trace(struct task_struct *task,
143 unsigned long *stack, unsigned long bp, 143 struct pt_regs *regs, unsigned long *stack,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long bp;
152 153
153 if (!task) 154 if (!task)
154 task = current; 155 task = current;
@@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *)task->thread.sp; 161 stack = (unsigned long *)task->thread.sp;
161 } 162 }
162 163
163#ifdef CONFIG_FRAME_POINTER 164 bp = stack_frame(task, regs);
164 if (!bp) {
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
235 225
236void 226void
237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
238 unsigned long *sp, unsigned long bp, char *log_lvl) 228 unsigned long *sp, char *log_lvl)
239{ 229{
240 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
241 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, log_lvl);
283} 273}
284 274
285void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
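Both dump_trace() flavours lose their bp argument; the deleted #ifdef CONFIG_FRAME_POINTER blocks are exactly what the new stack_frame() helper consolidates. A userspace sketch of its assumed shape (the types are stand-ins and get_bp() is modeled with a compiler builtin):

#include <stdio.h>

struct pt_regs { unsigned long bp; };
struct task_struct { struct { unsigned long *sp; } thread; };

static struct task_struct *current;     /* stand-in for the kernel's current */

/* Assumed shape of stack_frame(): prefer regs->bp, else the live frame
 * pointer for the running task, else the bp that switch_to() left on
 * top of a sleeping task's stack. */
static unsigned long stack_frame(struct task_struct *task, struct pt_regs *regs)
{
    if (regs)
        return regs->bp;
    if (task == current)
        return (unsigned long)__builtin_frame_address(0); /* ~get_bp() */
    return *task->thread.sp;    /* last reg pushed by switch_to */
}

int main(void)
{
    unsigned long fake_stack[1] = { 0xdeadbeef };
    struct task_struct sleeping = { .thread = { .sp = fake_stack } };
    struct pt_regs regs = { .bp = 0x1234 };

    printf("%#lx %#lx\n", stack_frame(&sleeping, &regs),
           stack_frame(&sleeping, NULL));
    return 0;
}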
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 59e175e89599..591e60104278 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,7 +395,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 396 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 397 */
398 pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) 398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 399 CFI_REL_OFFSET eip, 0
400 400
401 pushl_cfi %eax 401 pushl_cfi %eax
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe2690d71c0c..e3ba417e8697 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,7 @@ ENDPROC(native_usergs_sysret64)
295 .endm 295 .endm
296 296
297/* save partial stack frame */ 297/* save partial stack frame */
298 .pushsection .kprobes.text, "ax"
298ENTRY(save_args) 299ENTRY(save_args)
299 XCPT_FRAME 300 XCPT_FRAME
300 cld 301 cld
@@ -334,6 +335,7 @@ ENTRY(save_args)
334 ret 335 ret
335 CFI_ENDPROC 336 CFI_ENDPROC
336END(save_args) 337END(save_args)
338 .popsection
337 339
338ENTRY(save_rest) 340ENTRY(save_rest)
339 PARTIAL_FRAME 1 REST_SKIP+8 341 PARTIAL_FRAME 1 REST_SKIP+8
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3afb33f14d2d..298448656b60 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/module.h>
22 23
23#include <trace/syscall.h> 24#include <trace/syscall.h>
24 25
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
49int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
50{ 51{
51 set_kernel_text_rw(); 52 set_kernel_text_rw();
53 set_all_modules_text_rw();
52 modifying_code = 1; 54 modifying_code = 1;
53 return 0; 55 return 0;
54} 56}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
56int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
57{ 59{
58 modifying_code = 0; 60 modifying_code = 0;
61 set_all_modules_text_ro();
59 set_kernel_text_ro(); 62 set_kernel_text_ro();
60 return 0; 63 return 0;
61} 64}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bcece91dd311..9f54b209c378 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,16 +60,18 @@
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) 60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif 61#endif
62 62
63/* Number of possible pages in the lowmem region */
64LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
65
63/* Enough space to fit pagetables for the low memory linear map */ 66/* Enough space to fit pagetables for the low memory linear map */
64MAPPING_BEYOND_END = \ 67MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
65 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
66 68
67/* 69/*
68 * Worst-case size of the kernel mapping we need to make: 70 * Worst-case size of the kernel mapping we need to make:
69 * the worst-case size of the kernel itself, plus the extra we need 71 * a relocatable kernel can live anywhere in lowmem, so we need to be able
70 * to map for the linear map. 72 * to map all of lowmem.
71 */ 73 */
72KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT 74KERNEL_PAGES = LOWMEM_PAGES
73 75
74INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
75RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
@@ -137,39 +139,6 @@ ENTRY(startup_32)
137 movl %eax, pa(olpc_ofw_pgd) 139 movl %eax, pa(olpc_ofw_pgd)
138#endif 140#endif
139 141
140#ifdef CONFIG_PARAVIRT
 141 /* This can only trip for a broken bootloader... */
142 cmpw $0x207, pa(boot_params + BP_version)
143 jb default_entry
144
145 /* Paravirt-compatible boot parameters. Look to see what architecture
146 we're booting under. */
147 movl pa(boot_params + BP_hardware_subarch), %eax
148 cmpl $num_subarch_entries, %eax
149 jae bad_subarch
150
151 movl pa(subarch_entries)(,%eax,4), %eax
152 subl $__PAGE_OFFSET, %eax
153 jmp *%eax
154
155bad_subarch:
156WEAK(lguest_entry)
157WEAK(xen_entry)
158 /* Unknown implementation; there's really
159 nothing we can do at this point. */
160 ud2a
161
162 __INITDATA
163
164subarch_entries:
165 .long default_entry /* normal x86/PC */
166 .long lguest_entry /* lguest hypervisor */
167 .long xen_entry /* Xen hypervisor */
168 .long default_entry /* Moorestown MID */
169num_subarch_entries = (. - subarch_entries) / 4
170.previous
171#endif /* CONFIG_PARAVIRT */
172
173/* 142/*
174 * Initialize page tables. This creates a PDE and a set of page 143 * Initialize page tables. This creates a PDE and a set of page
175 * tables, which are located immediately beyond __brk_base. The variable 144 * tables, which are located immediately beyond __brk_base. The variable
@@ -179,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
179 * 148 *
180 * Note that the stack is not yet set up! 149 * Note that the stack is not yet set up!
181 */ 150 */
182default_entry:
183#ifdef CONFIG_X86_PAE 151#ifdef CONFIG_X86_PAE
184 152
185 /* 153 /*
@@ -259,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
259 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax 227 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
260 movl %eax,pa(initial_page_table+0xffc) 228 movl %eax,pa(initial_page_table+0xffc)
261#endif 229#endif
262 jmp 3f 230
231#ifdef CONFIG_PARAVIRT
232 /* This is can only trip for a broken bootloader... */
233 cmpw $0x207, pa(boot_params + BP_version)
234 jb default_entry
235
236 /* Paravirt-compatible boot parameters. Look to see what architecture
237 we're booting under. */
238 movl pa(boot_params + BP_hardware_subarch), %eax
239 cmpl $num_subarch_entries, %eax
240 jae bad_subarch
241
242 movl pa(subarch_entries)(,%eax,4), %eax
243 subl $__PAGE_OFFSET, %eax
244 jmp *%eax
245
246bad_subarch:
247WEAK(lguest_entry)
248WEAK(xen_entry)
249 /* Unknown implementation; there's really
250 nothing we can do at this point. */
251 ud2a
252
253 __INITDATA
254
255subarch_entries:
256 .long default_entry /* normal x86/PC */
257 .long lguest_entry /* lguest hypervisor */
258 .long xen_entry /* Xen hypervisor */
259 .long default_entry /* Moorestown MID */
260num_subarch_entries = (. - subarch_entries) / 4
261.previous
262#else
263 jmp default_entry
264#endif /* CONFIG_PARAVIRT */
265
263/* 266/*
264 * Non-boot CPU entry point; entered from trampoline.S 267 * Non-boot CPU entry point; entered from trampoline.S
265 * We can't lgdt here, because lgdt itself uses a data segment, but 268 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,7 +283,7 @@ ENTRY(startup_32_smp)
280 movl %eax,%fs 283 movl %eax,%fs
281 movl %eax,%gs 284 movl %eax,%gs
282#endif /* CONFIG_SMP */ 285#endif /* CONFIG_SMP */
2833: 286default_entry:
284 287
285/* 288/*
286 * New page tables may be in 4Mbyte page mode and may 289 * New page tables may be in 4Mbyte page mode and may
@@ -314,6 +317,10 @@ ENTRY(startup_32_smp)
314 subl $0x80000001, %eax 317 subl $0x80000001, %eax
315 cmpl $(0x8000ffff-0x80000001), %eax 318 cmpl $(0x8000ffff-0x80000001), %eax
316 ja 6f 319 ja 6f
320
321 /* Clear bogus XD_DISABLE bits */
322 call verify_cpu
323
317 mov $0x80000001, %eax 324 mov $0x80000001, %eax
318 cpuid 325 cpuid
319 /* Execute Disable bit supported? */ 326 /* Execute Disable bit supported? */
@@ -609,6 +616,8 @@ ignore_int:
609#endif 616#endif
610 iret 617 iret
611 618
619#include "verify_cpu.S"
620
612 __REFDATA 621 __REFDATA
613.align 4 622.align 4
614ENTRY(initial_code) 623ENTRY(initial_code)
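The new LOWMEM_PAGES constant makes the worst-case page-table reservation checkable by hand. A standalone sketch with the usual 32-bit non-PAE defaults (3G/1G split, 4 KiB pages, PTRS_PER_PGD = 1024, i.e. 4 MiB mapped per PDE):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT    12
#define PTRS_PER_PGD  1024ULL
#define PAGE_OFFSET   0xC0000000ULL

#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)

int main(void)
{
    uint64_t lowmem_pages = ((1ULL << 32) - PAGE_OFFSET) >> PAGE_SHIFT;
    uint64_t beyond_end   = PAGE_TABLE_SIZE(lowmem_pages) << PAGE_SHIFT;

    /* 262144 lowmem pages -> 256 page-table pages -> 1 MiB of tables */
    printf("LOWMEM_PAGES=%llu MAPPING_BEYOND_END=%llu KiB\n",
           (unsigned long long)lowmem_pages,
           (unsigned long long)(beyond_end / 1024));
    return 0;
}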
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ae03cab4352e..4ff5968f12d2 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
27#define HPET_DEV_FSB_CAP 0x1000 27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000 28#define HPET_DEV_PERI_CAP 0x2000
29 29
30#define HPET_MIN_CYCLES 128
31#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
32
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) 33#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
31 34
32/* 35/*
@@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void)
299 /* Calculate the min / max delta */ 302 /* Calculate the min / max delta */
300 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 303 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
301 &hpet_clockevent); 304 &hpet_clockevent);
302 /* 5 usec minimum reprogramming delta. */ 305 /* Setup minimum reprogramming delta. */
303 hpet_clockevent.min_delta_ns = 5000; 306 hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
307 &hpet_clockevent);
304 308
305 /* 309 /*
306 * Start hpet with the boot cpu mask and make it 310 * Start hpet with the boot cpu mask and make it
@@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta,
393 * the wraparound into account) nor a simple count down event 397 * the wraparound into account) nor a simple count down event
394 * mode. Further the write to the comparator register is 398 * mode. Further the write to the comparator register is
395 * delayed internally up to two HPET clock cycles in certain 399 * delayed internally up to two HPET clock cycles in certain
396 * chipsets (ATI, ICH9,10). We worked around that by reading 400 * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
397 * back the compare register, but that required another 401 * longer delays. We worked around that by reading back the
398 * workaround for ICH9,10 chips where the first readout after 402 * compare register, but that required another workaround for
399 * write can return the old stale value. We already have a 403 * ICH9,10 chips where the first readout after write can
400 * minimum delta of 5us enforced, but a NMI or SMI hitting 404 * return the old stale value. We already had a minimum
 405 * programming delta of 5us enforced, but an NMI or SMI hitting
401 * between the counter readout and the comparator write can 406 * between the counter readout and the comparator write can
402 * move us behind that point easily. Now instead of reading 407 * move us behind that point easily. Now instead of reading
403 * the compare register back several times, we make the ETIME 408 * the compare register back several times, we make the ETIME
404 * decision based on the following: Return ETIME if the 409 * decision based on the following: Return ETIME if the
405 * counter value after the write is less than 8 HPET cycles 410 * counter value after the write is less than HPET_MIN_CYCLES
406 * away from the event or if the counter is already ahead of 411 * away from the event or if the counter is already ahead of
407 * the event. 412 * the event. The minimum programming delta for the generic
413 * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
408 */ 414 */
409 res = (s32)(cnt - hpet_readl(HPET_COUNTER)); 415 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
410 416
411 return res < 8 ? -ETIME : 0; 417 return res < HPET_MIN_CYCLES ? -ETIME : 0;
412} 418}
413 419
414static void hpet_legacy_set_mode(enum clock_event_mode mode, 420static void hpet_legacy_set_mode(enum clock_event_mode mode,
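The new constants tie the -ETIME cutoff to the clockevents minimum delta. A standalone sketch of the arithmetic (the counter readback values are hypothetical):

#include <stdio.h>

#define HPET_MIN_CYCLES     128
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))

int main(void)
{
    /* clockevents is never asked to program less than 1.5x the cutoff */
    printf("min programming delta: %d cycles\n", HPET_MIN_PROG_DELTA); /* 192 */

    /* hpet_next_event(): after writing the comparator, report -ETIME if
     * fewer than HPET_MIN_CYCLES remain before the event. */
    int cnt = 1000, counter_now = 900;  /* hypothetical readback */
    int res = cnt - counter_now;
    printf("%s\n", res < HPET_MIN_CYCLES ? "-ETIME (too close)" : "armed");
    return 0;
}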
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..42c594254507 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
433 dr6_p = (unsigned long *)ERR_PTR(args->err); 433 dr6_p = (unsigned long *)ERR_PTR(args->err);
434 dr6 = *dr6_p; 434 dr6 = *dr6_p;
435 435
436 /* If it's a single step, TRAP bits are random */
437 if (dr6 & DR_STEP)
438 return NOTIFY_DONE;
439
436 /* Do an early return if no trap bits are set in DR6 */ 440 /* Do an early return if no trap bits are set in DR6 */
437 if ((dr6 & DR_TRAP_BITS) == 0) 441 if ((dr6 & DR_TRAP_BITS) == 0)
438 return NOTIFY_DONE; 442 return NOTIFY_DONE;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ec592caac4b4..cd21b654dec6 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -315,14 +315,18 @@ static void kgdb_remove_all_hw_break(void)
315 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
316 continue; 316 continue;
317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
318 if (bp->attr.disabled == 1) 318 if (!bp->attr.disabled) {
319 arch_uninstall_hw_breakpoint(bp);
320 bp->attr.disabled = 1;
319 continue; 321 continue;
322 }
320 if (dbg_is_early) 323 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len, 324 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type); 325 breakinfo[i].type);
323 else 326 else if (hw_break_release_slot(i))
324 arch_uninstall_hw_breakpoint(bp); 327 printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
325 bp->attr.disabled = 1; 328 breakinfo[i].addr);
329 breakinfo[i].enabled = 0;
326 } 330 }
327} 331}
328 332
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1cbd54c0df99..5940282bd2f9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1184{ 1184{
1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 1186
1187 /* This is possible if op is under delayed unoptimizing */
1188 if (kprobe_disabled(&op->kp))
1189 return;
1190
1187 preempt_disable(); 1191 preempt_disable();
1188 if (kprobe_running()) { 1192 if (kprobe_running()) {
1189 kprobes_inc_nmissed_count(&op->kp); 1193 kprobes_inc_nmissed_count(&op->kp);
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1401 return 0; 1405 return 0;
1402} 1406}
1403 1407
1404/* Replace a breakpoint (int3) with a relative jump. */ 1408#define MAX_OPTIMIZE_PROBES 256
1405int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1409static struct text_poke_param *jump_poke_params;
1410static struct jump_poke_buffer {
1411 u8 buf[RELATIVEJUMP_SIZE];
1412} *jump_poke_bufs;
1413
1414static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1415 u8 *insn_buf,
1416 struct optimized_kprobe *op)
1406{ 1417{
1407 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1408 s32 rel = (s32)((long)op->optinsn.insn - 1418 s32 rel = (s32)((long)op->optinsn.insn -
1409 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1419 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1410 1420
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1412 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1422 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1413 RELATIVE_ADDR_SIZE); 1423 RELATIVE_ADDR_SIZE);
1414 1424
1415 jmp_code[0] = RELATIVEJUMP_OPCODE; 1425 insn_buf[0] = RELATIVEJUMP_OPCODE;
1416 *(s32 *)(&jmp_code[1]) = rel; 1426 *(s32 *)(&insn_buf[1]) = rel;
1427
1428 tprm->addr = op->kp.addr;
1429 tprm->opcode = insn_buf;
1430 tprm->len = RELATIVEJUMP_SIZE;
1431}
1432
1433/*
1434 * Replace breakpoints (int3) with relative jumps.
1435 * Caller must hold kprobe_mutex and text_mutex.
1436 */
1437void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1438{
1439 struct optimized_kprobe *op, *tmp;
1440 int c = 0;
1441
1442 list_for_each_entry_safe(op, tmp, oplist, list) {
1443 WARN_ON(kprobe_disabled(&op->kp));
1444 /* Setup param */
1445 setup_optimize_kprobe(&jump_poke_params[c],
1446 jump_poke_bufs[c].buf, op);
1447 list_del_init(&op->list);
1448 if (++c >= MAX_OPTIMIZE_PROBES)
1449 break;
1450 }
1417 1451
1418 /* 1452 /*
1419 * text_poke_smp doesn't support NMI/MCE code modifying. 1453 * text_poke_smp doesn't support NMI/MCE code modifying.
1420 * However, since kprobes itself also doesn't support NMI/MCE 1454 * However, since kprobes itself also doesn't support NMI/MCE
1421 * code probing, it's not a problem. 1455 * code probing, it's not a problem.
1422 */ 1456 */
1423 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1457 text_poke_smp_batch(jump_poke_params, c);
1424 return 0; 1458}
1459
1460static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1461 u8 *insn_buf,
1462 struct optimized_kprobe *op)
1463{
1464 /* Set int3 to first byte for kprobes */
1465 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1466 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1467
1468 tprm->addr = op->kp.addr;
1469 tprm->opcode = insn_buf;
1470 tprm->len = RELATIVEJUMP_SIZE;
1471}
1472
1473/*
1474 * Recover original instructions and breakpoints from relative jumps.
1475 * Caller must hold kprobe_mutex.
1476 */
1477extern void arch_unoptimize_kprobes(struct list_head *oplist,
1478 struct list_head *done_list)
1479{
1480 struct optimized_kprobe *op, *tmp;
1481 int c = 0;
1482
1483 list_for_each_entry_safe(op, tmp, oplist, list) {
1484 /* Setup param */
1485 setup_unoptimize_kprobe(&jump_poke_params[c],
1486 jump_poke_bufs[c].buf, op);
1487 list_move(&op->list, done_list);
1488 if (++c >= MAX_OPTIMIZE_PROBES)
1489 break;
1490 }
1491
1492 /*
1493 * text_poke_smp doesn't support NMI/MCE code modifying.
1494 * However, since kprobes itself also doesn't support NMI/MCE
1495 * code probing, it's not a problem.
1496 */
1497 text_poke_smp_batch(jump_poke_params, c);
1425} 1498}
1426 1499
1427/* Replace a relative jump with a breakpoint (int3). */ 1500/* Replace a relative jump with a breakpoint (int3). */
@@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1453 } 1526 }
1454 return 0; 1527 return 0;
1455} 1528}
1529
1530static int __kprobes init_poke_params(void)
1531{
1532 /* Allocate code buffer and parameter array */
1533 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1534 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1535 if (!jump_poke_bufs)
1536 return -ENOMEM;
1537
1538 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1539 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1540 if (!jump_poke_params) {
1541 kfree(jump_poke_bufs);
1542 jump_poke_bufs = NULL;
1543 return -ENOMEM;
1544 }
1545
1546 return 0;
1547}
1548#else /* !CONFIG_OPTPROBES */
1549static int __kprobes init_poke_params(void)
1550{
1551 return 0;
1552}
1456#endif 1553#endif
1457 1554
1458int __init arch_init_kprobes(void) 1555int __init arch_init_kprobes(void)
1459{ 1556{
1460 return 0; 1557 return init_poke_params();
1461} 1558}
1462 1559
1463int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1560int __kprobes arch_trampoline_kprobe(struct kprobe *p)
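The rework gathers up to MAX_OPTIMIZE_PROBES pokes into one parameter array and flushes them with a single text_poke_smp_batch() call; presumably that amortizes the cost of synchronizing all CPUs per poke, though this hunk doesn't spell that out. A generic gather-then-flush sketch (names are illustrative, not the kernel API):

#include <stdio.h>
#include <string.h>

#define MAX_BATCH 256

struct poke_param { void *addr; const unsigned char *opcode; size_t len; };

static struct poke_param params[MAX_BATCH];

static void apply_batch(struct poke_param *p, int n)
{
    int i;

    for (i = 0; i < n; i++)     /* stands in for text_poke_smp_batch() */
        memcpy(p[i].addr, p[i].opcode, p[i].len);
}

int main(void)
{
    unsigned char text[4] = { 0xcc, 0xcc, 0xcc, 0xcc }; /* fake .text */
    const unsigned char jmp[1] = { 0xe9 };
    int i, c = 0;

    for (i = 0; i < 4 && c < MAX_BATCH; i++) {          /* gather */
        params[c].addr   = &text[i];
        params[c].opcode = jmp;
        params[c].len    = 1;
        c++;
    }
    apply_batch(params, c);                             /* one flush */

    printf("%02x %02x %02x %02x\n", text[0], text[1], text[2], text[3]);
    return 0;
}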
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ce0cb4721c9a..0fe6d1a66c38 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 155 return 0;
156} 156}
157 157
158static int get_ucode_data(void *to, const u8 *from, size_t n)
159{
160 memcpy(to, from, n);
161 return 0;
162}
163
164static void * 158static void *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 160{
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
169 void *mc; 163 void *mc;
170 164
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
172 return NULL;
173 166
174 if (section_hdr[0] != UCODE_UCODE_TYPE) { 167 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n"); 168 pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 mc = vmalloc(UCODE_MAX_SIZE); 179 mc = vzalloc(UCODE_MAX_SIZE);
187 if (mc) { 180 if (!mc)
188 memset(mc, 0, UCODE_MAX_SIZE); 181 return NULL;
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 182
190 total_size)) { 183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
191 vfree(mc); 184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
192 mc = NULL; 185
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 186 return mc;
197} 187}
198 188
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 192 unsigned int *buf_pos = (unsigned int *)container_hdr;
203 unsigned long size; 193 unsigned long size;
204 194
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
206 return 0;
207 196
208 size = buf_pos[2]; 197 size = buf_pos[2];
209 198
@@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf)
219 } 208 }
220 209
221 buf += UCODE_CONTAINER_HEADER_SIZE; 210 buf += UCODE_CONTAINER_HEADER_SIZE;
222 if (get_ucode_data(equiv_cpu_table, buf, size)) { 211 get_ucode_data(equiv_cpu_table, buf, size);
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 212
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 214}
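The hunk above also folds the vmalloc()+memset() pair into vzalloc(), which returns already-zeroed memory. The userspace analogue of the same contract:

#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t n = 4096;

    /* old pattern: allocate, then zero by hand */
    void *a = malloc(n);
    if (a)
        memset(a, 0, n);

    /* vzalloc()-style: one call, memory arrives zeroed */
    void *b = calloc(1, n);

    free(a);
    free(b);
    return 0;
}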
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index dcb65cc0a053..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
364 364
365 /* For performance reasons, reuse mc area when possible */ 365 /* For performance reasons, reuse mc area when possible */
366 if (!mc || mc_size > curr_mc_size) { 366 if (!mc || mc_size > curr_mc_size) {
367 if (mc) 367 vfree(mc);
368 vfree(mc);
369 mc = vmalloc(mc_size); 368 mc = vmalloc(mc_size);
370 if (!mc) 369 if (!mc)
371 break; 370 break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
374 373
375 if (get_ucode_data(mc, ucode_ptr, mc_size) || 374 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
376 microcode_sanity_check(mc) < 0) { 375 microcode_sanity_check(mc) < 0) {
377 vfree(mc);
378 break; 376 break;
379 } 377 }
380 378
381 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 379 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
382 if (new_mc) 380 vfree(new_mc);
383 vfree(new_mc);
384 new_rev = mc_header.rev; 381 new_rev = mc_header.rev;
385 new_mc = mc; 382 new_mc = mc;
386 mc = NULL; /* trigger new vmalloc */ 383 mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
390 leftover -= mc_size; 387 leftover -= mc_size;
391 } 388 }
392 389
393 if (mc) 390 vfree(mc);
394 vfree(mc);
395 391
396 if (leftover) { 392 if (leftover) {
397 if (new_mc) 393 vfree(new_mc);
398 vfree(new_mc);
399 state = UCODE_ERROR; 394 state = UCODE_ERROR;
400 goto out; 395 goto out;
401 } 396 }
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 goto out; 400 goto out;
406 } 401 }
407 402
408 if (uci->mc) 403 vfree(uci->mc);
409 vfree(uci->mc);
410 uci->mc = (struct microcode_intel *)new_mc; 404 uci->mc = (struct microcode_intel *)new_mc;
411 405
412 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 406 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
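All of these deletions lean on one contract: vfree(), like free(), is a no-op when passed NULL, so every "if (mc) vfree(mc)" guard was redundant. In userspace terms:

#include <stdlib.h>

int main(void)
{
    void *mc = NULL;

    free(mc);   /* well-defined no-op on NULL; same contract as vfree() */
    return 0;
}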
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 6da143c2a6b8..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
25}; 25};
26 26
27static u64 __cpuinitdata fam10h_pci_mmconf_base; 27static u64 __cpuinitdata fam10h_pci_mmconf_base;
28static int __cpuinitdata fam10h_pci_mmconf_base_status;
29 28
30static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { 29static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
31 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
44 return start1 - start2; 43 return start1 - start2;
45} 44}
46 45
47/*[47:0] */ 46#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
48/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ 47#define MMCONF_MASK (~(MMCONF_UNIT - 1))
48#define MMCONF_SIZE (MMCONF_UNIT << 8)
49/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
49#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) 50#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
50#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) 51#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
51static void __cpuinit get_fam10h_pci_mmconf_base(void) 52static void __cpuinit get_fam10h_pci_mmconf_base(void)
52{ 53{
53 int i; 54 int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
64 struct range range[8]; 65 struct range range[8];
65 66
66 /* only try to get setting from BSP */ 67 /* only try to get setting from BSP */
67 /* -1 or 1 */ 68 if (fam10h_pci_mmconf_base)
68 if (fam10h_pci_mmconf_base_status)
69 return; 69 return;
70 70
71 if (!early_pci_allowed()) 71 if (!early_pci_allowed())
72 goto fail; 72 return;
73 73
74 found = 0; 74 found = 0;
75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
91 } 91 }
92 92
93 if (!found) 93 if (!found)
94 goto fail; 94 return;
95 95
96 /* SYS_CFG */ 96 /* SYS_CFG */
97 address = MSR_K8_SYSCFG; 97 address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
99 99
100 /* TOP_MEM2 is not enabled? */ 100 /* TOP_MEM2 is not enabled? */
101 if (!(val & (1<<21))) { 101 if (!(val & (1<<21))) {
102 tom2 = 0; 102 tom2 = 1ULL << 32;
103 } else { 103 } else {
104 /* TOP_MEM2 */ 104 /* TOP_MEM2 */
105 address = MSR_K8_TOP_MEM2; 105 address = MSR_K8_TOP_MEM2;
106 rdmsrl(address, val); 106 rdmsrl(address, val);
107 tom2 = val & (0xffffULL<<32); 107 tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
108 } 108 }
109 109
110 if (base <= tom2) 110 if (base <= tom2)
111 base = tom2 + (1ULL<<32); 111 base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
112 112
113 /* 113 /*
114 * need to check if the range is in the high mmio range that is 114 * need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
123 if (!(reg & 3)) 123 if (!(reg & 3))
124 continue; 124 continue;
125 125
126 start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 126 start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); 127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
128 end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 128 end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
129 129
130 if (!end) 130 if (end < tom2)
131 continue; 131 continue;
132 132
133 range[hi_mmio_num].start = start; 133 range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
143 143
144 if (range[hi_mmio_num - 1].end < base) 144 if (range[hi_mmio_num - 1].end < base)
145 goto out; 145 goto out;
146 if (range[0].start > base) 146 if (range[0].start > base + MMCONF_SIZE)
147 goto out; 147 goto out;
148 148
149 /* need to find one window */ 149 /* need to find one window */
150 base = range[0].start - (1ULL << 32); 150 base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
151 if ((base > tom2) && BASE_VALID(base)) 151 if ((base > tom2) && BASE_VALID(base))
152 goto out; 152 goto out;
153 base = range[hi_mmio_num - 1].end + (1ULL << 32); 153 base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
154 if ((base > tom2) && BASE_VALID(base)) 154 if (BASE_VALID(base))
155 goto out; 155 goto out;
156 /* need to find window between ranges */ 156 /* need to find window between ranges */
157 if (hi_mmio_num > 1) 157 for (i = 1; i < hi_mmio_num; i++) {
158 for (i = 0; i < hi_mmio_num - 1; i++) { 158 base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
159 if (range[i + 1].start > (range[i].end + (1ULL << 32))) { 159 val = range[i].start & MMCONF_MASK;
160 base = range[i].end + (1ULL << 32); 160 if (val >= base + MMCONF_SIZE && BASE_VALID(base))
161 if ((base > tom2) && BASE_VALID(base)) 161 goto out;
162 goto out;
163 }
164 } 162 }
165
166fail:
167 fam10h_pci_mmconf_base_status = -1;
168 return; 163 return;
164
169out: 165out:
170 fam10h_pci_mmconf_base = base; 166 fam10h_pci_mmconf_base = base;
171 fam10h_pci_mmconf_base_status = 1;
172} 167}
173 168
174void __cpuinit fam10h_check_enable_mmcfg(void) 169void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
190 185
191 /* only trust the one handle 256 buses, if acpi=off */ 186 /* only trust the one handle 256 buses, if acpi=off */
192 if (!acpi_pci_disabled || busnbits >= 8) { 187 if (!acpi_pci_disabled || busnbits >= 8) {
193 u64 base; 188 u64 base = val & MMCONF_MASK;
194 base = val & (0xffffULL << 32); 189
195 if (fam10h_pci_mmconf_base_status <= 0) { 190 if (!fam10h_pci_mmconf_base) {
196 fam10h_pci_mmconf_base = base; 191 fam10h_pci_mmconf_base = base;
197 fam10h_pci_mmconf_base_status = 1;
198 return; 192 return;
199 } else if (fam10h_pci_mmconf_base == base) 193 } else if (fam10h_pci_mmconf_base == base)
200 return; 194 return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
206 * with 256 buses 200 * with 256 buses
207 */ 201 */
208 get_fam10h_pci_mmconf_base(); 202 get_fam10h_pci_mmconf_base();
209 if (fam10h_pci_mmconf_base_status <= 0) 203 if (!fam10h_pci_mmconf_base) {
204 pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
210 return; 205 return;
206 }
211 207
212 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); 208 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
213 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | 209 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
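
As a side note on the window arithmetic introduced above: a minimal standalone C sketch, assuming FAM10H_MMIO_CONF_BASE_SHIFT is 20 (1 MiB of config space per bus, as in the kernel headers) and an illustrative TOP_MEM2 value. Rounding up with 2*UNIT-1 before masking is what guarantees the base lands strictly above TOM2 even when TOM2 is already aligned.

#include <stdio.h>
#include <stdint.h>

#define FAM10H_MMIO_CONF_BASE_SHIFT 20                /* assumed, per msr-index.h */
#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
#define MMCONF_MASK (~(MMCONF_UNIT - 1))
#define MMCONF_SIZE (MMCONF_UNIT << 8)                /* window for 256 buses */

int main(void)
{
    uint64_t tom2 = 0x140000000ULL;                   /* example TOP_MEM2 */

    /* Round up past TOM2 to the next MMCONF unit, as the patch does. */
    uint64_t base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;

    printf("base=%#llx size=%#llx end=%#llx\n",
           (unsigned long long)base,
           (unsigned long long)MMCONF_SIZE,
           (unsigned long long)(base + MMCONF_SIZE));
    return 0;
}
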
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/major.h> 33#include <linux/major.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/device.h> 35#include <linux/device.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..c852041bfc3d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -91,8 +91,7 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 91void show_regs(struct pt_regs *regs)
92{ 92{
93 show_registers(regs); 93 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
95 regs->bp);
96} 95}
97 96
98void show_regs_common(void) 97void show_regs_common(void)
@@ -374,6 +373,7 @@ void default_idle(void)
374{ 373{
375 if (hlt_use_halt()) { 374 if (hlt_use_halt()) {
376 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 375 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
376 trace_cpu_idle(1, smp_processor_id());
377 current_thread_info()->status &= ~TS_POLLING; 377 current_thread_info()->status &= ~TS_POLLING;
378 /* 378 /*
379 * TS_POLLING-cleared state must be visible before we 379 * TS_POLLING-cleared state must be visible before we
@@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 444void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 445{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); 446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 trace_cpu_idle((ax>>4)+1, smp_processor_id());
447 if (!need_resched()) { 448 if (!need_resched()) {
448 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 449 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
449 clflush((void *)&current_thread_info()->flags); 450 clflush((void *)&current_thread_info()->flags);
@@ -460,6 +461,7 @@ static void mwait_idle(void)
460{ 461{
461 if (!need_resched()) { 462 if (!need_resched()) {
462 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 463 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
464 trace_cpu_idle(1, smp_processor_id());
463 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 465 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
464 clflush((void *)&current_thread_info()->flags); 466 clflush((void *)&current_thread_info()->flags);
465 467
@@ -481,10 +483,12 @@ static void mwait_idle(void)
481static void poll_idle(void) 483static void poll_idle(void)
482{ 484{
483 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 485 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
486 trace_cpu_idle(0, smp_processor_id());
484 local_irq_enable(); 487 local_irq_enable();
485 while (!need_resched()) 488 while (!need_resched())
486 cpu_relax(); 489 cpu_relax();
487 trace_power_end(0); 490 trace_power_end(smp_processor_id());
491 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
488} 492}
489 493
490/* 494/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..4b9befa0e347 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
113 stop_critical_timings(); 113 stop_critical_timings();
114 pm_idle(); 114 pm_idle();
115 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id()); 116 trace_power_end(smp_processor_id());
117 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
118 } 118 }
119 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a04f38..4c818a738396 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
142 start_critical_timings(); 142 start_critical_timings();
143 143
144 trace_power_end(smp_processor_id()); 144 trace_power_end(smp_processor_id());
145 trace_cpu_idle(PWR_EVENT_EXIT,
146 smp_processor_id());
145 147
146 /* In many cases the interrupt that ended idle 148 /* In many cases the interrupt that ended idle
147 has already called exit_idle. But some idle 149 has already called exit_idle. But some idle
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 008b91eefa18..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -83,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
83 83
84static atomic64_t last_value = ATOMIC64_INIT(0); 84static atomic64_t last_value = ATOMIC64_INIT(0);
85 85
86void pvclock_resume(void)
87{
88 atomic64_set(&last_value, 0);
89}
90
86cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
87{ 92{
88 struct pvclock_shadow_time shadow; 93 struct pvclock_shadow_time shadow;
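
Why pvclock_resume() zeroes last_value: reads are clamped against a global high-water mark so the clock never goes backwards, and a stale mark carried across suspend could pin the clock. A minimal userspace sketch of that clamp, using C11 atomics in place of the kernel's atomic64 cmpxchg loop:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t last_value;

uint64_t clamped_read(uint64_t raw)
{
    uint64_t last = atomic_load(&last_value);
    do {
        if (raw < last)
            return last;          /* never go backwards */
    } while (!atomic_compare_exchange_weak(&last_value, &last, raw));
    return raw;
}

void resume(void)
{
    atomic_store(&last_value, 0); /* what pvclock_resume() does */
}
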
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
1#include <linux/ioport.h>
2#include <asm/e820.h>
3
4static void resource_clip(struct resource *res, resource_size_t start,
5 resource_size_t end)
6{
7 resource_size_t low = 0, high = 0;
8
9 if (res->end < start || res->start > end)
10 return; /* no conflict */
11
12 if (res->start < start)
13 low = start - res->start;
14
15 if (res->end > end)
16 high = res->end - end;
17
18 /* Keep the area above or below the conflict, whichever is larger */
19 if (low > high)
20 res->end = start - 1;
21 else
22 res->start = end + 1;
23}
24
25static void remove_e820_regions(struct resource *avail)
26{
27 int i;
28 struct e820entry *entry;
29
30 for (i = 0; i < e820.nr_map; i++) {
31 entry = &e820.map[i];
32
33 resource_clip(avail, entry->addr,
34 entry->addr + entry->size - 1);
35 }
36}
37
38void arch_remove_reservations(struct resource *avail)
39{
40 /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
41 if (avail->flags & IORESOURCE_MEM) {
42 if (avail->start < BIOS_END)
43 avail->start = BIOS_END;
44 resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
45
46 remove_e820_regions(avail);
47 }
48}
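
A standalone illustration of the resource_clip() policy in the new file above: when a reserved region overlaps an available window, keep whichever side of the conflict is larger. Types are simplified for the sketch; the e820 hole below is hypothetical.

#include <stdio.h>
#include <stdint.h>

struct res { uint64_t start, end; };

static void clip(struct res *r, uint64_t start, uint64_t end)
{
    uint64_t low = 0, high = 0;

    if (r->end < start || r->start > end)
        return;                       /* no conflict */
    if (r->start < start)
        low = start - r->start;       /* room below the conflict */
    if (r->end > end)
        high = r->end - end;          /* room above the conflict */
    if (low > high)
        r->end = start - 1;           /* keep the lower side */
    else
        r->start = end + 1;           /* keep the upper side */
}

int main(void)
{
    struct res avail = { 0xa0000000, 0xffffffff };
    clip(&avail, 0xf0000000, 0xf7ffffff);  /* hypothetical e820 hole */
    printf("%#llx-%#llx\n", (unsigned long long)avail.start,
           (unsigned long long)avail.end); /* larger low side survives */
    return 0;
}
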
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0afb8c7e3803..d3cfe26c0252 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void)
501 return total << PAGE_SHIFT; 501 return total << PAGE_SHIFT;
502} 502}
503 503
504#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF 504/*
505 * Keep the crash kernel below this limit. On 32 bits, earlier kernels
506 * would limit the kernel to the low 512 MiB due to mapping restrictions.
507 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
508 * limit once kexec-tools are fixed.
509 */
510#ifdef CONFIG_X86_32
511# define CRASH_KERNEL_ADDR_MAX (512 << 20)
512#else
513# define CRASH_KERNEL_ADDR_MAX (896 << 20)
514#endif
515
505static void __init reserve_crashkernel(void) 516static void __init reserve_crashkernel(void)
506{ 517{
507 unsigned long long total_mem; 518 unsigned long long total_mem;
@@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void)
520 const unsigned long long alignment = 16<<20; /* 16M */ 531 const unsigned long long alignment = 16<<20; /* 16M */
521 532
522 /* 533 /*
523 * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX 534 * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX
524 */ 535 */
525 crash_base = memblock_find_in_range(alignment, 536 crash_base = memblock_find_in_range(alignment,
526 DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); 537 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
527 538
528 if (crash_base == MEMBLOCK_ERROR) { 539 if (crash_base == MEMBLOCK_ERROR) {
529 pr_info("crashkernel reservation failed - No suitable area found.\n"); 540 pr_info("crashkernel reservation failed - No suitable area found.\n");
@@ -769,7 +780,6 @@ void __init setup_arch(char **cmdline_p)
769 780
770 x86_init.oem.arch_setup(); 781 x86_init.oem.arch_setup();
771 782
772 resource_alloc_from_bottom = 0;
773 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 783 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
774 setup_memory_map(); 784 setup_memory_map();
775 parse_setup_data(); 785 parse_setup_data();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 083e99d1b7df..ee886fe10ef4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
281 */ 281 */
282 smp_store_cpu_info(cpuid); 282 smp_store_cpu_info(cpuid);
283 283
284 /*
285 * This must be done before setting cpu_online_mask
286 * or calling notify_cpu_starting.
287 */
288 set_cpu_sibling_map(raw_smp_processor_id());
289 wmb();
290
284 notify_cpu_starting(cpuid); 291 notify_cpu_starting(cpuid);
285 292
286 /* 293 /*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
316 */ 323 */
317 check_tsc_sync_target(); 324 check_tsc_sync_target();
318 325
319 if (nmi_watchdog == NMI_IO_APIC) {
320 legacy_pic->mask(0);
321 enable_NMI_through_LVT0();
322 legacy_pic->unmask(0);
323 }
324
325 /* This must be done before setting cpu_online_mask */
326 set_cpu_sibling_map(raw_smp_processor_id());
327 wmb();
328
329 /* 326 /*
330 * We need to hold call_lock, so there is no inconsistency 327 * We need to hold call_lock, so there is no inconsistency
331 * between the time smp_call_function() determines number of 328 * between the time smp_call_function() determines number of
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
1061 printk(KERN_INFO "SMP mode deactivated.\n"); 1058 printk(KERN_INFO "SMP mode deactivated.\n");
1062 smpboot_clear_io_apic(); 1059 smpboot_clear_io_apic();
1063 1060
1064 localise_nmi_watchdog();
1065
1066 connect_bsp_APIC(); 1061 connect_bsp_APIC();
1067 setup_local_APIC(); 1062 setup_local_APIC();
1068 end_local_APIC_setup(); 1063 end_local_APIC_setup();
@@ -1166,6 +1161,20 @@ out:
1166 preempt_enable(); 1161 preempt_enable();
1167} 1162}
1168 1163
1164void arch_disable_nonboot_cpus_begin(void)
1165{
1166 /*
1167 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1168 * In the suspend path, we will be back in the SMP mode shortly anyways.
1169 */
1170 skip_smp_alternatives = true;
1171}
1172
1173void arch_disable_nonboot_cpus_end(void)
1174{
1175 skip_smp_alternatives = false;
1176}
1177
1169void arch_enable_nonboot_cpus_begin(void) 1178void arch_enable_nonboot_cpus_begin(void)
1170{ 1179{
1171 set_mtrr_aps_delayed_init(); 1180 set_mtrr_aps_delayed_init();
@@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1196#ifdef CONFIG_X86_IO_APIC 1205#ifdef CONFIG_X86_IO_APIC
1197 setup_ioapic_dest(); 1206 setup_ioapic_dest();
1198#endif 1207#endif
1199 check_nmi_watchdog();
1200 mtrr_aps_init(); 1208 mtrr_aps_init();
1201} 1209}
1202 1210
@@ -1341,8 +1349,6 @@ int native_cpu_disable(void)
1341 if (cpu == 0) 1349 if (cpu == 0)
1342 return -EBUSY; 1350 return -EBUSY;
1343 1351
1344 if (nmi_watchdog == NMI_LOCAL_APIC)
1345 stop_apic_nmi_watchdog(NULL);
1346 clear_local_APIC(); 1352 clear_local_APIC();
1347 1353
1348 cpu_disable_common(); 1354 cpu_disable_common();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..938c8e10a19a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
80EXPORT_SYMBOL_GPL(save_stack_trace); 80EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..25a28a245937 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,10 +22,6 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif 27#endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
63 /* Keep nmi watchdog up to date */ 59 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs); 60 inc_irq_stat(irq0_irqs);
65 61
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 raw_spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
81 63
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..075d130efcf9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
127no_longmode: 127no_longmode:
128 hlt 128 hlt
129 jmp no_longmode 129 jmp no_longmode
130#include "verify_cpu_64.S" 130#include "verify_cpu.S"
131 131
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb838ca42c96..c76aaca5694d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
83 83
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic;
87
86static inline void conditional_sti(struct pt_regs *regs) 88static inline void conditional_sti(struct pt_regs *regs)
87{ 89{
88 if (regs->flags & X86_EFLAGS_IF) 90 if (regs->flags & X86_EFLAGS_IF)
@@ -300,6 +302,13 @@ gp_in_kernel:
300 die("general protection fault", regs, error_code); 302 die("general protection fault", regs, error_code);
301} 303}
302 304
305static int __init setup_unknown_nmi_panic(char *str)
306{
307 unknown_nmi_panic = 1;
308 return 1;
309}
310__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
311
303static notrace __kprobes void 312static notrace __kprobes void
304mem_parity_error(unsigned char reason, struct pt_regs *regs) 313mem_parity_error(unsigned char reason, struct pt_regs *regs)
305{ 314{
@@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
342 reason = (reason & 0xf) | 8; 351 reason = (reason & 0xf) | 8;
343 outb(reason, 0x61); 352 outb(reason, 0x61);
344 353
345 i = 2000; 354 i = 20000;
346 while (--i) 355 while (--i) {
347 udelay(1000); 356 touch_nmi_watchdog();
357 udelay(100);
358 }
348 359
349 reason &= ~8; 360 reason &= ~8;
350 outb(reason, 0x61); 361 outb(reason, 0x61);
@@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
371 reason, smp_processor_id()); 382 reason, smp_processor_id());
372 383
373 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 384 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
374 if (panic_on_unrecovered_nmi) 385 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
375 panic("NMI: Not continuing"); 386 panic("NMI: Not continuing");
376 387
377 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 388 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
@@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 408 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP) 409 == NOTIFY_STOP)
399 return; 410 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
402 /*
403 * Ok, so this is none of the documented NMI sources,
404 * so it must be the NMI watchdog.
405 */
406 if (nmi_watchdog_tick(regs, reason))
407 return;
408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
410 unknown_nmi_error(reason, regs);
411#else
412 unknown_nmi_error(reason, regs);
413#endif 411#endif
412 unknown_nmi_error(reason, regs);
414 413
415 return; 414 return;
416 } 415 }
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
446 445
447void stop_nmi(void) 446void stop_nmi(void)
448{ 447{
449 acpi_nmi_disable();
450 ignore_nmis++; 448 ignore_nmis++;
451} 449}
452 450
453void restart_nmi(void) 451void restart_nmi(void)
454{ 452{
455 ignore_nmis--; 453 ignore_nmis--;
456 acpi_nmi_enable();
457} 454}
458 455
459/* May run on IST stack. */ 456/* May run on IST stack. */
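
The io_check_error() change keeps the total NMI-reset delay at roughly two seconds but shortens each step so the watchdog can be petted between steps. A quick sketch of the new loop shape, with the kernel calls stubbed out:

#include <stdio.h>

static void touch_nmi_watchdog(void) { /* stub: kernel pets the watchdog */ }
static void udelay(int us) { (void)us; /* stub: busy-wait */ }

int main(void)
{
    int i = 20000, total = 0;

    /* New shape: 20000 x 100us with a watchdog touch each pass,
     * replacing 2000 x 1000us with no touch. Same ~2s total. */
    while (--i) {
        touch_nmi_watchdog();
        udelay(100);
        total += 100;
    }
    printf("total delay ~%d us\n", total); /* ~2,000,000 us */
    return 0;
}
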
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b72416..356a0d455cf9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
872 872
873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
874 return 0; 874 return 0;
875
876 if (tsc_clocksource_reliable)
877 return 0;
875 /* 878 /*
876 * Intel systems are normally all synchronized. 879 * Intel systems are normally all synchronized.
877 * Exceptions must mark TSC as unstable: 880 * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
879 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 882 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
880 /* assume multi socket systems are not synchronized: */ 883 /* assume multi socket systems are not synchronized: */
881 if (num_possible_cpus() > 1) 884 if (num_possible_cpus() > 1)
882 tsc_unstable = 1; 885 return 1;
883 } 886 }
884 887
885 return tsc_unstable; 888 return 0;
889}
890
891
892static void tsc_refine_calibration_work(struct work_struct *work);
893static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
894/**
895 * tsc_refine_calibration_work - Further refine tsc freq calibration
896 * @work - ignored.
897 *
898 * This function uses delayed work over a period of a
899 * second to further refine the TSC freq value. Since this is
900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done.
902 *
903 * If there are any calibration anomalies (too many SMIs, etc),
904 * or the refined calibration is off by 1% of the fast early
905 * calibration, we throw out the new calibration and use the
906 * early calibration.
907 */
908static void tsc_refine_calibration_work(struct work_struct *work)
909{
910 static u64 tsc_start = -1, ref_start;
911 static int hpet;
912 u64 tsc_stop, ref_stop, delta;
913 unsigned long freq;
914
915 /* Don't bother refining TSC on unstable systems */
916 if (check_tsc_unstable())
917 goto out;
918
919 /*
920 * Since the work is started early in boot, we may be
921 * delayed the first time we expire. So set the workqueue
922 * again once we know timers are working.
923 */
924 if (tsc_start == -1) {
925 /*
926 * Only set hpet once, to avoid mixing hardware
927 * if the hpet becomes enabled later.
928 */
929 hpet = is_hpet_enabled();
930 schedule_delayed_work(&tsc_irqwork, HZ);
931 tsc_start = tsc_read_refs(&ref_start, hpet);
932 return;
933 }
934
935 tsc_stop = tsc_read_refs(&ref_stop, hpet);
936
937 /* hpet or pmtimer available ? */
938 if (!hpet && !ref_start && !ref_stop)
939 goto out;
940
941 /* Check, whether the sampling was disturbed by an SMI */
942 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
943 goto out;
944
945 delta = tsc_stop - tsc_start;
946 delta *= 1000000LL;
947 if (hpet)
948 freq = calc_hpet_ref(delta, ref_start, ref_stop);
949 else
950 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
951
952 /* Make sure we're within 1% */
953 if (abs(tsc_khz - freq) > tsc_khz/100)
954 goto out;
955
956 tsc_khz = freq;
957 printk(KERN_INFO "Refined TSC clocksource calibration: "
958 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
959 (unsigned long)tsc_khz % 1000);
960
961out:
962 clocksource_register_khz(&clocksource_tsc, tsc_khz);
886} 963}
887 964
888static void __init init_tsc_clocksource(void) 965
966static int __init init_tsc_clocksource(void)
889{ 967{
968 if (!cpu_has_tsc || tsc_disabled > 0)
969 return 0;
970
890 if (tsc_clocksource_reliable) 971 if (tsc_clocksource_reliable)
891 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 972 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
892 /* lower the rating if we already know its unstable: */ 973 /* lower the rating if we already know its unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
894 clocksource_tsc.rating = 0; 975 clocksource_tsc.rating = 0;
895 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 976 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
896 } 977 }
897 clocksource_register_khz(&clocksource_tsc, tsc_khz); 978 schedule_delayed_work(&tsc_irqwork, 0);
979 return 0;
898} 980}
981/*
982 * We use device_initcall here, to ensure we run after the hpet
983 * is fully initialized, which may occur at fs_initcall time.
984 */
985device_initcall(init_tsc_clocksource);
899 986
900void __init tsc_init(void) 987void __init tsc_init(void)
901{ 988{
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
949 mark_tsc_unstable("TSCs unsynchronized"); 1036 mark_tsc_unstable("TSCs unsynchronized");
950 1037
951 check_system_tsc_reliable(); 1038 check_system_tsc_reliable();
952 init_tsc_clocksource();
953} 1039}
954 1040
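
A sketch of the refinement math behind tsc_refine_calibration_work(): TSC cycles elapsed over a reference interval give the frequency, and the result is accepted only if it lands within 1% of the early boot estimate. The kernel's calc_hpet_ref()/calc_pmtimer_ref() helpers do the reference conversion; here the interval is assumed to be already in microseconds, and all values are illustrative.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
    uint64_t tsc_start = 1000000, tsc_stop = 2997000000ULL;
    uint64_t ref_us = 1000000;              /* ~1s reference interval */
    unsigned long early_khz = 3000000;      /* fast early calibration */

    unsigned long freq = (tsc_stop - tsc_start) * 1000ULL / ref_us; /* kHz */

    if (labs((long)(early_khz - freq)) > early_khz / 100)
        freq = early_khz;                   /* refinement rejected */

    printf("Refined TSC: %lu.%03lu MHz\n", freq / 1000, freq % 1000);
    return 0;
}
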
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..0edefc19a113 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is a common code for verification whether CPU supports 15 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly; instead, this 16 * long mode and SSE or not. It is not called directly; instead, this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75 # only call IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
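
A C rendering of the family/model gate the verify_cpu hunk adds: the IA32_MISC_ENABLE XD-disable bit is only cleared on Intel CPUs with family > 6, or family == 6 and model >= 0xd. The masks mirror the assembly's andl/shrl sequence; this is an illustration, not kernel code.

#include <stdio.h>
#include <stdint.h>

static int should_clear_xd_disable(uint32_t cpuid_1_eax)
{
    uint32_t fam = (cpuid_1_eax & 0x0ff00f00) >> 8;  /* ext+base family */
    uint32_t mod = (cpuid_1_eax & 0x000f00f0) >> 4;  /* ext+base model  */

    if (fam > 6)
        return 1;
    if (fam < 6)
        return 0;
    return mod >= 0xd;
}

int main(void)
{
    printf("%d\n", should_clear_xd_disable(0x000006d6)); /* fam 6, model 0xd -> 1 */
    return 0;
}
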
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530aebfd0..bf4700755184 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
116 116
117 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
118 118
119#if defined(CONFIG_DEBUG_RODATA)
120 /* .text should occupy whole number of pages */
121 . = ALIGN(PAGE_SIZE);
122#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 123 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 124 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 125 X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
335 __bss_start = .; 339 __bss_start = .;
336 *(.bss..page_aligned) 340 *(.bss..page_aligned)
337 *(.bss) 341 *(.bss)
338 . = ALIGN(4); 342 . = ALIGN(PAGE_SIZE);
339 __bss_stop = .; 343 __bss_stop = .;
340 } 344 }
341 345
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..547128546cc3 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
394 * Setup init_xstate_buf to represent the init state of 394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave 395 * all the features managed by the xsave
396 */ 396 */
397 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem_align(xstate_size,
398 __alignof__(struct xsave_struct));
398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 399 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399 400
400 clts(); 401 clts();
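
The xsave.c change exists because plain alloc_bootmem() only guarantees cache-line alignment, while the xsave area needs the alignment of struct xsave_struct (64 bytes on current CPUs; the exact value here is an assumption for illustration). A generic align-up helper showing the requirement:

#include <stdio.h>
#include <stdint.h>

static uintptr_t align_up(uintptr_t p, size_t a)
{
    return (p + a - 1) & ~(uintptr_t)(a - 1);
}

int main(void)
{
    printf("%#lx\n", (unsigned long)align_up(0x1010, 64)); /* 0x1040 */
    return 0;
}
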
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index f628234fbeca..3cece05e4ac4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 576 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 577 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
578 580
579 /* 581 /*
580 * Initialize PIO device 582 * Initialize PIO device
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb8b376bf28c..fbb04aee8301 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2394,7 +2394,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2394 ASSERT(!VALID_PAGE(root)); 2394 ASSERT(!VALID_PAGE(root));
2395 spin_lock(&vcpu->kvm->mmu_lock); 2395 spin_lock(&vcpu->kvm->mmu_lock);
2396 kvm_mmu_free_some_pages(vcpu); 2396 kvm_mmu_free_some_pages(vcpu);
2397 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, 2397 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2398 i << 30,
2398 PT32_ROOT_LEVEL, 1, ACC_ALL, 2399 PT32_ROOT_LEVEL, 1, ACC_ALL,
2399 NULL); 2400 NULL);
2400 root = __pa(sp->spt); 2401 root = __pa(sp->spt);
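
The mmu.c fix above hinges on the difference between a guest physical address (gpa) and a guest frame number (gfn = gpa >> PAGE_SHIFT): kvm_mmu_get_page() takes a gfn, so with 1 GiB PAE roots, slot i starts at gfn i << (30 - PAGE_SHIFT), not i << 30. A quick demonstration with PAGE_SHIFT = 12:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
    for (int i = 0; i < 4; i++) {
        uint64_t gpa = (uint64_t)i << 30;
        uint64_t gfn = (uint64_t)i << (30 - PAGE_SHIFT);
        printf("slot %d: gpa=%#llx gfn=%#llx (gpa>>12=%#llx)\n",
               i, (unsigned long long)gpa, (unsigned long long)gfn,
               (unsigned long long)(gpa >> PAGE_SHIFT));
    }
    return 0;
}
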
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 82e144a4e514..b81a9b7c2ca4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3395,6 +3395,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3396 3396
3397 load_host_msrs(vcpu); 3397 load_host_msrs(vcpu);
3398 kvm_load_ldt(ldt_selector);
3398 loadsegment(fs, fs_selector); 3399 loadsegment(fs, fs_selector);
3399#ifdef CONFIG_X86_64 3400#ifdef CONFIG_X86_64
3400 load_gs_index(gs_selector); 3401 load_gs_index(gs_selector);
@@ -3402,7 +3403,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3402#else 3403#else
3403 loadsegment(gs, gs_selector); 3404 loadsegment(gs, gs_selector);
3404#endif 3405#endif
3405 kvm_load_ldt(ldt_selector);
3406 3406
3407 reload_tss(vcpu); 3407 reload_tss(vcpu);
3408 3408
@@ -3494,6 +3494,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3495{ 3495{
3496 switch (func) { 3496 switch (func) {
3497 case 0x00000001:
3498 /* Mask out xsave bit as long as it is not supported by SVM */
3499 entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
3500 break;
3497 case 0x80000001: 3501 case 0x80000001:
3498 if (nested) 3502 if (nested)
3499 entry->ecx |= (1 << 2); /* Set SVM bit */ 3503 entry->ecx |= (1 << 2); /* Set SVM bit */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8da0e45ff7c9..81fcbe9515c5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -821,10 +821,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
821#endif 821#endif
822 822
823#ifdef CONFIG_X86_64 823#ifdef CONFIG_X86_64
824 if (is_long_mode(&vmx->vcpu)) { 824 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
825 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 825 if (is_long_mode(&vmx->vcpu))
826 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 826 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
827 }
828#endif 827#endif
829 for (i = 0; i < vmx->save_nmsrs; ++i) 828 for (i = 0; i < vmx->save_nmsrs; ++i)
830 kvm_set_shared_msr(vmx->guest_msrs[i].index, 829 kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -839,23 +838,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
839 838
840 ++vmx->vcpu.stat.host_state_reload; 839 ++vmx->vcpu.stat.host_state_reload;
841 vmx->host_state.loaded = 0; 840 vmx->host_state.loaded = 0;
842 if (vmx->host_state.fs_reload_needed) 841#ifdef CONFIG_X86_64
843 loadsegment(fs, vmx->host_state.fs_sel); 842 if (is_long_mode(&vmx->vcpu))
843 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
844#endif
844 if (vmx->host_state.gs_ldt_reload_needed) { 845 if (vmx->host_state.gs_ldt_reload_needed) {
845 kvm_load_ldt(vmx->host_state.ldt_sel); 846 kvm_load_ldt(vmx->host_state.ldt_sel);
846#ifdef CONFIG_X86_64 847#ifdef CONFIG_X86_64
847 load_gs_index(vmx->host_state.gs_sel); 848 load_gs_index(vmx->host_state.gs_sel);
848 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
849#else 849#else
850 loadsegment(gs, vmx->host_state.gs_sel); 850 loadsegment(gs, vmx->host_state.gs_sel);
851#endif 851#endif
852 } 852 }
853 if (vmx->host_state.fs_reload_needed)
854 loadsegment(fs, vmx->host_state.fs_sel);
853 reload_tss(); 855 reload_tss();
854#ifdef CONFIG_X86_64 856#ifdef CONFIG_X86_64
855 if (is_long_mode(&vmx->vcpu)) { 857 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
856 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
857 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
858 }
859#endif 858#endif
860 if (current_thread_info()->status & TS_USEDFPU) 859 if (current_thread_info()->status & TS_USEDFPU)
861 clts(); 860 clts();
@@ -4228,11 +4227,6 @@ static int vmx_get_lpage_level(void)
4228 return PT_PDPE_LEVEL; 4227 return PT_PDPE_LEVEL;
4229} 4228}
4230 4229
4231static inline u32 bit(int bitno)
4232{
4233 return 1 << (bitno & 31);
4234}
4235
4236static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 4230static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4237{ 4231{
4238 struct kvm_cpuid_entry2 *best; 4232 struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa5..b989e1f1e5d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -155,11 +155,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 155
156u64 __read_mostly host_xcr0; 156u64 __read_mostly host_xcr0;
157 157
158static inline u32 bit(int bitno)
159{
160 return 1 << (bitno & 31);
161}
162
163static void kvm_on_user_return(struct user_return_notifier *urn) 158static void kvm_on_user_return(struct user_return_notifier *urn)
164{ 159{
165 unsigned slot; 160 unsigned slot;
@@ -4569,9 +4564,11 @@ static void kvm_timer_init(void)
4569#ifdef CONFIG_CPU_FREQ 4564#ifdef CONFIG_CPU_FREQ
4570 struct cpufreq_policy policy; 4565 struct cpufreq_policy policy;
4571 memset(&policy, 0, sizeof(policy)); 4566 memset(&policy, 0, sizeof(policy));
4572 cpufreq_get_policy(&policy, get_cpu()); 4567 cpu = get_cpu();
4568 cpufreq_get_policy(&policy, cpu);
4573 if (policy.cpuinfo.max_freq) 4569 if (policy.cpuinfo.max_freq)
4574 max_tsc_khz = policy.cpuinfo.max_freq; 4570 max_tsc_khz = policy.cpuinfo.max_freq;
4571 put_cpu();
4575#endif 4572#endif
4576 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4573 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4577 CPUFREQ_TRANSITION_NOTIFIER); 4574 CPUFREQ_TRANSITION_NOTIFIER);
@@ -5522,6 +5519,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 5519
5523 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5520 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5524 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5521 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5522 if (sregs->cr4 & X86_CR4_OSXSAVE)
5523 update_cpuid(vcpu);
5525 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5524 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5526 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5525 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
5527 mmu_reset_needed = 1; 5526 mmu_reset_needed = 1;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414489f3..c600da830ce0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
71} 71}
72 72
73static inline u32 bit(int bitno)
74{
75 return 1 << (bitno & 31);
76}
77
73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); 80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 73b1e1a1f489..4996cf5f73a0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3)
531{ 531{
532 lguest_data.pgdir = cr3; 532 lguest_data.pgdir = cr3;
533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
534 cr3_changed = true; 534
535 /* These two page tables are simple, linear, and used during boot */
536 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
537 cr3_changed = true;
535} 538}
536 539
537static unsigned long lguest_read_cr3(void) 540static unsigned long lguest_read_cr3(void)
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
703 * to forget all of them. Fortunately, this is very rare. 706 * to forget all of them. Fortunately, this is very rare.
704 * 707 *
705 * ... except in early boot when the kernel sets up the initial pagetables, 708 * ... except in early boot when the kernel sets up the initial pagetables,
706 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 709 * which makes booting astonishingly slow: 48 seconds! So we don't even tell
707 * the Host anything changed until we've done the first page table switch, 710 * the Host anything changed until we've done the first real page table switch,
708 * which brings boot back to 0.25 seconds. 711 * which brings boot back to 4.3 seconds.
709 */ 712 */
710static void lguest_set_pte(pte_t *ptep, pte_t pteval) 713static void lguest_set_pte(pte_t *ptep, pte_t pteval)
711{ 714{
@@ -1002,7 +1005,7 @@ static void lguest_time_init(void)
1002 clockevents_register_device(&lguest_clockevent); 1005 clockevents_register_device(&lguest_clockevent);
1003 1006
1004 /* Finally, we unblock the timer interrupt. */ 1007 /* Finally, we unblock the timer interrupt. */
1005 enable_lguest_irq(0); 1008 clear_bit(0, lguest_data.blocked_interrupts);
1006} 1009}
1007 1010
1008/* 1011/*
@@ -1349,9 +1352,6 @@ __init void lguest_init(void)
1349 */ 1352 */
1350 switch_to_new_gdt(0); 1353 switch_to_new_gdt(0);
1351 1354
1352 /* We actually boot with all memory mapped, but let's say 128MB. */
1353 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1354
1355 /* 1355 /*
1356 * The Host<->Guest Switcher lives at the top of our address space, and 1356 * The Host<->Guest Switcher lives at the top of our address space, and
1357 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1357 * the Host told us how big it is when we made LGUEST_INIT hypercall:
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
364 /* 364 /*
365 * We just marked the kernel text read only above, now that 365 * We just marked the kernel text read only above, now that
366 * we are going to free part of that, we need to make that 366 * we are going to free part of that, we need to make that
367 * writeable first. 367 * writeable and non-executable first.
368 */ 368 */
369 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
369 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 370 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
370 371
371 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 372 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..f89b5bb4e93f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
226 226
227static inline int is_kernel_text(unsigned long addr) 227static inline int is_kernel_text(unsigned long addr)
228{ 228{
229 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 229 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
230 return 1; 230 return 1;
231 return 0; 231 return 0;
232} 232}
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
913} 913}
914 914
915static void mark_nxdata_nx(void)
916{
917 /*
918 * When this is called, init has already been executed and released,
919 * so everything past _etext should be NX.
920 */
921 unsigned long start = PFN_ALIGN(_etext);
922 /*
923 * This comes from the is_kernel_text() upper limit. Round up to HPAGE_SIZE, since huge pages were used for the mapping:
924 */
925 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
926
927 if (__supported_pte_mask & _PAGE_NX)
928 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
929 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
930}
931
915void mark_rodata_ro(void) 932void mark_rodata_ro(void)
916{ 933{
917 unsigned long start = PFN_ALIGN(_text); 934 unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
946 printk(KERN_INFO "Testing CPA: write protecting again\n"); 963 printk(KERN_INFO "Testing CPA: write protecting again\n");
947 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 964 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
948#endif 965#endif
966 mark_nxdata_nx();
949} 967}
950#endif 968#endif
951 969
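
The size computation in mark_nxdata_nx() rounds the end of the init section up to a huge-page boundary before subtracting the start, so the NX range covers whole large pages. A sketch with assumed values (HPAGE_SIZE taken as 4 MiB here; it is 2 MiB under PAE):

#include <stdio.h>
#include <stdint.h>

#define HPAGE_SIZE (4UL << 20)          /* assumed non-PAE huge page */
#define HPAGE_MASK (~(HPAGE_SIZE - 1))

int main(void)
{
    uintptr_t etext    = 0xc1400000;    /* illustrative PFN-aligned _etext */
    uintptr_t init_end = 0xc1723000;    /* illustrative __init_end */

    uintptr_t size = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - etext;
    printf("NX-protecting %luk\n", (unsigned long)(size >> 10)); /* 4096k */
    return 0;
}
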
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 256 unsigned long pfn)
256{ 257{
257 pgprot_t forbidden = __pgprot(0); 258 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
258 260
259 /* 261 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 262 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 263 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 264 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 265#ifdef CONFIG_PCI_BIOS
266 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 267 pgprot_val(forbidden) |= _PAGE_NX;
268#endif
265 269
266 /* 270 /*
267 * The kernel text needs to be executable for obvious reasons 271 * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
278 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 284 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
281 291
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /* 293 /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
317#endif 327#endif
318 328
319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
320 331
321 return prot; 332 return prot;
322} 333}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
393{ 404{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 405 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 406 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 407 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 408 int i, do_split = 1;
398 unsigned int level; 409 unsigned int level;
399 410
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 449 * We are safe now. Check whether the new pgprot is the same:
439 */ 450 */
440 old_pte = *kpte; 451 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 452 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 453
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 454 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 455 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 456
446 /* 457 /*
447 * old_pte points to the large page base address. So we need 458 * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 461 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 462 cpa->pfn = pfn;
452 463
453 new_prot = static_protections(new_prot, address, pfn); 464 new_prot = static_protections(req_prot, address, pfn);
454 465
455 /* 466 /*
456 * We need to check the full range, whether 467 * We need to check the full range, whether
457 * static_protection() requires a different pgprot for one of 468 * static_protection() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 469 * the pages in the range we try to preserve:
459 */ 470 */
460 addr = address + PAGE_SIZE; 471 addr = address & pmask;
461 pfn++; 472 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 473 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 474 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 475
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 476 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 477 goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 494 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 495 * the number of pages in the large page.
485 */ 496 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 497 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 498 /*
488 * The address is aligned and the number of pages 499 * The address is aligned and the number of pages
489 * covers the full page. 500 * covers the full page.
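
The forbidden/required pattern static_protections() now uses is worth spelling out: bits a range must not have are cleared first, then bits it must have (here, RW for .data/.bss) are OR-ed back in, so a "required" bit always wins. A sketch with made-up flag values, not the kernel's _PAGE_* encodings:

#include <stdio.h>

#define PROT_RW 0x1
#define PROT_NX 0x2

unsigned long apply(unsigned long prot, unsigned long forbidden,
                    unsigned long required)
{
    prot &= ~forbidden;   /* strip what the range may not have */
    prot |= required;     /* force what the range must have */
    return prot;
}

int main(void)
{
    /* A .data page: NX set by policy elsewhere, RW must be kept. */
    printf("%#lx\n", apply(PROT_NX, 0, PROT_RW)); /* 0x3: NX|RW */
    return 0;
}
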
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..f16434568a51 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
92 /* mark this node as "seen" in node bitmap */ 92 /* mark this node as "seen" in node bitmap */
93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); 93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
94 94
95 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 96 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96 97
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", 98 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..171a0aacb99a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
134 } 134 }
135 135
136 apic_id = pa->apic_id; 136 apic_id = pa->apic_id;
137 if (apic_id >= MAX_LOCAL_APIC) {
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return;
140 }
137 apicid_to_node[apic_id] = node; 141 apicid_to_node[apic_id] = node;
138 node_set(node, cpu_nodes_parsed); 142 node_set(node, cpu_nodes_parsed);
139 acpi_numa = 1; 143 acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
168 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 172 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
169 else 173 else
170 apic_id = pa->apic_id; 174 apic_id = pa->apic_id;
175
176 if (apic_id >= MAX_LOCAL_APIC) {
177 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
178 return;
179 }
180
171 apicid_to_node[apic_id] = node; 181 apicid_to_node[apic_id] = node;
172 node_set(node, cpu_nodes_parsed); 182 node_set(node, cpu_nodes_parsed);
173 acpi_numa = 1; 183 acpi_numa = 1;
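Both SRAT parsers now validate apic_id before using it as an index into
apicid_to_node[], which is sized by MAX_LOCAL_APIC; without the check a
firmware-supplied id could write past the end of the array. A compilable
sketch of the guard (the table size here is an assumption, not the
kernel's actual value):

    #include <stdio.h>

    #define MAX_LOCAL_APIC 32768          /* assumed table size */

    static int apicid_to_node[MAX_LOCAL_APIC];

    static void record_affinity(unsigned int apic_id, int node)
    {
        if (apic_id >= MAX_LOCAL_APIC) {  /* would index past the table */
            printf("SRAT: apicid 0x%x too big, skipped\n", apic_id);
            return;
        }
        apicid_to_node[apic_id] = node;
    }

    int main(void)
    {
        record_affinity(70000, 1);        /* rejected */
        record_affinity(12, 1);           /* stored */
        return 0;
    }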
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad18..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
223 223
224static void __cpuinit calculate_tlb_offset(void) 224static void __cpuinit calculate_tlb_offset(void)
225{ 225{
226 int cpu, node, nr_node_vecs; 226 int cpu, node, nr_node_vecs, idx = 0;
227 /* 227 /*
228 * we are changing tlb_vector_offset for each CPU in runtime, but this 228 * we are changing tlb_vector_offset for each CPU in runtime, but this
229 * will not cause inconsistency, as the write is atomic under X86. we 229 * will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; 239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
240 240
241 for_each_online_node(node) { 241 for_each_online_node(node) {
242 int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * 242 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
243 nr_node_vecs; 243 nr_node_vecs;
244 int cpu_offset = 0; 244 int cpu_offset = 0;
245 for_each_cpu(cpu, cpumask_of_node(node)) { 245 for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void)
248 cpu_offset++; 248 cpu_offset++;
249 cpu_offset = cpu_offset % nr_node_vecs; 249 cpu_offset = cpu_offset % nr_node_vecs;
250 } 250 }
251 idx++;
251 } 252 }
252} 253}
253 254
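The tlb_vector_offset fix replaces the raw node id with a running count
of online nodes: node ids can be sparse, so (node % NUM_VECTORS) both
collides and can overrun the vector space, while a dense index spreads
the per-node groups evenly. A small demonstration with assumed values:

    #include <stdio.h>

    #define NUM_VECTORS 8

    int main(void)
    {
        int online_nodes[] = { 0, 4, 8, 12 };   /* sparse ids, assumed */
        int nr_nodes = 4, nr_node_vecs = NUM_VECTORS / nr_nodes;
        int idx = 0;

        for (int i = 0; i < nr_nodes; i++) {
            int node = online_nodes[i];
            /* old scheme: offsets 0, 8, 0, 8 - collisions and overflow */
            int by_id  = (node % NUM_VECTORS) * nr_node_vecs;
            /* new scheme: offsets 0, 2, 4, 6 - evenly spread */
            int by_idx = (idx++ % NUM_VECTORS) * nr_node_vecs;
            printf("node %2d: old %d, new %d\n", node, by_id, by_idx);
        }
        return 0;
    }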
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..72cbec14d783 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 0, 129 dump_trace(NULL, regs, (unsigned long *)stack,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 4e8baad36d37..358c8b9c96a7 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -732,6 +732,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
732 case 0x14: 732 case 0x14:
733 cpu_type = "x86-64/family14h"; 733 cpu_type = "x86-64/family14h";
734 break; 734 break;
735 case 0x15:
736 cpu_type = "x86-64/family15h";
737 break;
735 default: 738 default:
736 return -ENODEV; 739 return -ENODEV;
737 } 740 }
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71b5790..0636dd93cef8 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -58,9 +58,6 @@ static void timer_stop(void)
58 58
59int __init op_nmi_timer_init(struct oprofile_operations *ops) 59int __init op_nmi_timer_init(struct oprofile_operations *ops)
60{ 60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start; 61 ops->start = timer_start;
65 ops->stop = timer_stop; 62 ops->stop = timer_stop;
66 ops->cpu_type = "timer"; 63 ops->cpu_type = "timer";
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index a011bcc0f943..c3b8e24f2b16 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,11 +29,12 @@
29#include "op_x86_model.h" 29#include "op_x86_model.h"
30#include "op_counter.h" 30#include "op_counter.h"
31 31
32#define NUM_COUNTERS 4 32#define NUM_COUNTERS 4
33#define NUM_COUNTERS_F15H 6
33#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 34#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
34#define NUM_VIRT_COUNTERS 32 35#define NUM_VIRT_COUNTERS 32
35#else 36#else
36#define NUM_VIRT_COUNTERS NUM_COUNTERS 37#define NUM_VIRT_COUNTERS 0
37#endif 38#endif
38 39
39#define OP_EVENT_MASK 0x0FFF 40#define OP_EVENT_MASK 0x0FFF
@@ -41,7 +42,8 @@
41 42
42#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) 43#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
43 44
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 45static int num_counters;
46static unsigned long reset_value[OP_MAX_COUNTER];
45 47
46#define IBS_FETCH_SIZE 6 48#define IBS_FETCH_SIZE 6
47#define IBS_OP_SIZE 12 49#define IBS_OP_SIZE 12
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
387 int i; 389 int i;
388 390
389 /* enable active counters */ 391 /* enable active counters */
390 for (i = 0; i < NUM_COUNTERS; ++i) { 392 for (i = 0; i < num_counters; ++i) {
391 int virt = op_x86_phys_to_virt(i); 393 int virt = op_x86_phys_to_virt(i);
392 if (!reset_value[virt]) 394 if (!reset_value[virt])
393 continue; 395 continue;
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
406{ 408{
407 int i; 409 int i;
408 410
409 for (i = 0; i < NUM_COUNTERS; ++i) { 411 for (i = 0; i < num_counters; ++i) {
410 if (!msrs->counters[i].addr) 412 if (!msrs->counters[i].addr)
411 continue; 413 continue;
412 release_perfctr_nmi(MSR_K7_PERFCTR0 + i); 414 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
418{ 420{
419 int i; 421 int i;
420 422
421 for (i = 0; i < NUM_COUNTERS; i++) { 423 for (i = 0; i < num_counters; i++) {
422 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 424 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
423 goto fail; 425 goto fail;
424 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { 426 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
426 goto fail; 428 goto fail;
427 } 429 }
428 /* both registers must be reserved */ 430 /* both registers must be reserved */
429 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 431 if (num_counters == NUM_COUNTERS_F15H) {
430 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 432 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
433 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
434 } else {
435 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
436 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
437 }
431 continue; 438 continue;
432 fail: 439 fail:
433 if (!counter_config[i].enabled) 440 if (!counter_config[i].enabled)
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
447 int i; 454 int i;
448 455
449 /* setup reset_value */ 456 /* setup reset_value */
450 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 457 for (i = 0; i < OP_MAX_COUNTER; ++i) {
451 if (counter_config[i].enabled 458 if (counter_config[i].enabled
452 && msrs->counters[op_x86_virt_to_phys(i)].addr) 459 && msrs->counters[op_x86_virt_to_phys(i)].addr)
453 reset_value[i] = counter_config[i].count; 460 reset_value[i] = counter_config[i].count;
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
456 } 463 }
457 464
458 /* clear all counters */ 465 /* clear all counters */
459 for (i = 0; i < NUM_COUNTERS; ++i) { 466 for (i = 0; i < num_counters; ++i) {
460 if (!msrs->controls[i].addr) 467 if (!msrs->controls[i].addr)
461 continue; 468 continue;
462 rdmsrl(msrs->controls[i].addr, val); 469 rdmsrl(msrs->controls[i].addr, val);
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
472 } 479 }
473 480
474 /* enable active counters */ 481 /* enable active counters */
475 for (i = 0; i < NUM_COUNTERS; ++i) { 482 for (i = 0; i < num_counters; ++i) {
476 int virt = op_x86_phys_to_virt(i); 483 int virt = op_x86_phys_to_virt(i);
477 if (!reset_value[virt]) 484 if (!reset_value[virt])
478 continue; 485 continue;
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
503 u64 val; 510 u64 val;
504 int i; 511 int i;
505 512
506 for (i = 0; i < NUM_COUNTERS; ++i) { 513 for (i = 0; i < num_counters; ++i) {
507 int virt = op_x86_phys_to_virt(i); 514 int virt = op_x86_phys_to_virt(i);
508 if (!reset_value[virt]) 515 if (!reset_value[virt])
509 continue; 516 continue;
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
526 u64 val; 533 u64 val;
527 int i; 534 int i;
528 535
529 for (i = 0; i < NUM_COUNTERS; ++i) { 536 for (i = 0; i < num_counters; ++i) {
530 if (!reset_value[op_x86_phys_to_virt(i)]) 537 if (!reset_value[op_x86_phys_to_virt(i)])
531 continue; 538 continue;
532 rdmsrl(msrs->controls[i].addr, val); 539 rdmsrl(msrs->controls[i].addr, val);
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
546 * Subtle: stop on all counters to avoid race with setting our 553 * Subtle: stop on all counters to avoid race with setting our
547 * pm callback 554 * pm callback
548 */ 555 */
549 for (i = 0; i < NUM_COUNTERS; ++i) { 556 for (i = 0; i < num_counters; ++i) {
550 if (!reset_value[op_x86_phys_to_virt(i)]) 557 if (!reset_value[op_x86_phys_to_virt(i)])
551 continue; 558 continue;
552 rdmsrl(msrs->controls[i].addr, val); 559 rdmsrl(msrs->controls[i].addr, val);
@@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
603 ret = setup_ibs_ctl(i); 610 ret = setup_ibs_ctl(i);
604 if (ret) 611 if (ret)
605 return ret; 612 return ret;
613 pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
606 return 0; 614 return 0;
607 } 615 }
608 616
@@ -630,21 +638,29 @@ static int __init_ibs_nmi(void)
630 return 0; 638 return 0;
631} 639}
632 640
633/* initialize the APIC for the IBS interrupts if available */ 641/*
642 * check and reserve APIC extended interrupt LVT offset for IBS if
643 * available
644 *
 645 * init_ibs() performs implicitly cpu-local operations, so pin this
646 * thread to its current CPU
647 */
648
634static void init_ibs(void) 649static void init_ibs(void)
635{ 650{
636 ibs_caps = get_ibs_caps(); 651 preempt_disable();
637 652
653 ibs_caps = get_ibs_caps();
638 if (!ibs_caps) 654 if (!ibs_caps)
639 return; 655 goto out;
640 656
641 if (__init_ibs_nmi()) { 657 if (__init_ibs_nmi() < 0)
642 ibs_caps = 0; 658 ibs_caps = 0;
643 return; 659 else
644 } 660 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
645 661
646 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", 662out:
647 (unsigned)ibs_caps); 663 preempt_enable();
648} 664}
649 665
650static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 666static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
698 return 0; 714 return 0;
699} 715}
700 716
717struct op_x86_model_spec op_amd_spec;
718
701static int op_amd_init(struct oprofile_operations *ops) 719static int op_amd_init(struct oprofile_operations *ops)
702{ 720{
703 init_ibs(); 721 init_ibs();
704 create_arch_files = ops->create_files; 722 create_arch_files = ops->create_files;
705 ops->create_files = setup_ibs_files; 723 ops->create_files = setup_ibs_files;
724
725 if (boot_cpu_data.x86 == 0x15) {
726 num_counters = NUM_COUNTERS_F15H;
727 } else {
728 num_counters = NUM_COUNTERS;
729 }
730
731 op_amd_spec.num_counters = num_counters;
732 op_amd_spec.num_controls = num_counters;
733 op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
734
706 return 0; 735 return 0;
707} 736}
708 737
709struct op_x86_model_spec op_amd_spec = { 738struct op_x86_model_spec op_amd_spec = {
710 .num_counters = NUM_COUNTERS, 739 /* num_counters/num_controls filled in at runtime */
711 .num_controls = NUM_COUNTERS,
712 .num_virt_counters = NUM_VIRT_COUNTERS,
713 .reserved = MSR_AMD_EVENTSEL_RESERVED, 740 .reserved = MSR_AMD_EVENTSEL_RESERVED,
714 .event_mask = OP_EVENT_MASK, 741 .event_mask = OP_EVENT_MASK,
715 .init = op_amd_init, 742 .init = op_amd_init,
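On family 15h the counter and control registers moved to new MSR
addresses where each CTL/CTR pair is interleaved, hence the (i << 1)
stride when num_counters == NUM_COUNTERS_F15H. A sketch of the address
selection (the MSR constants are the usual x86 values, assumed here for
illustration rather than taken from this diff):

    #include <stdio.h>

    #define MSR_K7_EVNTSEL0   0xc0010000u
    #define MSR_K7_PERFCTR0   0xc0010004u
    #define MSR_F15H_PERF_CTL 0xc0010200u
    #define MSR_F15H_PERF_CTR 0xc0010201u

    static unsigned int ctr_addr(int i, int f15h)
    {
        /* family 15h: CTL/CTR pairs, so consecutive counters step by 2 */
        return f15h ? MSR_F15H_PERF_CTR + (i << 1) : MSR_K7_PERFCTR0 + i;
    }

    static unsigned int ctl_addr(int i, int f15h)
    {
        return f15h ? MSR_F15H_PERF_CTL + (i << 1) : MSR_K7_EVNTSEL0 + i;
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++)
            printf("ctr%d: legacy %#x/%#x, f15h %#x/%#x\n", i,
                   ctl_addr(i, 0), ctr_addr(i, 0),
                   ctl_addr(i, 1), ctr_addr(i, 1));
        return 0;
    }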
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 182558dd5515..9fadec074142 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -11,7 +11,7 @@
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/nmi.h> 14#include <asm/nmi.h>
15#include <asm/msr.h> 15#include <asm/msr.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/apic.h> 17#include <asm/apic.h>
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index c4bb261c106e..b1805b78842f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,21 +65,13 @@ pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = round_down(res->end - size + 1, align); 68 resource_size_t start = res->start;
69 69
70 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
71 71 if (skip_isa_ioresource_align(dev))
72 /* 72 return start;
73 * If we're avoiding ISA aliases, the largest contiguous I/O 73 if (start & 0x300)
74 * port space is 256 bytes. Clearing bits 9 and 10 preserves 74 start = (start + 0x3ff) & ~0x3ff;
75 * all 256-byte and smaller alignments, so the result will
76 * still be correctly aligned.
77 */
78 if (!skip_isa_ioresource_align(dev))
79 start &= ~0x300;
80 } else if (res->flags & IORESOURCE_MEM) {
81 if (start < BIOS_END)
82 start = res->end; /* fail; no space */
83 } 75 }
84 return start; 76 return start;
85} 77}
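The replacement logic returns to bottom-up allocation: start from the
resource's own start and, for I/O ports, step past any candidate with
the 0x300 bits set, since ISA aliases repeat the 0x100-0x3ff range in
every 1 KiB window. A standalone sketch of that adjustment:

    #include <stdio.h>

    static unsigned long avoid_isa_aliases(unsigned long start)
    {
        /* if the candidate lands in an aliased window, bump it to the
         * next 1 KiB boundary */
        if (start & 0x300)
            start = (start + 0x3ff) & ~0x3ffUL;
        return start;
    }

    int main(void)
    {
        printf("%#lx\n", avoid_isa_aliases(0x1200)); /* -> 0x1400 */
        printf("%#lx\n", avoid_isa_aliases(0x1000)); /* unchanged */
        return 0;
    }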
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11#include <asm/pci-functions.h> 11#include <asm/pci-functions.h>
12#include <asm/cacheflush.h>
12 13
13/* BIOS32 signature: "_32_" */ 14/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 15#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
25#define PCIBIOS_HW_TYPE1_SPEC 0x10 26#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20 27#define PCIBIOS_HW_TYPE2_SPEC 0x20
27 28
29int pcibios_enabled;
30
31/* According to the BIOS specification at:
32 * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
33 * restrict the x zone to some pages and make it ro. But this may be
 34 * broken on some BIOSes, and complex to handle with static_protections.
35 * We could make the 0xe0000-0x100000 range rox, but this can break
36 * some ISA mapping.
37 *
 38 * So we leave an rw and x hole when pcibios is used. This shouldn't
 39 * happen on modern systems with mmconfig, and if you don't want it
 40 * you can disable pcibios...
41 */
42static inline void set_bios_x(void)
43{
44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX)
 47 printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48}
49
28/* 50/*
29 * This is the standard structure used to identify the entry point 51 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in 52 * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
332 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", 354 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
333 bios32_entry); 355 bios32_entry);
334 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 356 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
357 set_bios_x();
335 if (check_pcibios()) 358 if (check_pcibios())
336 return &pci_bios_access; 359 return &pci_bios_access;
337 } 360 }
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d7b5109f7a9c..25cd4a07d09f 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -70,6 +70,9 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
70struct xen_pci_frontend_ops *xen_pci_frontend; 70struct xen_pci_frontend_ops *xen_pci_frontend;
71EXPORT_SYMBOL_GPL(xen_pci_frontend); 71EXPORT_SYMBOL_GPL(xen_pci_frontend);
72 72
73#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
74 MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
75
73static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, 76static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
74 struct msi_msg *msg) 77 struct msi_msg *msg)
75{ 78{
@@ -83,12 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
83 MSI_ADDR_REDIRECTION_CPU | 86 MSI_ADDR_REDIRECTION_CPU |
84 MSI_ADDR_DEST_ID(pirq); 87 MSI_ADDR_DEST_ID(pirq);
85 88
86 msg->data = 89 msg->data = XEN_PIRQ_MSI_DATA;
87 MSI_DATA_TRIGGER_EDGE |
88 MSI_DATA_LEVEL_ASSERT |
89 /* delivery mode reserved */
90 (3 << 8) |
91 MSI_DATA_VECTOR(0);
92} 90}
93 91
94static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -98,8 +96,23 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
98 struct msi_msg msg; 96 struct msi_msg msg;
99 97
100 list_for_each_entry(msidesc, &dev->msi_list, list) { 98 list_for_each_entry(msidesc, &dev->msi_list, list) {
99 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
105 if (irq < 0)
106 goto error;
107 ret = set_irq_msi(irq, msidesc);
108 if (ret < 0)
109 goto error_while;
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
111 " pirq=%d\n", irq, pirq);
112 return 0;
113 }
101 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
102 "msi-x" : "msi", &irq, &pirq); 115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
103 if (irq < 0 || pirq < 0) 116 if (irq < 0 || pirq < 0)
104 goto error; 117 goto error;
105 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
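The added fast path recognizes an MSI message that Xen has already
composed: the pirq was encoded into the destination-ID bits, so reading
the message back recovers it. A sketch of the round trip (the macro
values follow the usual x86 msidef.h definitions and are assumptions
here):

    #include <stdio.h>
    #include <stdint.h>

    #define MSI_ADDR_DEST_ID_SHIFT   12
    #define MSI_ADDR_DEST_ID(id)     (((id) & 0xff) << MSI_ADDR_DEST_ID_SHIFT)
    #define MSI_ADDR_EXT_DEST_ID(id) ((id) & 0xffffff00)

    int main(void)
    {
        unsigned int pirq = 0x1a3;

        /* compose: low 8 bits into address_lo, the rest into address_hi */
        uint32_t lo = MSI_ADDR_DEST_ID(pirq);
        uint32_t hi = MSI_ADDR_EXT_DEST_ID(pirq);

        /* decode, as the new code above does */
        unsigned int back = MSI_ADDR_EXT_DEST_ID(hi) |
                            ((lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
        printf("pirq %#x -> %#x\n", pirq, back);
        return 0;
    }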
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index cc822a29b227..7785b72ecc3a 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -37,9 +37,9 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
37/* All CPUs enumerated by SFI must be present and enabled */ 37/* All CPUs enumerated by SFI must be present and enabled */
38static void __cpuinit mp_sfi_register_lapic(u8 id) 38static void __cpuinit mp_sfi_register_lapic(u8 id)
39{ 39{
40 if (MAX_APICS - id <= 0) { 40 if (MAX_LOCAL_APIC - id <= 0) {
41 pr_warning("Processor #%d invalid (max %d)\n", 41 pr_warning("Processor #%d invalid (max %d)\n",
42 id, MAX_APICS); 42 id, MAX_LOCAL_APIC);
43 return; 43 return;
44 } 44 }
45 45
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a318194002b5..df58e9cad96a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode)
1341 1341
1342 /* 1342 /*
1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1344 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub 1344 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
1345 */ 1345 */
1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE 1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
@@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1455 * the below initialization can't be in firmware because the 1455 * the below initialization can't be in firmware because the
1456 * messaging IRQ will be determined by the OS 1456 * messaging IRQ will be determined by the OS
1457 */ 1457 */
1458 apicid = uvhub_to_first_apicid(uvhub); 1458 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1459 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1459 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1460 ((apicid << 32) | vector)); 1460 ((apicid << 32) | vector));
1461} 1461}
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
1490/* 1490/*
1491 * initialize the bau_control structure for each cpu 1491 * initialize the bau_control structure for each cpu
1492 */ 1492 */
1493static void __init uv_init_per_cpu(int nuvhubs) 1493static int __init uv_init_per_cpu(int nuvhubs)
1494{ 1494{
1495 int i; 1495 int i;
1496 int cpu; 1496 int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
1507 struct bau_control *smaster = NULL; 1507 struct bau_control *smaster = NULL;
1508 struct socket_desc { 1508 struct socket_desc {
1509 short num_cpus; 1509 short num_cpus;
1510 short cpu_number[16]; 1510 short cpu_number[MAX_CPUS_PER_SOCKET];
1511 }; 1511 };
1512 struct uvhub_desc { 1512 struct uvhub_desc {
1513 unsigned short socket_mask; 1513 unsigned short socket_mask;
@@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
1540 sdp = &bdp->socket[socket]; 1540 sdp = &bdp->socket[socket];
1541 sdp->cpu_number[sdp->num_cpus] = cpu; 1541 sdp->cpu_number[sdp->num_cpus] = cpu;
1542 sdp->num_cpus++; 1542 sdp->num_cpus++;
1543 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1544 printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
1545 return 1;
1546 }
1543 } 1547 }
1544 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1548 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1545 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1549 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
1570 bcp->uvhub_master = hmaster; 1574 bcp->uvhub_master = hmaster;
1571 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> 1575 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1572 blade_processor_id; 1576 blade_processor_id;
1577 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1578 printk(KERN_EMERG
1579 "%d cpus per uvhub invalid\n",
1580 bcp->uvhub_cpu);
1581 return 1;
1582 }
1573 } 1583 }
1574nextsocket: 1584nextsocket:
1575 socket++; 1585 socket++;
@@ -1595,6 +1605,7 @@ nextsocket:
1595 bcp->congested_reps = congested_reps; 1605 bcp->congested_reps = congested_reps;
1596 bcp->congested_period = congested_period; 1606 bcp->congested_period = congested_period;
1597 } 1607 }
1608 return 0;
1598} 1609}
1599 1610
1600/* 1611/*
@@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void)
1625 spin_lock_init(&disable_lock); 1636 spin_lock_init(&disable_lock);
1626 congested_cycles = microsec_2_cycles(congested_response_us); 1637 congested_cycles = microsec_2_cycles(congested_response_us);
1627 1638
1628 uv_init_per_cpu(nuvhubs); 1639 if (uv_init_per_cpu(nuvhubs)) {
1640 nobau = 1;
1641 return 0;
1642 }
1629 1643
1630 uv_partition_base_pnode = 0x7fffffff; 1644 uv_partition_base_pnode = 0x7fffffff;
1631 for (uvhub = 0; uvhub < nuvhubs; uvhub++) 1645 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..9daf5d1af9f1 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu)
89 89
90 apicid = cpu_physical_id(cpu); 90 apicid = cpu_physical_id(cpu);
91 pnode = uv_apicid_to_pnode(apicid); 91 pnode = uv_apicid_to_pnode(apicid);
92 apicid |= uv_apicid_hibits;
92 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 93 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 94 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 95 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
@@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode)
107static int uv_setup_intr(int cpu, u64 expires) 108static int uv_setup_intr(int cpu, u64 expires)
108{ 109{
109 u64 val; 110 u64 val;
111 unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
110 int pnode = uv_cpu_to_pnode(cpu); 112 int pnode = uv_cpu_to_pnode(cpu);
111 113
112 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, 114 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
@@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires)
117 UVH_EVENT_OCCURRED0_RTC1_MASK); 119 UVH_EVENT_OCCURRED0_RTC1_MASK);
118 120
119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 121 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 122 ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
121 123
122 /* Set configuration */ 124 /* Set configuration */
123 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); 125 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..632037671746 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
171 ver = m->apicver; 171 ver = m->apicver;
172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { 172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
174 m->apicid, MAX_APICS); 174 m->apicid, MAX_LOCAL_APIC);
175 return; 175 return;
176 } 176 }
177 177
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4a2afa1bac51..b6552b189bcd 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
25 25
26export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
27 27
28VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
30 30
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter
69vdso32-images = $(vdso32.so-y:%=vdso32-%.so) 69vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
70 70
71CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) 71CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
72VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 72VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
73 73
74# This makes sure the $(obj) subdirectory exists even though vdso32/ 74# This makes sure the $(obj) subdirectory exists even though vdso32/
75# is not a kbuild sub-make subdirectory. 75# is not a kbuild sub-make subdirectory.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 235c0f4d3861..44dcad43989d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
75enum xen_domain_type xen_domain_type = XEN_NATIVE; 75enum xen_domain_type xen_domain_type = XEN_NATIVE;
76EXPORT_SYMBOL_GPL(xen_domain_type); 76EXPORT_SYMBOL_GPL(xen_domain_type);
77 77
78unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
79EXPORT_SYMBOL(machine_to_phys_mapping);
80unsigned int machine_to_phys_order;
81EXPORT_SYMBOL(machine_to_phys_order);
82
78struct start_info *xen_start_info; 83struct start_info *xen_start_info;
79EXPORT_SYMBOL_GPL(xen_start_info); 84EXPORT_SYMBOL_GPL(xen_start_info);
80 85
@@ -1016,10 +1021,6 @@ static void xen_reboot(int reason)
1016{ 1021{
1017 struct sched_shutdown r = { .reason = reason }; 1022 struct sched_shutdown r = { .reason = reason };
1018 1023
1019#ifdef CONFIG_SMP
1020 stop_other_cpus();
1021#endif
1022
1023 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1024 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1024 BUG(); 1025 BUG();
1025} 1026}
@@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
1090/* First C function to be called on Xen boot */ 1091/* First C function to be called on Xen boot */
1091asmlinkage void __init xen_start_kernel(void) 1092asmlinkage void __init xen_start_kernel(void)
1092{ 1093{
1094 struct physdev_set_iopl set_iopl;
1095 int rc;
1093 pgd_t *pgd; 1096 pgd_t *pgd;
1094 1097
1095 if (!xen_start_info) 1098 if (!xen_start_info)
@@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
1097 1100
1098 xen_domain_type = XEN_PV_DOMAIN; 1101 xen_domain_type = XEN_PV_DOMAIN;
1099 1102
1103 xen_setup_machphys_mapping();
1104
1100 /* Install Xen paravirt ops */ 1105 /* Install Xen paravirt ops */
1101 pv_info = xen_info; 1106 pv_info = xen_info;
1102 pv_init_ops = xen_init_ops; 1107 pv_init_ops = xen_init_ops;
@@ -1191,8 +1196,6 @@ asmlinkage void __init xen_start_kernel(void)
1191 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1196 /* Allocate and initialize top and mid mfn levels for p2m structure */
1192 xen_build_mfn_list_list(); 1197 xen_build_mfn_list_list();
1193 1198
1194 init_mm.pgd = pgd;
1195
1196 /* keep using Xen gdt for now; no urgent need to change it */ 1199 /* keep using Xen gdt for now; no urgent need to change it */
1197 1200
1198#ifdef CONFIG_X86_32 1201#ifdef CONFIG_X86_32
@@ -1202,10 +1205,18 @@ asmlinkage void __init xen_start_kernel(void)
1202#else 1205#else
1203 pv_info.kernel_rpl = 0; 1206 pv_info.kernel_rpl = 0;
1204#endif 1207#endif
1205
1206 /* set the limit of our address space */ 1208 /* set the limit of our address space */
1207 xen_reserve_top(); 1209 xen_reserve_top();
1208 1210
 1211 /* We used to do this in xen_arch_setup, but that is too late on AMD
 1212 * where early_cpu_init (run before ->arch_setup()) calls early_amd_init
 1213 * which pokes the 0xcf8 port.
1214 */
1215 set_iopl.iopl = 1;
1216 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1217 if (rc != 0)
1218 xen_raw_printk("physdev_op failed %d\n", rc);
1219
1209#ifdef CONFIG_X86_32 1220#ifdef CONFIG_X86_32
1210 /* set up basic CPUID stuff */ 1221 /* set up basic CPUID stuff */
1211 cpu_detect(&new_cpu_data); 1222 cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 21ed8d7f75a5..44924e551fde 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2034,6 +2034,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
2034 set_page_prot(pmd, PAGE_KERNEL_RO); 2034 set_page_prot(pmd, PAGE_KERNEL_RO);
2035} 2035}
2036 2036
2037void __init xen_setup_machphys_mapping(void)
2038{
2039 struct xen_machphys_mapping mapping;
2040 unsigned long machine_to_phys_nr_ents;
2041
2042 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
2043 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
2044 machine_to_phys_nr_ents = mapping.max_mfn + 1;
2045 } else {
2046 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
2047 }
2048 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
2049}
2050
2037#ifdef CONFIG_X86_64 2051#ifdef CONFIG_X86_64
2038static void convert_pfn_mfn(void *v) 2052static void convert_pfn_mfn(void *v)
2039{ 2053{
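machine_to_phys_order is derived with fls(n - 1), i.e. the number of
bits needed to index n table entries (ceil(log2(n))). A quick check of
that identity with a portable fls stand-in:

    #include <stdio.h>

    static int fls_(unsigned long x)    /* 1-based index of last set bit */
    {
        int r = 0;
        while (x) { r++; x >>= 1; }
        return r;
    }

    int main(void)
    {
        unsigned long nr_ents = 1UL << 20;
        printf("order = %d\n", fls_(nr_ents - 1));  /* 20 */
        printf("order = %d\n", fls_(1000 - 1));     /* 10: 2^10 covers 1000 */
        return 0;
    }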
@@ -2119,44 +2133,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2119 return pgd; 2133 return pgd;
2120} 2134}
2121#else /* !CONFIG_X86_64 */ 2135#else /* !CONFIG_X86_64 */
2122static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); 2136static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2137static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2138
2139static __init void xen_write_cr3_init(unsigned long cr3)
2140{
2141 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2142
2143 BUG_ON(read_cr3() != __pa(initial_page_table));
2144 BUG_ON(cr3 != __pa(swapper_pg_dir));
2145
2146 /*
2147 * We are switching to swapper_pg_dir for the first time (from
2148 * initial_page_table) and therefore need to mark that page
2149 * read-only and then pin it.
2150 *
2151 * Xen disallows sharing of kernel PMDs for PAE
2152 * guests. Therefore we must copy the kernel PMD from
2153 * initial_page_table into a new kernel PMD to be used in
2154 * swapper_pg_dir.
2155 */
2156 swapper_kernel_pmd =
2157 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2158 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
2159 sizeof(pmd_t) * PTRS_PER_PMD);
2160 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2161 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2162 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2163
2164 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2165 xen_write_cr3(cr3);
2166 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2167
2168 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2169 PFN_DOWN(__pa(initial_page_table)));
2170 set_page_prot(initial_page_table, PAGE_KERNEL);
2171 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2172
2173 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2174}
2123 2175
2124__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 2176__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2125 unsigned long max_pfn) 2177 unsigned long max_pfn)
2126{ 2178{
2127 pmd_t *kernel_pmd; 2179 pmd_t *kernel_pmd;
2128 2180
2129 level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 2181 initial_kernel_pmd =
2182 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2130 2183
2131 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 2184 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
2132 xen_start_info->nr_pt_frames * PAGE_SIZE + 2185 xen_start_info->nr_pt_frames * PAGE_SIZE +
2133 512*1024); 2186 512*1024);
2134 2187
2135 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 2188 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2136 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 2189 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
2137 2190
2138 xen_map_identity_early(level2_kernel_pgt, max_pfn); 2191 xen_map_identity_early(initial_kernel_pmd, max_pfn);
2139 2192
2140 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); 2193 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
2141 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], 2194 initial_page_table[KERNEL_PGD_BOUNDARY] =
2142 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); 2195 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2143 2196
2144 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 2197 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2145 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 2198 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2146 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 2199 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2147 2200
2148 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 2201 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2149 2202
2150 xen_write_cr3(__pa(swapper_pg_dir)); 2203 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2151 2204 PFN_DOWN(__pa(initial_page_table)));
2152 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 2205 xen_write_cr3(__pa(initial_page_table));
2153 2206
2154 memblock_x86_reserve_range(__pa(xen_start_info->pt_base), 2207 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
2155 __pa(xen_start_info->pt_base + 2208 __pa(xen_start_info->pt_base +
2156 xen_start_info->nr_pt_frames * PAGE_SIZE), 2209 xen_start_info->nr_pt_frames * PAGE_SIZE),
2157 "XEN PAGETABLES"); 2210 "XEN PAGETABLES");
2158 2211
2159 return swapper_pg_dir; 2212 return initial_page_table;
2160} 2213}
2161#endif /* CONFIG_X86_64 */ 2214#endif /* CONFIG_X86_64 */
2162 2215
@@ -2290,7 +2343,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2290 .write_cr2 = xen_write_cr2, 2343 .write_cr2 = xen_write_cr2,
2291 2344
2292 .read_cr3 = xen_read_cr3, 2345 .read_cr3 = xen_read_cr3,
2346#ifdef CONFIG_X86_32
2347 .write_cr3 = xen_write_cr3_init,
2348#else
2293 .write_cr3 = xen_write_cr3, 2349 .write_cr3 = xen_write_cr3,
2350#endif
2294 2351
2295 .flush_tlb_user = xen_flush_tlb, 2352 .flush_tlb_user = xen_flush_tlb,
2296 .flush_tlb_kernel = xen_flush_tlb, 2353 .flush_tlb_kernel = xen_flush_tlb,
@@ -2358,8 +2415,6 @@ void __init xen_init_mmu_ops(void)
2358 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2415 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2359 pv_mmu_ops = xen_mmu_ops; 2416 pv_mmu_ops = xen_mmu_ops;
2360 2417
2361 vmap_lazy_unmap = false;
2362
2363 memset(dummy_mapping, 0xff, PAGE_SIZE); 2418 memset(dummy_mapping, 0xff, PAGE_SIZE);
2364} 2419}
2365 2420
@@ -2627,7 +2682,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2627 2682
2628 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); 2683 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2629 2684
2630 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 2685 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2686 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2631 2687
2632 rmd.mfn = mfn; 2688 rmd.mfn = mfn;
2633 rmd.prot = prot; 2689 rmd.prot = prot;
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce5..25c52f94a27c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
68 return 0; 68 return 0;
69} 69}
70 70
71void __init xen_unplug_emulated_devices(void) 71void xen_unplug_emulated_devices(void)
72{ 72{
73 int r; 73 int r;
74 74
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 769c4b01fa32..b5a7f928234b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -23,7 +23,6 @@
23#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
24#include <xen/interface/memory.h> 24#include <xen/interface/memory.h>
25#include <xen/interface/physdev.h> 25#include <xen/interface/physdev.h>
26#include <xen/interface/memory.h>
27#include <xen/features.h> 26#include <xen/features.h>
28 27
29#include "xen-ops.h" 28#include "xen-ops.h"
@@ -182,24 +181,21 @@ char * __init xen_memory_setup(void)
182 for (i = 0; i < memmap.nr_entries; i++) { 181 for (i = 0; i < memmap.nr_entries; i++) {
183 unsigned long long end = map[i].addr + map[i].size; 182 unsigned long long end = map[i].addr + map[i].size;
184 183
185 if (map[i].type == E820_RAM) { 184 if (map[i].type == E820_RAM && end > mem_end) {
186 if (map[i].addr < mem_end && end > mem_end) { 185 /* RAM off the end - may be partially included */
187 /* Truncate region to max_mem. */ 186 u64 delta = min(map[i].size, end - mem_end);
188 u64 delta = end - mem_end;
189 187
190 map[i].size -= delta; 188 map[i].size -= delta;
191 extra_pages += PFN_DOWN(delta); 189 end -= delta;
192 190
193 end = mem_end; 191 extra_pages += PFN_DOWN(delta);
194 }
195 } 192 }
196 193
197 if (end > xen_extra_mem_start) 194 if (map[i].size > 0 && end > xen_extra_mem_start)
198 xen_extra_mem_start = end; 195 xen_extra_mem_start = end;
199 196
200 /* If region is non-RAM or below mem_end, add what remains */ 197 /* Add region if any remains */
201 if ((map[i].type != E820_RAM || map[i].addr < mem_end) && 198 if (map[i].size > 0)
202 map[i].size > 0)
203 e820_add_region(map[i].addr, map[i].size, map[i].type); 199 e820_add_region(map[i].addr, map[i].size, map[i].type);
204 } 200 }
205 201
@@ -248,26 +244,11 @@ char * __init xen_memory_setup(void)
248 else 244 else
249 extra_pages = 0; 245 extra_pages = 0;
250 246
251 if (!xen_initial_domain()) 247 xen_add_extra_mem(extra_pages);
252 xen_add_extra_mem(extra_pages);
253 248
254 return "Xen"; 249 return "Xen";
255} 250}
256 251
257static void xen_idle(void)
258{
259 local_irq_disable();
260
261 if (need_resched())
262 local_irq_enable();
263 else {
264 current_thread_info()->status &= ~TS_POLLING;
265 smp_mb__after_clear_bit();
266 safe_halt();
267 current_thread_info()->status |= TS_POLLING;
268 }
269}
270
271/* 252/*
272 * Set the bit indicating "nosegneg" library variants should be used. 253 * Set the bit indicating "nosegneg" library variants should be used.
273 * We only need to bother in pure 32-bit mode; compat 32-bit processes 254 * We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -337,9 +318,6 @@ void __cpuinit xen_enable_syscall(void)
337 318
338void __init xen_arch_setup(void) 319void __init xen_arch_setup(void)
339{ 320{
340 struct physdev_set_iopl set_iopl;
341 int rc;
342
343 xen_panic_handler_init(); 321 xen_panic_handler_init();
344 322
345 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 323 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -356,11 +334,6 @@ void __init xen_arch_setup(void)
356 xen_enable_sysenter(); 334 xen_enable_sysenter();
357 xen_enable_syscall(); 335 xen_enable_syscall();
358 336
359 set_iopl.iopl = 1;
360 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
361 if (rc != 0)
362 printk(KERN_INFO "physdev_op failed %d\n", rc);
363
364#ifdef CONFIG_ACPI 337#ifdef CONFIG_ACPI
365 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 338 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
366 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 339 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -372,7 +345,11 @@ void __init xen_arch_setup(void)
372 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 345 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
373 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); 346 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
374 347
375 pm_idle = xen_idle; 348 /* Set up idle, making sure it calls safe_halt() pvop */
349#ifdef CONFIG_X86_32
350 boot_cpu_data.hlt_works_ok = 1;
351#endif
352 pm_idle = default_idle;
376 353
377 fiddle_vdso(); 354 fiddle_vdso();
378} 355}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877c..9bbd63a129b5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
31 int cpu; 31 int cpu;
32 xen_hvm_init_shared_info(); 32 xen_hvm_init_shared_info();
33 xen_callback_vector(); 33 xen_callback_vector();
34 xen_unplug_emulated_devices();
34 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 35 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
35 for_each_online_cpu(cpu) { 36 for_each_online_cpu(cpu) {
36 xen_setup_runstate_info(cpu); 37 xen_setup_runstate_info(cpu);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b054..5da5e53fb94c 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -426,6 +426,8 @@ void xen_timer_resume(void)
426{ 426{
427 int cpu; 427 int cpu;
428 428
429 pvclock_resume();
430
429 if (xen_clockevent != &xen_vcpuop_clockevent) 431 if (xen_clockevent != &xen_vcpuop_clockevent)
430 return; 432 return;
431 433
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 64044747348e..9d41bf985757 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -43,7 +43,7 @@ void xen_vcpu_restore(void);
43 43
44void xen_callback_vector(void); 44void xen_callback_vector(void);
45void xen_hvm_init_shared_info(void); 45void xen_hvm_init_shared_info(void);
46void __init xen_unplug_emulated_devices(void); 46void xen_unplug_emulated_devices(void);
47 47
48void __init xen_build_dynamic_phys_to_machine(void); 48void __init xen_build_dynamic_phys_to_machine(void);
49 49