aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorRussell King <rmk+kernel@arm.linux.org.uk>2009-09-22 15:54:53 -0400
committerRussell King <rmk+kernel@arm.linux.org.uk>2009-09-22 16:01:40 -0400
commitae19ffbadc1b2100285a5b5b3d0a4e0a11390904 (patch)
tree3c2086ab67398a019089a47ca3f362a4bc6db74f /arch/x86/kernel
parent34e84f39a27d059a3e6ec6e8b94aafa702e6f220 (diff)
parent9173a8ef24a6b1b8031507b35b8ffe5f85a87692 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c88
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile5
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c116
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c158
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c312
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c10
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c163
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c13
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c46
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c14
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/pci-dma.c4
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c7
-rw-r--r--arch/x86/kernel/setup.c23
-rw-r--r--arch/x86/kernel/setup_percpu.c364
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smpboot.c16
-rw-r--r--arch/x86/kernel/tboot.c447
-rw-r--r--arch/x86/kernel/vmlinux.lds.S11
34 files changed, 1004 insertions, 1135 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7b..832cb838cb48 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 52obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 53obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 54obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o
55obj-$(CONFIG_STACKTRACE) += stacktrace.o 56obj-$(CONFIG_STACKTRACE) += stacktrace.o
56obj-y += cpu/ 57obj-y += cpu/
57obj-y += acpi/ 58obj-y += acpi/
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index db7220220d09..cb66a22d98ad 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_NEW_MCE) 69#if defined(CONFIG_X86_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..8dd30638fe44 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
13 13
14obj-y := intel_cacheinfo.o addon_cpuid_features.o 14obj-y := intel_cacheinfo.o addon_cpuid_features.o
15obj-y += proc.o capflags.o powerflags.o common.o 15obj-y += proc.o capflags.o powerflags.o common.o
16obj-y += vmware.o hypervisor.o 16obj-y += vmware.o hypervisor.o sched.o
17 17
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 19obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22a47c82f3c0..f32fa71ccf97 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
333#endif 333#endif
334} 334}
335 335
336int amd_get_nb_id(int cpu)
337{
338 int id = 0;
339#ifdef CONFIG_SMP
340 id = per_cpu(cpu_llc_id, cpu);
341#endif
342 return id;
343}
344EXPORT_SYMBOL_GPL(amd_get_nb_id);
345
336static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 346static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
337{ 347{
338#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 348#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..4109679863c1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -60,7 +60,6 @@ enum {
60}; 60};
61 61
62#define INTEL_MSR_RANGE (0xffff) 62#define INTEL_MSR_RANGE (0xffff)
63#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
64 63
65struct acpi_cpufreq_data { 64struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data; 65 struct acpi_processor_performance *acpi_data;
@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
71 70
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
73 72
74struct acpi_msr_data { 73static DEFINE_PER_CPU(struct aperfmperf, old_perf);
75 u64 saved_aperf, saved_mperf;
76};
77
78static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
79 74
80DEFINE_TRACE(power_mark); 75DEFINE_TRACE(power_mark);
81 76
@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
244 return cmd.val; 239 return cmd.val;
245} 240}
246 241
247struct perf_pair {
248 union {
249 struct {
250 u32 lo;
251 u32 hi;
252 } split;
253 u64 whole;
254 } aperf, mperf;
255};
256
257/* Called via smp_call_function_single(), on the target CPU */ 242/* Called via smp_call_function_single(), on the target CPU */
258static void read_measured_perf_ctrs(void *_cur) 243static void read_measured_perf_ctrs(void *_cur)
259{ 244{
260 struct perf_pair *cur = _cur; 245 struct aperfmperf *am = _cur;
261 246
262 rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); 247 get_aperfmperf(am);
263 rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
264} 248}
265 249
266/* 250/*
@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
279static unsigned int get_measured_perf(struct cpufreq_policy *policy, 263static unsigned int get_measured_perf(struct cpufreq_policy *policy,
280 unsigned int cpu) 264 unsigned int cpu)
281{ 265{
282 struct perf_pair readin, cur; 266 struct aperfmperf perf;
283 unsigned int perf_percent; 267 unsigned long ratio;
284 unsigned int retval; 268 unsigned int retval;
285 269
286 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) 270 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
287 return 0; 271 return 0;
288 272
289 cur.aperf.whole = readin.aperf.whole - 273 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
290 per_cpu(msr_data, cpu).saved_aperf; 274 per_cpu(old_perf, cpu) = perf;
291 cur.mperf.whole = readin.mperf.whole -
292 per_cpu(msr_data, cpu).saved_mperf;
293 per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
294 per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
295
296#ifdef __i386__
297 /*
298 * We dont want to do 64 bit divide with 32 bit kernel
299 * Get an approximate value. Return failure in case we cannot get
300 * an approximate value.
301 */
302 if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
303 int shift_count;
304 u32 h;
305
306 h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
307 shift_count = fls(h);
308
309 cur.aperf.whole >>= shift_count;
310 cur.mperf.whole >>= shift_count;
311 }
312
313 if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
314 int shift_count = 7;
315 cur.aperf.split.lo >>= shift_count;
316 cur.mperf.split.lo >>= shift_count;
317 }
318
319 if (cur.aperf.split.lo && cur.mperf.split.lo)
320 perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
321 else
322 perf_percent = 0;
323 275
324#else 276 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
325 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
326 int shift_count = 7;
327 cur.aperf.whole >>= shift_count;
328 cur.mperf.whole >>= shift_count;
329 }
330
331 if (cur.aperf.whole && cur.mperf.whole)
332 perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
333 else
334 perf_percent = 0;
335
336#endif
337
338 retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
339 277
340 return retval; 278 return retval;
341} 279}
@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
731 acpi_processor_notify_smm(THIS_MODULE); 669 acpi_processor_notify_smm(THIS_MODULE);
732 670
733 /* Check for APERF/MPERF support in hardware */ 671 /* Check for APERF/MPERF support in hardware */
734 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { 672 if (cpu_has(c, X86_FEATURE_APERFMPERF))
735 unsigned int ecx; 673 acpi_cpufreq_driver.getavg = get_measured_perf;
736 ecx = cpuid_ecx(6);
737 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
738 acpi_cpufreq_driver.getavg = get_measured_perf;
739 }
740 674
741 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 675 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
742 for (i = 0; i < perf->state_count; i++) 676 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b5..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
351 } 351 }
352 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
353 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
354 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
355 if (cpu_has_ds) { 361 if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2b..4ac6d48fe11b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
1obj-y = mce.o 1obj-y = mce.o mce-severity.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 3obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 6obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11 8
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For AMD Athlon/Duron: */
17static void k7_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 1; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57
58 /* Clear it: */
59 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
60 /* Serialize: */
61 wmb();
62 add_taint(TAINT_MACHINE_CHECK);
63 }
64 }
65
66 if (recover & 2)
67 panic("CPU context corrupt");
68 if (recover & 1)
69 panic("Unable to continue");
70
71 printk(KERN_EMERG "Attempting to continue.\n");
72
73 mcgstl &= ~(1<<2);
74 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
75}
76
77
78/* AMD K7 machine check is Intel like: */
79void amd_mcheck_init(struct cpuinfo_x86 *c)
80{
81 u32 l, h;
82 int i;
83
84 if (!cpu_has(c, X86_FEATURE_MCE))
85 return;
86
87 machine_check_vector = k7_machine_check;
88 /* Make sure the vector pointer is visible before we enable MCEs: */
89 wmb();
90
91 printk(KERN_INFO "Intel machine check architecture supported.\n");
92
93 rdmsr(MSR_IA32_MCG_CAP, l, h);
94 if (l & (1<<8)) /* Control register present ? */
95 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
96 nr_mce_banks = l & 0xff;
97
98 /*
99 * Clear status for MC index 0 separately, we don't touch CTL,
100 * as some K7 Athlons cause spurious MCEs when its enabled:
101 */
102 if (boot_cpu_data.x86 == 6) {
103 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
104 i = 1;
105 } else
106 i = 0;
107
108 for (; i < nr_mce_banks; i++) {
109 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
110 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
111 }
112
113 set_in_cr4(X86_CR4_MCE);
114 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
115 smp_processor_id());
116}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f09..7029f0e2acad 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/notifier.h>
22#include <linux/kdebug.h>
23#include <linux/cpu.h>
24#include <linux/sched.h>
21#include <asm/mce.h> 25#include <asm/mce.h>
26#include <asm/apic.h>
22 27
23/* Update fake mce registers on current CPU. */ 28/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m) 29static void inject_mce(struct mce *m)
@@ -39,44 +44,141 @@ static void inject_mce(struct mce *m)
39 i->finished = 1; 44 i->finished = 1;
40} 45}
41 46
42struct delayed_mce { 47static void raise_poll(struct mce *m)
43 struct timer_list timer; 48{
44 struct mce m; 49 unsigned long flags;
45}; 50 mce_banks_t b;
46 51
47/* Inject mce on current CPU */ 52 memset(&b, 0xff, sizeof(mce_banks_t));
48static void raise_mce(unsigned long data) 53 local_irq_save(flags);
54 machine_check_poll(0, &b);
55 local_irq_restore(flags);
56 m->finished = 0;
57}
58
59static void raise_exception(struct mce *m, struct pt_regs *pregs)
49{ 60{
50 struct delayed_mce *dm = (struct delayed_mce *)data; 61 struct pt_regs regs;
51 struct mce *m = &dm->m; 62 unsigned long flags;
52 int cpu = m->extcpu;
53 63
54 inject_mce(m); 64 if (!pregs) {
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs)); 65 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip; 66 regs.ip = m->ip;
59 regs.cs = m->cs; 67 regs.cs = m->cs;
68 pregs = &regs;
69 }
70 /* in mcheck exeception handler, irq will be disabled */
71 local_irq_save(flags);
72 do_machine_check(pregs, 0);
73 local_irq_restore(flags);
74 m->finished = 0;
75}
76
77static cpumask_t mce_inject_cpumask;
78
79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data)
81{
82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs);
90 else if (m->status)
91 raise_poll(m);
92 return NOTIFY_STOP;
93}
94
95static struct notifier_block mce_raise_nb = {
96 .notifier_call = mce_raise_notify,
97 .priority = 1000,
98};
99
100/* Inject mce on current CPU */
101static int raise_local(struct mce *m)
102{
103 int context = MCJ_CTX(m->inject_flags);
104 int ret = 0;
105 int cpu = m->extcpu;
106
107 if (m->inject_flags & MCJ_EXCEPTION) {
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); 108 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0); 109 switch (context) {
110 case MCJ_CTX_IRQ:
111 /*
112 * Could do more to fake interrupts like
113 * calling irq_enter, but the necessary
114 * machinery isn't exported currently.
115 */
116 /*FALL THROUGH*/
117 case MCJ_CTX_PROCESS:
118 raise_exception(m, NULL);
119 break;
120 default:
121 printk(KERN_INFO "Invalid MCE context\n");
122 ret = -EINVAL;
123 }
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); 124 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else { 125 } else if (m->status) {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); 126 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b); 127 raise_poll(m);
68 mce_notify_irq(); 128 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n", 129 printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
70 cpu); 130 } else
71 } 131 m->finished = 0;
72 kfree(dm); 132
133 return ret;
134}
135
136static void raise_mce(struct mce *m)
137{
138 int context = MCJ_CTX(m->inject_flags);
139
140 inject_mce(m);
141
142 if (context == MCJ_CTX_RANDOM)
143 return;
144
145#ifdef CONFIG_X86_LOCAL_APIC
146 if (m->inject_flags & MCJ_NMI_BROADCAST) {
147 unsigned long start;
148 int cpu;
149 get_online_cpus();
150 mce_inject_cpumask = cpu_online_map;
151 cpu_clear(get_cpu(), mce_inject_cpumask);
152 for_each_online_cpu(cpu) {
153 struct mce *mcpu = &per_cpu(injectm, cpu);
154 if (!mcpu->finished ||
155 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
156 cpu_clear(cpu, mce_inject_cpumask);
157 }
158 if (!cpus_empty(mce_inject_cpumask))
159 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
160 start = jiffies;
161 while (!cpus_empty(mce_inject_cpumask)) {
162 if (!time_before(jiffies, start + 2*HZ)) {
163 printk(KERN_ERR
164 "Timeout waiting for mce inject NMI %lx\n",
165 *cpus_addr(mce_inject_cpumask));
166 break;
167 }
168 cpu_relax();
169 }
170 raise_local(m);
171 put_cpu();
172 put_online_cpus();
173 } else
174#endif
175 raise_local(m);
73} 176}
74 177
75/* Error injection interface */ 178/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf, 179static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off) 180 size_t usize, loff_t *off)
78{ 181{
79 struct delayed_mce *dm;
80 struct mce m; 182 struct mce m;
81 183
82 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) 198 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL; 199 return -EINVAL;
98 200
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /* 201 /*
104 * Need to give user space some time to set everything up, 202 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere. 203 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */ 204 */
108 memcpy(&dm->m, &m, sizeof(struct mce)); 205 schedule_timeout(2);
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm); 206 raise_mce(&m);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize; 207 return usize;
113} 208}
114 209
@@ -116,6 +211,7 @@ static int inject_init(void)
116{ 211{
117 printk(KERN_INFO "Machine check injector initialized\n"); 212 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write; 213 mce_chrdev_ops.write = mce_write;
214 register_die_notifier(&mce_raise_nb);
119 return 0; 215 return 0;
120} 216}
121 217
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e5..32996f9fab67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
1#include <linux/sysdev.h>
1#include <asm/mce.h> 2#include <asm/mce.h>
2 3
3enum severity_level { 4enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
10 MCE_PANIC_SEVERITY, 11 MCE_PANIC_SEVERITY,
11}; 12};
12 13
14#define ATTR_LEN 16
15
16/* One object for each MCE bank, shared by all CPUs */
17struct mce_bank {
18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */
22};
23
13int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void);
14 26
15extern int mce_ser; 27extern int mce_ser;
28
29extern struct mce_bank *mce_banks;
30
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f97056..8a85dd1b1aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
139 } 139 }
140} 140}
141 141
142#ifdef CONFIG_DEBUG_FS
142static void *s_start(struct seq_file *f, loff_t *pos) 143static void *s_start(struct seq_file *f, loff_t *pos)
143{ 144{
144 if (*pos >= ARRAY_SIZE(severities)) 145 if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
197{ 198{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 199 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199 200
200 dmce = debugfs_create_dir("mce", NULL); 201 dmce = mce_get_debugfs_dir();
201 if (dmce == NULL) 202 if (dmce == NULL)
202 goto err_out; 203 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage", 204 fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
209 return 0; 210 return 0;
210 211
211err_out: 212err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM; 213 return -ENOMEM;
217} 214}
218late_initcall(severities_debugfs_init); 215late_initcall(severities_debugfs_init);
216#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9bfe9d2ea615..2f5aab26320e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/debugfs.h>
37 38
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
@@ -45,21 +46,8 @@
45 46
46#include "mce-internal.h" 47#include "mce-internal.h"
47 48
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly; 49int mce_disabled __read_mostly;
60 50
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227 51#define MISC_MCELOG_MINOR 227
64 52
65#define SPINUNIT 100 /* 100ns */ 53#define SPINUNIT 100 /* 100ns */
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
77 */ 65 */
78static int tolerant __read_mostly = 1; 66static int tolerant __read_mostly = 1;
79static int banks __read_mostly; 67static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly; 68static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1; 69static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1; 70static int monarch_timeout __read_mostly = -1;
@@ -87,13 +74,13 @@ int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly; 74int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly; 75int mce_ser __read_mostly;
89 76
77struct mce_bank *mce_banks __read_mostly;
78
90/* User mode helper program triggered by machine check event */ 79/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 80static unsigned long mce_need_notify;
92static char mce_helper[128]; 81static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL }; 82static char *mce_helper_argv[2] = { mce_helper, NULL };
94 83
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 84static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen); 85static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 86static int cpu_missing;
@@ -104,11 +91,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 91 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105}; 92};
106 93
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work); 94static DEFINE_PER_CPU(struct work_struct, mce_work);
113 95
114/* Do initial initialization of a struct mce */ 96/* Do initial initialization of a struct mce */
@@ -232,6 +214,9 @@ static void print_mce_tail(void)
232 214
233static atomic_t mce_paniced; 215static atomic_t mce_paniced;
234 216
217static int fake_panic;
218static atomic_t mce_fake_paniced;
219
235/* Panic in progress. Enable interrupts and wait for final IPI */ 220/* Panic in progress. Enable interrupts and wait for final IPI */
236static void wait_for_panic(void) 221static void wait_for_panic(void)
237{ 222{
@@ -249,15 +234,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
249{ 234{
250 int i; 235 int i;
251 236
252 /* 237 if (!fake_panic) {
253 * Make sure only one CPU runs in machine check panic 238 /*
254 */ 239 * Make sure only one CPU runs in machine check panic
255 if (atomic_add_return(1, &mce_paniced) > 1) 240 */
256 wait_for_panic(); 241 if (atomic_inc_return(&mce_paniced) > 1)
257 barrier(); 242 wait_for_panic();
243 barrier();
258 244
259 bust_spinlocks(1); 245 bust_spinlocks(1);
260 console_verbose(); 246 console_verbose();
247 } else {
248 /* Don't log too much for fake panic */
249 if (atomic_inc_return(&mce_fake_paniced) > 1)
250 return;
251 }
261 print_mce_head(); 252 print_mce_head();
262 /* First print corrected ones that are still unlogged */ 253 /* First print corrected ones that are still unlogged */
263 for (i = 0; i < MCE_LOG_LEN; i++) { 254 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -284,9 +275,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
284 print_mce_tail(); 275 print_mce_tail();
285 if (exp) 276 if (exp)
286 printk(KERN_EMERG "Machine check: %s\n", exp); 277 printk(KERN_EMERG "Machine check: %s\n", exp);
287 if (panic_timeout == 0) 278 if (!fake_panic) {
288 panic_timeout = mce_panic_timeout; 279 if (panic_timeout == 0)
289 panic(msg); 280 panic_timeout = mce_panic_timeout;
281 panic(msg);
282 } else
283 printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
290} 284}
291 285
292/* Support code for software error injection */ 286/* Support code for software error injection */
@@ -296,11 +290,11 @@ static int msr_to_offset(u32 msr)
296 unsigned bank = __get_cpu_var(injectm.bank); 290 unsigned bank = __get_cpu_var(injectm.bank);
297 if (msr == rip_msr) 291 if (msr == rip_msr)
298 return offsetof(struct mce, ip); 292 return offsetof(struct mce, ip);
299 if (msr == MSR_IA32_MC0_STATUS + bank*4) 293 if (msr == MSR_IA32_MCx_STATUS(bank))
300 return offsetof(struct mce, status); 294 return offsetof(struct mce, status);
301 if (msr == MSR_IA32_MC0_ADDR + bank*4) 295 if (msr == MSR_IA32_MCx_ADDR(bank))
302 return offsetof(struct mce, addr); 296 return offsetof(struct mce, addr);
303 if (msr == MSR_IA32_MC0_MISC + bank*4) 297 if (msr == MSR_IA32_MCx_MISC(bank))
304 return offsetof(struct mce, misc); 298 return offsetof(struct mce, misc);
305 if (msr == MSR_IA32_MCG_STATUS) 299 if (msr == MSR_IA32_MCG_STATUS)
306 return offsetof(struct mce, mcgstatus); 300 return offsetof(struct mce, mcgstatus);
@@ -505,7 +499,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
505 499
506 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 500 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
507 for (i = 0; i < banks; i++) { 501 for (i = 0; i < banks; i++) {
508 if (!bank[i] || !test_bit(i, *b)) 502 if (!mce_banks[i].ctl || !test_bit(i, *b))
509 continue; 503 continue;
510 504
511 m.misc = 0; 505 m.misc = 0;
@@ -514,7 +508,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
514 m.tsc = 0; 508 m.tsc = 0;
515 509
516 barrier(); 510 barrier();
517 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 511 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
518 if (!(m.status & MCI_STATUS_VAL)) 512 if (!(m.status & MCI_STATUS_VAL))
519 continue; 513 continue;
520 514
@@ -529,9 +523,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
529 continue; 523 continue;
530 524
531 if (m.status & MCI_STATUS_MISCV) 525 if (m.status & MCI_STATUS_MISCV)
532 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 526 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
533 if (m.status & MCI_STATUS_ADDRV) 527 if (m.status & MCI_STATUS_ADDRV)
534 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 528 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
535 529
536 if (!(flags & MCP_TIMESTAMP)) 530 if (!(flags & MCP_TIMESTAMP))
537 m.tsc = 0; 531 m.tsc = 0;
@@ -547,7 +541,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
547 /* 541 /*
548 * Clear state for this bank. 542 * Clear state for this bank.
549 */ 543 */
550 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 544 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
551 } 545 }
552 546
553 /* 547 /*
@@ -568,7 +562,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
568 int i; 562 int i;
569 563
570 for (i = 0; i < banks; i++) { 564 for (i = 0; i < banks; i++) {
571 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 565 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
572 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 566 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
573 return 1; 567 return 1;
574 } 568 }
@@ -628,7 +622,7 @@ out:
628 * This way we prevent any potential data corruption in a unrecoverable case 622 * This way we prevent any potential data corruption in a unrecoverable case
629 * and also makes sure always all CPU's errors are examined. 623 * and also makes sure always all CPU's errors are examined.
630 * 624 *
631 * Also this detects the case of an machine check event coming from outer 625 * Also this detects the case of a machine check event coming from outer
632 * space (not detected by any CPUs) In this case some external agent wants 626 * space (not detected by any CPUs) In this case some external agent wants
633 * us to shut down, so panic too. 627 * us to shut down, so panic too.
634 * 628 *
@@ -681,7 +675,7 @@ static void mce_reign(void)
681 * No machine check event found. Must be some external 675 * No machine check event found. Must be some external
682 * source or one CPU is hung. Panic. 676 * source or one CPU is hung. Panic.
683 */ 677 */
684 if (!m && tolerant < 3) 678 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
685 mce_panic("Machine check from unknown source", NULL, NULL); 679 mce_panic("Machine check from unknown source", NULL, NULL);
686 680
687 /* 681 /*
@@ -715,7 +709,7 @@ static int mce_start(int *no_way_out)
715 * global_nwo should be updated before mce_callin 709 * global_nwo should be updated before mce_callin
716 */ 710 */
717 smp_wmb(); 711 smp_wmb();
718 order = atomic_add_return(1, &mce_callin); 712 order = atomic_inc_return(&mce_callin);
719 713
720 /* 714 /*
721 * Wait for everyone. 715 * Wait for everyone.
@@ -852,7 +846,7 @@ static void mce_clear_state(unsigned long *toclear)
852 846
853 for (i = 0; i < banks; i++) { 847 for (i = 0; i < banks; i++) {
854 if (test_bit(i, toclear)) 848 if (test_bit(i, toclear))
855 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 849 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
856 } 850 }
857} 851}
858 852
@@ -905,11 +899,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
905 mce_setup(&m); 899 mce_setup(&m);
906 900
907 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 901 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
908 no_way_out = mce_no_way_out(&m, &msg);
909
910 final = &__get_cpu_var(mces_seen); 902 final = &__get_cpu_var(mces_seen);
911 *final = m; 903 *final = m;
912 904
905 no_way_out = mce_no_way_out(&m, &msg);
906
913 barrier(); 907 barrier();
914 908
915 /* 909 /*
@@ -926,14 +920,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 order = mce_start(&no_way_out); 920 order = mce_start(&no_way_out);
927 for (i = 0; i < banks; i++) { 921 for (i = 0; i < banks; i++) {
928 __clear_bit(i, toclear); 922 __clear_bit(i, toclear);
929 if (!bank[i]) 923 if (!mce_banks[i].ctl)
930 continue; 924 continue;
931 925
932 m.misc = 0; 926 m.misc = 0;
933 m.addr = 0; 927 m.addr = 0;
934 m.bank = i; 928 m.bank = i;
935 929
936 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 930 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
937 if ((m.status & MCI_STATUS_VAL) == 0) 931 if ((m.status & MCI_STATUS_VAL) == 0)
938 continue; 932 continue;
939 933
@@ -974,9 +968,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
974 kill_it = 1; 968 kill_it = 1;
975 969
976 if (m.status & MCI_STATUS_MISCV) 970 if (m.status & MCI_STATUS_MISCV)
977 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 971 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
978 if (m.status & MCI_STATUS_ADDRV) 972 if (m.status & MCI_STATUS_ADDRV)
979 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 973 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
980 974
981 /* 975 /*
982 * Action optional error. Queue address for later processing. 976 * Action optional error. Queue address for later processing.
@@ -1101,7 +1095,7 @@ void mce_log_therm_throt_event(__u64 status)
1101 */ 1095 */
1102static int check_interval = 5 * 60; /* 5 minutes */ 1096static int check_interval = 5 * 60; /* 5 minutes */
1103 1097
1104static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1098static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1105static DEFINE_PER_CPU(struct timer_list, mce_timer); 1099static DEFINE_PER_CPU(struct timer_list, mce_timer);
1106 1100
1107static void mcheck_timer(unsigned long data) 1101static void mcheck_timer(unsigned long data)
@@ -1120,7 +1114,7 @@ static void mcheck_timer(unsigned long data)
1120 * Alert userspace if needed. If we logged an MCE, reduce the 1114 * Alert userspace if needed. If we logged an MCE, reduce the
1121 * polling interval, otherwise increase the polling interval. 1115 * polling interval, otherwise increase the polling interval.
1122 */ 1116 */
1123 n = &__get_cpu_var(next_interval); 1117 n = &__get_cpu_var(mce_next_interval);
1124 if (mce_notify_irq()) 1118 if (mce_notify_irq())
1125 *n = max(*n/2, HZ/100); 1119 *n = max(*n/2, HZ/100);
1126 else 1120 else
@@ -1169,10 +1163,25 @@ int mce_notify_irq(void)
1169} 1163}
1170EXPORT_SYMBOL_GPL(mce_notify_irq); 1164EXPORT_SYMBOL_GPL(mce_notify_irq);
1171 1165
1166static int mce_banks_init(void)
1167{
1168 int i;
1169
1170 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1171 if (!mce_banks)
1172 return -ENOMEM;
1173 for (i = 0; i < banks; i++) {
1174 struct mce_bank *b = &mce_banks[i];
1175 b->ctl = -1ULL;
1176 b->init = 1;
1177 }
1178 return 0;
1179}
1180
1172/* 1181/*
1173 * Initialize Machine Checks for a CPU. 1182 * Initialize Machine Checks for a CPU.
1174 */ 1183 */
1175static int mce_cap_init(void) 1184static int __cpuinit mce_cap_init(void)
1176{ 1185{
1177 unsigned b; 1186 unsigned b;
1178 u64 cap; 1187 u64 cap;
@@ -1192,11 +1201,10 @@ static int mce_cap_init(void)
1192 /* Don't support asymmetric configurations today */ 1201 /* Don't support asymmetric configurations today */
1193 WARN_ON(banks != 0 && b != banks); 1202 WARN_ON(banks != 0 && b != banks);
1194 banks = b; 1203 banks = b;
1195 if (!bank) { 1204 if (!mce_banks) {
1196 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1205 int err = mce_banks_init();
1197 if (!bank) 1206 if (err)
1198 return -ENOMEM; 1207 return err;
1199 memset(bank, 0xff, banks * sizeof(u64));
1200 } 1208 }
1201 1209
1202 /* Use accurate RIP reporting if available. */ 1210 /* Use accurate RIP reporting if available. */
@@ -1228,15 +1236,16 @@ static void mce_init(void)
1228 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1236 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1229 1237
1230 for (i = 0; i < banks; i++) { 1238 for (i = 0; i < banks; i++) {
1231 if (skip_bank_init(i)) 1239 struct mce_bank *b = &mce_banks[i];
1240 if (!b->init)
1232 continue; 1241 continue;
1233 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1242 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1234 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1243 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1235 } 1244 }
1236} 1245}
1237 1246
1238/* Add per CPU specific workarounds here */ 1247/* Add per CPU specific workarounds here */
1239static int mce_cpu_quirks(struct cpuinfo_x86 *c) 1248static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1240{ 1249{
1241 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1250 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1242 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1251 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1251,7 +1260,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1251 * trips off incorrectly with the IOMMU & 3ware 1260 * trips off incorrectly with the IOMMU & 3ware
1252 * & Cerberus: 1261 * & Cerberus:
1253 */ 1262 */
1254 clear_bit(10, (unsigned long *)&bank[4]); 1263 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1255 } 1264 }
1256 if (c->x86 <= 17 && mce_bootlog < 0) { 1265 if (c->x86 <= 17 && mce_bootlog < 0) {
1257 /* 1266 /*
@@ -1265,7 +1274,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1265 * by default. 1274 * by default.
1266 */ 1275 */
1267 if (c->x86 == 6 && banks > 0) 1276 if (c->x86 == 6 && banks > 0)
1268 bank[0] = 0; 1277 mce_banks[0].ctl = 0;
1269 } 1278 }
1270 1279
1271 if (c->x86_vendor == X86_VENDOR_INTEL) { 1280 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1278,8 +1287,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1278 * valid event later, merely don't write CTL0. 1287 * valid event later, merely don't write CTL0.
1279 */ 1288 */
1280 1289
1281 if (c->x86 == 6 && c->x86_model < 0x1A) 1290 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1282 __set_bit(0, &dont_init_banks); 1291 mce_banks[0].init = 0;
1283 1292
1284 /* 1293 /*
1285 * All newer Intel systems support MCE broadcasting. Enable 1294 * All newer Intel systems support MCE broadcasting. Enable
@@ -1335,7 +1344,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1335static void mce_init_timer(void) 1344static void mce_init_timer(void)
1336{ 1345{
1337 struct timer_list *t = &__get_cpu_var(mce_timer); 1346 struct timer_list *t = &__get_cpu_var(mce_timer);
1338 int *n = &__get_cpu_var(next_interval); 1347 int *n = &__get_cpu_var(mce_next_interval);
1339 1348
1340 if (mce_ignore_ce) 1349 if (mce_ignore_ce)
1341 return; 1350 return;
@@ -1348,6 +1357,17 @@ static void mce_init_timer(void)
1348 add_timer_on(t, smp_processor_id()); 1357 add_timer_on(t, smp_processor_id());
1349} 1358}
1350 1359
1360/* Handle unconfigured int18 (should never happen) */
1361static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1362{
1363 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1364 smp_processor_id());
1365}
1366
1367/* Call the installed machine check handler for this CPU setup. */
1368void (*machine_check_vector)(struct pt_regs *, long error_code) =
1369 unexpected_machine_check;
1370
1351/* 1371/*
1352 * Called for each booted CPU to set up machine checks. 1372 * Called for each booted CPU to set up machine checks.
1353 * Must be called with preempt off: 1373 * Must be called with preempt off:
@@ -1561,8 +1581,10 @@ static struct miscdevice mce_log_device = {
1561 */ 1581 */
1562static int __init mcheck_enable(char *str) 1582static int __init mcheck_enable(char *str)
1563{ 1583{
1564 if (*str == 0) 1584 if (*str == 0) {
1565 enable_p5_mce(); 1585 enable_p5_mce();
1586 return 1;
1587 }
1566 if (*str == '=') 1588 if (*str == '=')
1567 str++; 1589 str++;
1568 if (!strcmp(str, "off")) 1590 if (!strcmp(str, "off"))
@@ -1603,8 +1625,9 @@ static int mce_disable(void)
1603 int i; 1625 int i;
1604 1626
1605 for (i = 0; i < banks; i++) { 1627 for (i = 0; i < banks; i++) {
1606 if (!skip_bank_init(i)) 1628 struct mce_bank *b = &mce_banks[i];
1607 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1629 if (b->init)
1630 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1608 } 1631 }
1609 return 0; 1632 return 0;
1610} 1633}
@@ -1679,14 +1702,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
1679__cpuinitdata 1702__cpuinitdata
1680void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1703void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1681 1704
1682static struct sysdev_attribute *bank_attrs; 1705static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1706{
1707 return container_of(attr, struct mce_bank, attr);
1708}
1683 1709
1684static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1710static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1685 char *buf) 1711 char *buf)
1686{ 1712{
1687 u64 b = bank[attr - bank_attrs]; 1713 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1688
1689 return sprintf(buf, "%llx\n", b);
1690} 1714}
1691 1715
1692static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1716static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1697,7 +1721,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1697 if (strict_strtoull(buf, 0, &new) < 0) 1721 if (strict_strtoull(buf, 0, &new) < 0)
1698 return -EINVAL; 1722 return -EINVAL;
1699 1723
1700 bank[attr - bank_attrs] = new; 1724 attr_to_bank(attr)->ctl = new;
1701 mce_restart(); 1725 mce_restart();
1702 1726
1703 return size; 1727 return size;
@@ -1839,7 +1863,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1839 } 1863 }
1840 for (j = 0; j < banks; j++) { 1864 for (j = 0; j < banks; j++) {
1841 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1865 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1842 &bank_attrs[j]); 1866 &mce_banks[j].attr);
1843 if (err) 1867 if (err)
1844 goto error2; 1868 goto error2;
1845 } 1869 }
@@ -1848,10 +1872,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1848 return 0; 1872 return 0;
1849error2: 1873error2:
1850 while (--j >= 0) 1874 while (--j >= 0)
1851 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); 1875 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1852error: 1876error:
1853 while (--i >= 0) 1877 while (--i >= 0)
1854 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1878 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1855 1879
1856 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1880 sysdev_unregister(&per_cpu(mce_dev, cpu));
1857 1881
@@ -1869,7 +1893,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1869 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1893 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1870 1894
1871 for (i = 0; i < banks; i++) 1895 for (i = 0; i < banks; i++)
1872 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1896 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1873 1897
1874 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1898 sysdev_unregister(&per_cpu(mce_dev, cpu));
1875 cpumask_clear_cpu(cpu, mce_dev_initialized); 1899 cpumask_clear_cpu(cpu, mce_dev_initialized);
@@ -1886,8 +1910,9 @@ static void mce_disable_cpu(void *h)
1886 if (!(action & CPU_TASKS_FROZEN)) 1910 if (!(action & CPU_TASKS_FROZEN))
1887 cmci_clear(); 1911 cmci_clear();
1888 for (i = 0; i < banks; i++) { 1912 for (i = 0; i < banks; i++) {
1889 if (!skip_bank_init(i)) 1913 struct mce_bank *b = &mce_banks[i];
1890 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1914 if (b->init)
1915 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1891 } 1916 }
1892} 1917}
1893 1918
@@ -1902,8 +1927,9 @@ static void mce_reenable_cpu(void *h)
1902 if (!(action & CPU_TASKS_FROZEN)) 1927 if (!(action & CPU_TASKS_FROZEN))
1903 cmci_reenable(); 1928 cmci_reenable();
1904 for (i = 0; i < banks; i++) { 1929 for (i = 0; i < banks; i++) {
1905 if (!skip_bank_init(i)) 1930 struct mce_bank *b = &mce_banks[i];
1906 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1931 if (b->init)
1932 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1907 } 1933 }
1908} 1934}
1909 1935
@@ -1935,7 +1961,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1935 case CPU_DOWN_FAILED: 1961 case CPU_DOWN_FAILED:
1936 case CPU_DOWN_FAILED_FROZEN: 1962 case CPU_DOWN_FAILED_FROZEN:
1937 t->expires = round_jiffies(jiffies + 1963 t->expires = round_jiffies(jiffies +
1938 __get_cpu_var(next_interval)); 1964 __get_cpu_var(mce_next_interval));
1939 add_timer_on(t, cpu); 1965 add_timer_on(t, cpu);
1940 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1966 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1941 break; 1967 break;
@@ -1951,35 +1977,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1951 .notifier_call = mce_cpu_callback, 1977 .notifier_call = mce_cpu_callback,
1952}; 1978};
1953 1979
1954static __init int mce_init_banks(void) 1980static __init void mce_init_banks(void)
1955{ 1981{
1956 int i; 1982 int i;
1957 1983
1958 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1959 GFP_KERNEL);
1960 if (!bank_attrs)
1961 return -ENOMEM;
1962
1963 for (i = 0; i < banks; i++) { 1984 for (i = 0; i < banks; i++) {
1964 struct sysdev_attribute *a = &bank_attrs[i]; 1985 struct mce_bank *b = &mce_banks[i];
1986 struct sysdev_attribute *a = &b->attr;
1965 1987
1966 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1988 a->attr.name = b->attrname;
1967 if (!a->attr.name) 1989 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
1968 goto nomem;
1969 1990
1970 a->attr.mode = 0644; 1991 a->attr.mode = 0644;
1971 a->show = show_bank; 1992 a->show = show_bank;
1972 a->store = set_bank; 1993 a->store = set_bank;
1973 } 1994 }
1974 return 0;
1975
1976nomem:
1977 while (--i >= 0)
1978 kfree(bank_attrs[i].attr.name);
1979 kfree(bank_attrs);
1980 bank_attrs = NULL;
1981
1982 return -ENOMEM;
1983} 1995}
1984 1996
1985static __init int mce_init_device(void) 1997static __init int mce_init_device(void)
@@ -1992,9 +2004,7 @@ static __init int mce_init_device(void)
1992 2004
1993 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2005 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1994 2006
1995 err = mce_init_banks(); 2007 mce_init_banks();
1996 if (err)
1997 return err;
1998 2008
1999 err = sysdev_class_register(&mce_sysclass); 2009 err = sysdev_class_register(&mce_sysclass);
2000 if (err) 2010 if (err)
@@ -2014,57 +2024,65 @@ static __init int mce_init_device(void)
2014 2024
2015device_initcall(mce_init_device); 2025device_initcall(mce_init_device);
2016 2026
2017#else /* CONFIG_X86_OLD_MCE: */ 2027/*
2018 2028 * Old style boot options parsing. Only for compatibility.
2019int nr_mce_banks; 2029 */
2020EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2030static int __init mcheck_disable(char *str)
2031{
2032 mce_disabled = 1;
2033 return 1;
2034}
2035__setup("nomce", mcheck_disable);
2021 2036
2022/* This has to be run for each processor */ 2037#ifdef CONFIG_DEBUG_FS
2023void mcheck_init(struct cpuinfo_x86 *c) 2038struct dentry *mce_get_debugfs_dir(void)
2024{ 2039{
2025 if (mce_disabled) 2040 static struct dentry *dmce;
2026 return;
2027 2041
2028 switch (c->x86_vendor) { 2042 if (!dmce)
2029 case X86_VENDOR_AMD: 2043 dmce = debugfs_create_dir("mce", NULL);
2030 amd_mcheck_init(c);
2031 break;
2032 2044
2033 case X86_VENDOR_INTEL: 2045 return dmce;
2034 if (c->x86 == 5) 2046}
2035 intel_p5_mcheck_init(c);
2036 if (c->x86 == 6)
2037 intel_p6_mcheck_init(c);
2038 if (c->x86 == 15)
2039 intel_p4_mcheck_init(c);
2040 break;
2041 2047
2042 case X86_VENDOR_CENTAUR: 2048static void mce_reset(void)
2043 if (c->x86 == 5) 2049{
2044 winchip_mcheck_init(c); 2050 cpu_missing = 0;
2045 break; 2051 atomic_set(&mce_fake_paniced, 0);
2052 atomic_set(&mce_executing, 0);
2053 atomic_set(&mce_callin, 0);
2054 atomic_set(&global_nwo, 0);
2055}
2046 2056
2047 default: 2057static int fake_panic_get(void *data, u64 *val)
2048 break; 2058{
2049 } 2059 *val = fake_panic;
2050 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2060 return 0;
2051} 2061}
2052 2062
2053static int __init mcheck_enable(char *str) 2063static int fake_panic_set(void *data, u64 val)
2054{ 2064{
2055 mce_p5_enabled = 1; 2065 mce_reset();
2056 return 1; 2066 fake_panic = val;
2067 return 0;
2057} 2068}
2058__setup("mce", mcheck_enable);
2059 2069
2060#endif /* CONFIG_X86_OLD_MCE */ 2070DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2071 fake_panic_set, "%llu\n");
2061 2072
2062/* 2073static int __init mce_debugfs_init(void)
2063 * Old style boot options parsing. Only for compatibility.
2064 */
2065static int __init mcheck_disable(char *str)
2066{ 2074{
2067 mce_disabled = 1; 2075 struct dentry *dmce, *ffake_panic;
2068 return 1; 2076
2077 dmce = mce_get_debugfs_dir();
2078 if (!dmce)
2079 return -ENOMEM;
2080 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2081 &fake_panic_fops);
2082 if (!ffake_panic)
2083 return -ENOMEM;
2084
2085 return 0;
2069} 2086}
2070__setup("nomce", mcheck_disable); 2087late_initcall(mce_debugfs_init);
2088#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1fecba404fd8..8cd5224943b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a32..889f665fe93d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot)
90 if (test_bit(i, owned)) 90 if (test_bit(i, owned))
91 continue; 91 continue;
92 92
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 93 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
94 94
95 /* Already owned by someone else? */ 95 /* Already owned by someone else? */
96 if (val & CMCI_EN) { 96 if (val & CMCI_EN) {
@@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot)
101 } 101 }
102 102
103 val |= CMCI_EN | CMCI_THRESHOLD; 103 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 104 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 105 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
106 106
107 /* Did the enable bit stick? -- the bank supports CMCI */ 107 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) { 108 if (val & CMCI_EN) {
@@ -152,9 +152,9 @@ void cmci_clear(void)
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 152 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue; 153 continue;
154 /* Disable CMCI */ 154 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 155 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 157 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 158 __clear_bit(i, __get_cpu_var(mce_banks_owned));
159 } 159 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags); 160 spin_unlock_irqrestore(&cmci_discover_lock, flags);
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9#include <linux/interrupt.h>
10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17
18#include <asm/processor.h>
19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h>
22
23static int firstbank;
24
25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27static void mce_checkregs(void *info)
28{
29 u32 low, high;
30 int i;
31
32 for (i = firstbank; i < nr_mce_banks; i++) {
33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
34
35 if (!(high & (1<<31)))
36 continue;
37
38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
40 smp_processor_id());
41
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /*
45 * Scrub the error so we don't pick it up in MCE_RATE
46 * seconds time:
47 */
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
49
50 /* Serialize: */
51 wmb();
52 add_taint(TAINT_MACHINE_CHECK);
53 }
54}
55
56static void mce_work_fn(struct work_struct *work);
57static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
58
59static void mce_work_fn(struct work_struct *work)
60{
61 on_each_cpu(mce_checkregs, NULL, 1);
62 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
63}
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0;
91}
92module_init(init_nonfatal_mce_checker);
93
94MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/smp.h>
8
9#include <asm/processor.h>
10#include <asm/mce.h>
11#include <asm/msr.h>
12
13/* as supported by the P4/Xeon family */
14struct intel_mce_extended_msrs {
15 u32 eax;
16 u32 ebx;
17 u32 ecx;
18 u32 edx;
19 u32 esi;
20 u32 edi;
21 u32 ebp;
22 u32 esp;
23 u32 eflags;
24 u32 eip;
25 /* u32 *reserved[]; */
26};
27
28static int mce_num_extended_msrs;
29
30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
32{
33 u32 h;
34
35 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
36 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
37 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
38 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
39 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
40 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
41 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
42 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
43 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
44 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
45}
46
47static void intel_machine_check(struct pt_regs *regs, long error_code)
48{
49 u32 alow, ahigh, high, low;
50 u32 mcgstl, mcgsth;
51 int recover = 1;
52 int i;
53
54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
55 if (mcgstl & (1<<0)) /* Recoverable ? */
56 recover = 0;
57
58 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
59 smp_processor_id(), mcgsth, mcgstl);
60
61 if (mce_num_extended_msrs > 0) {
62 struct intel_mce_extended_msrs dbg;
63
64 intel_get_extended_msrs(&dbg);
65
66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
69 smp_processor_id(), dbg.eip, dbg.eflags,
70 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
71 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
72 }
73
74 for (i = 0; i < nr_mce_banks; i++) {
75 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
76 if (high & (1<<31)) {
77 char misc[20];
78 char addr[24];
79
80 misc[0] = addr[0] = '\0';
81 if (high & (1<<29))
82 recover |= 1;
83 if (high & (1<<25))
84 recover |= 2;
85 high &= ~(1<<31);
86 if (high & (1<<27)) {
87 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
88 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
89 }
90 if (high & (1<<26)) {
91 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
92 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
93 }
94 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
95 smp_processor_id(), i, high, low, misc, addr);
96 }
97 }
98
99 if (recover & 2)
100 panic("CPU context corrupt");
101 if (recover & 1)
102 panic("Unable to continue");
103
104 printk(KERN_EMERG "Attempting to continue.\n");
105
106 /*
107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
108 * recoverable/continuable.This will allow BIOS to look at the MSRs
109 * for errors if the OS could not log the error.
110 */
111 for (i = 0; i < nr_mce_banks; i++) {
112 u32 msr;
113 msr = MSR_IA32_MC0_STATUS+i*4;
114 rdmsr(msr, low, high);
115 if (high&(1<<31)) {
116 /* Clear it */
117 wrmsr(msr, 0UL, 0UL);
118 /* Serialize */
119 wmb();
120 add_taint(TAINT_MACHINE_CHECK);
121 }
122 }
123 mcgstl &= ~(1<<2);
124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
125}
126
127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
128{
129 u32 l, h;
130 int i;
131
132 machine_check_vector = intel_machine_check;
133 wmb();
134
135 printk(KERN_INFO "Intel machine check architecture supported.\n");
136 rdmsr(MSR_IA32_MCG_CAP, l, h);
137 if (l & (1<<8)) /* Control register present ? */
138 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
139 nr_mce_banks = l & 0xff;
140
141 for (i = 0; i < nr_mce_banks; i++) {
142 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
143 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
144 }
145
146 set_in_cr4(X86_CR4_MCE);
147 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
148 smp_processor_id());
149
150 /* Check for P4/Xeon extended MCE MSRs */
151 rdmsr(MSR_IA32_MCG_CAP, l, h);
152 if (l & (1<<9)) {/* MCG_EXT_P */
153 mce_num_extended_msrs = (l >> 16) & 0xff;
154 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
155 " available\n",
156 smp_processor_id(), mce_num_extended_msrs);
157
158#ifdef CONFIG_X86_MCE_P4THERMAL
159 /* Check for P4/Xeon Thermal monitor */
160 intel_init_thermal(c);
161#endif
162 }
163}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For PII/PIII */
17static void intel_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 0; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57 }
58 }
59
60 if (recover & 2)
61 panic("CPU context corrupt");
62 if (recover & 1)
63 panic("Unable to continue");
64
65 printk(KERN_EMERG "Attempting to continue.\n");
66 /*
67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
68 * recoverable/continuable.This will allow BIOS to look at the MSRs
69 * for errors if the OS could not log the error:
70 */
71 for (i = 0; i < nr_mce_banks; i++) {
72 unsigned int msr;
73
74 msr = MSR_IA32_MC0_STATUS+i*4;
75 rdmsr(msr, low, high);
76 if (high & (1<<31)) {
77 /* Clear it: */
78 wrmsr(msr, 0UL, 0UL);
79 /* Serialize: */
80 wmb();
81 add_taint(TAINT_MACHINE_CHECK);
82 }
83 }
84 mcgstl &= ~(1<<2);
85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
86}
87
88/* Set up machine check reporting for processors with Intel style MCE: */
89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
90{
91 u32 l, h;
92 int i;
93
94 /* Check for MCE support */
95 if (!cpu_has(c, X86_FEATURE_MCE))
96 return;
97
98 /* Check for PPro style MCA */
99 if (!cpu_has(c, X86_FEATURE_MCA))
100 return;
101
102 /* Ok machine check is available */
103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
105 wmb();
106
107 printk(KERN_INFO "Intel machine check architecture supported.\n");
108 rdmsr(MSR_IA32_MCG_CAP, l, h);
109 if (l & (1<<8)) /* Control register present ? */
110 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
111 nr_mce_banks = l & 0xff;
112
113 /*
114 * Following the example in IA-32 SDM Vol 3:
115 * - MC0_CTL should not be written
116 * - Status registers on all banks should be cleared on reset
117 */
118 for (i = 1; i < nr_mce_banks; i++)
119 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
120
121 for (i = 0; i < nr_mce_banks; i++)
122 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
123
124 set_in_cr4(X86_CR4_MCE);
125 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
126 smp_processor_id());
127}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e5173..63a56d147e4a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -260,9 +260,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
260 return; 260 return;
261 } 261 }
262 262
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */ 263 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) { 264 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG 265 printk(KERN_DEBUG
@@ -271,6 +268,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
271 return; 268 return;
272 } 269 }
273 270
271 /* early Pentium M models use different method for enabling TM2 */
272 if (cpu_has(c, X86_FEATURE_TM2)) {
273 if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
274 rdmsr(MSR_THERM2_CTL, l, h);
275 if (l & MSR_THERM2_CTL_TM_SELECT)
276 tm2 = 1;
277 } else if (l & MSR_IA32_MISC_ENABLE_TM2)
278 tm2 = 1;
279 }
280
274 /* We'll mask the thermal vector in the lapic till we're ready: */ 281 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 282 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h); 283 apic_write(APIC_LVTTHMR, h);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7af0f88a4163..84e83de54575 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
58static DEFINE_MUTEX(mtrr_mutex); 58static DEFINE_MUTEX(mtrr_mutex);
59 59
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init;
61 62
62static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
63 64
@@ -163,7 +164,10 @@ static void ipi_handler(void *info)
163 if (data->smp_reg != ~0U) { 164 if (data->smp_reg != ~0U) {
164 mtrr_if->set(data->smp_reg, data->smp_base, 165 mtrr_if->set(data->smp_reg, data->smp_base,
165 data->smp_size, data->smp_type); 166 data->smp_size, data->smp_type);
166 } else { 167 } else if (mtrr_aps_delayed_init) {
168 /*
169 * Initialize the MTRRs inaddition to the synchronisation.
170 */
167 mtrr_if->set_all(); 171 mtrr_if->set_all();
168 } 172 }
169 173
@@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
265 */ 269 */
266 if (reg != ~0U) 270 if (reg != ~0U)
267 mtrr_if->set(reg, base, size, type); 271 mtrr_if->set(reg, base, size, type);
272 else if (!mtrr_aps_delayed_init)
273 mtrr_if->set_all();
268 274
269 /* Wait for the others */ 275 /* Wait for the others */
270 while (atomic_read(&data.count)) 276 while (atomic_read(&data.count))
@@ -721,9 +727,7 @@ void __init mtrr_bp_init(void)
721 727
722void mtrr_ap_init(void) 728void mtrr_ap_init(void)
723{ 729{
724 unsigned long flags; 730 if (!use_intel() || mtrr_aps_delayed_init)
725
726 if (!mtrr_if || !use_intel())
727 return; 731 return;
728 /* 732 /*
729 * Ideally we should hold mtrr_mutex here to avoid mtrr entries 733 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
@@ -738,11 +742,7 @@ void mtrr_ap_init(void)
738 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 742 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
739 * lock to prevent mtrr entry changes 743 * lock to prevent mtrr entry changes
740 */ 744 */
741 local_irq_save(flags); 745 set_mtrr(~0U, 0, 0, 0);
742
743 mtrr_if->set_all();
744
745 local_irq_restore(flags);
746} 746}
747 747
748/** 748/**
@@ -753,6 +753,34 @@ void mtrr_save_state(void)
753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
754} 754}
755 755
756void set_mtrr_aps_delayed_init(void)
757{
758 if (!use_intel())
759 return;
760
761 mtrr_aps_delayed_init = true;
762}
763
764/*
765 * MTRR initialization for all AP's
766 */
767void mtrr_aps_init(void)
768{
769 if (!use_intel())
770 return;
771
772 set_mtrr(~0U, 0, 0, 0);
773 mtrr_aps_delayed_init = false;
774}
775
776void mtrr_bp_restore(void)
777{
778 if (!use_intel())
779 return;
780
781 mtrr_if->set_all();
782}
783
756static int __init mtrr_init_finialize(void) 784static int __init mtrr_init_finialize(void)
757{ 785{
758 if (!mtrr_if) 786 if (!mtrr_if)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f9cd0849bd42..2732e2c1e4d3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1211,7 +1211,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
1211 x86_pmu_disable_counter(hwc, idx); 1211 x86_pmu_disable_counter(hwc, idx);
1212} 1212}
1213 1213
1214static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 1214static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1215 1215
1216/* 1216/*
1217 * Set the next IRQ period, based on the hwc->period_left value. 1217 * Set the next IRQ period, based on the hwc->period_left value.
@@ -1253,7 +1253,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1253 if (left > x86_pmu.max_period) 1253 if (left > x86_pmu.max_period)
1254 left = x86_pmu.max_period; 1254 left = x86_pmu.max_period;
1255 1255
1256 per_cpu(prev_left[idx], smp_processor_id()) = left; 1256 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1257 1257
1258 /* 1258 /*
1259 * The hw counter starts counting from this counter offset, 1259 * The hw counter starts counting from this counter offset,
@@ -1470,7 +1470,7 @@ void perf_counter_print_debug(void)
1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1472 1472
1473 prev_left = per_cpu(prev_left[idx], cpu); 1473 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1474 1474
1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1476 cpu, idx, pmc_ctrl); 1476 cpu, idx, pmc_ctrl);
@@ -2110,8 +2110,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2110 entry->ip[entry->nr++] = ip; 2110 entry->ip[entry->nr++] = ip;
2111} 2111}
2112 2112
2113static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 2113static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2114static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 2114static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2115static DEFINE_PER_CPU(int, in_nmi_frame); 2115static DEFINE_PER_CPU(int, in_nmi_frame);
2116 2116
2117 2117
@@ -2264,9 +2264,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2264 struct perf_callchain_entry *entry; 2264 struct perf_callchain_entry *entry;
2265 2265
2266 if (in_nmi()) 2266 if (in_nmi())
2267 entry = &__get_cpu_var(nmi_entry); 2267 entry = &__get_cpu_var(pmc_nmi_entry);
2268 else 2268 else
2269 entry = &__get_cpu_var(irq_entry); 2269 entry = &__get_cpu_var(pmc_irq_entry);
2270 2270
2271 entry->nr = 0; 2271 entry->nr = 0;
2272 2272
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be745107..d59fe323807e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
146END(ftrace_graph_caller) 146END(ftrace_graph_caller)
147 147
148GLOBAL(return_to_handler) 148GLOBAL(return_to_handler)
149 subq $80, %rsp 149 subq $24, %rsp
150 150
151 /* Save the return values */ 151 /* Save the return values */
152 movq %rax, (%rsp) 152 movq %rax, (%rsp)
@@ -155,10 +155,10 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 72(%rsp) 158 movq %rax, 16(%rsp)
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $72, %rsp 161 addq $16, %rsp
162 retq 162 retq
163#endif 163#endif
164 164
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..74656d1d4e30 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
104 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
105# endif 105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j) 109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); 110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
200 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif 201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu); 205 sum += per_cpu(mce_poll_count, cpu);
206#endif 206#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..ccf8ab54f31a 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -190,7 +190,7 @@ static void __init apic_intr_init(void)
190#ifdef CONFIG_X86_MCE_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); 194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif 195#endif
196 196
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d71c8655905b..64b838eac18c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -225,10 +225,8 @@ static __init int iommu_setup(char *p)
225 if (!strncmp(p, "soft", 4)) 225 if (!strncmp(p, "soft", 4))
226 swiotlb = 1; 226 swiotlb = 1;
227#endif 227#endif
228 if (!strncmp(p, "pt", 2)) { 228 if (!strncmp(p, "pt", 2))
229 iommu_pass_through = 1; 229 iommu_pass_through = 1;
230 return 1;
231 }
232 230
233 gart_parse_options(p); 231 gart_parse_options(p);
234 232
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..6c3b2c6fd772 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
508 508
509 pci_read_config_dword(nb_ht, 0x60, &val); 509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev); 511 pci_dev_put(nb_ht);
512} 512}
513 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..27349f92a6d7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/tboot.h>
7#include <acpi/reboot.h> 8#include <acpi/reboot.h>
8#include <asm/io.h> 9#include <asm/io.h>
9#include <asm/apic.h> 10#include <asm/apic.h>
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void)
508 if (reboot_emergency) 509 if (reboot_emergency)
509 emergency_vmx_disable_all(); 510 emergency_vmx_disable_all();
510 511
512 tboot_shutdown(TB_SHUTDOWN_REBOOT);
513
511 /* Tell the BIOS if we want cold or warm reboot */ 514 /* Tell the BIOS if we want cold or warm reboot */
512 *((unsigned short *)__va(0x472)) = reboot_mode; 515 *((unsigned short *)__va(0x472)) = reboot_mode;
513 516
@@ -634,6 +637,8 @@ static void native_machine_halt(void)
634 /* stop other cpus and apics */ 637 /* stop other cpus and apics */
635 machine_shutdown(); 638 machine_shutdown();
636 639
640 tboot_shutdown(TB_SHUTDOWN_HALT);
641
637 /* stop this cpu */ 642 /* stop this cpu */
638 stop_this_cpu(NULL); 643 stop_this_cpu(NULL);
639} 644}
@@ -645,6 +650,8 @@ static void native_machine_power_off(void)
645 machine_shutdown(); 650 machine_shutdown();
646 pm_power_off(); 651 pm_power_off();
647 } 652 }
653 /* a fallback in case there is no PM info available */
654 tboot_shutdown(TB_SHUTDOWN_HALT);
648} 655}
649 656
650struct machine_ops machine_ops = { 657struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..19f15c4076fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -66,6 +66,7 @@
66 66
67#include <linux/percpu.h> 67#include <linux/percpu.h>
68#include <linux/crash_dump.h> 68#include <linux/crash_dump.h>
69#include <linux/tboot.h>
69 70
70#include <video/edid.h> 71#include <video/edid.h>
71 72
@@ -711,6 +712,21 @@ void __init setup_arch(char **cmdline_p)
711 printk(KERN_INFO "Command line: %s\n", boot_command_line); 712 printk(KERN_INFO "Command line: %s\n", boot_command_line);
712#endif 713#endif
713 714
715 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
716 *cmdline_p = command_line;
717
718#ifdef CONFIG_X86_64
719 /*
720 * Must call this twice: Once just to detect whether hardware doesn't
721 * support NX (so that the early EHCI debug console setup can safely
722 * call set_fixmap(), and then again after parsing early parameters to
723 * honor the respective command line option.
724 */
725 check_efer();
726#endif
727
728 parse_early_param();
729
714 /* VMI may relocate the fixmap; do this before touching ioremap area */ 730 /* VMI may relocate the fixmap; do this before touching ioremap area */
715 vmi_init(); 731 vmi_init();
716 732
@@ -793,11 +809,6 @@ void __init setup_arch(char **cmdline_p)
793#endif 809#endif
794#endif 810#endif
795 811
796 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
797 *cmdline_p = command_line;
798
799 parse_early_param();
800
801#ifdef CONFIG_X86_64 812#ifdef CONFIG_X86_64
802 check_efer(); 813 check_efer();
803#endif 814#endif
@@ -977,6 +988,8 @@ void __init setup_arch(char **cmdline_p)
977 paravirt_pagetable_setup_done(swapper_pg_dir); 988 paravirt_pagetable_setup_done(swapper_pg_dir);
978 paravirt_post_allocator_init(); 989 paravirt_post_allocator_init();
979 990
991 tboot_probe();
992
980#ifdef CONFIG_X86_64 993#ifdef CONFIG_X86_64
981 map_vsyscall(); 994 map_vsyscall();
982#endif 995#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 81e58238c4ce..6a44a76055ad 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs)
856void 856void
857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
858{ 858{
859#ifdef CONFIG_X86_NEW_MCE 859#ifdef CONFIG_X86_MCE
860 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
861 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
862 mce_notify_process(); 862 mce_notify_process();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c36cc1452cdc..a25eeec00080 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h>
50 51
51#include <asm/acpi.h> 52#include <asm/acpi.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
@@ -1117,9 +1118,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1117 1118
1118 if (is_uv_system()) 1119 if (is_uv_system())
1119 uv_system_init(); 1120 uv_system_init();
1121
1122 set_mtrr_aps_delayed_init();
1120out: 1123out:
1121 preempt_enable(); 1124 preempt_enable();
1122} 1125}
1126
1127void arch_enable_nonboot_cpus_begin(void)
1128{
1129 set_mtrr_aps_delayed_init();
1130}
1131
1132void arch_enable_nonboot_cpus_end(void)
1133{
1134 mtrr_aps_init();
1135}
1136
1123/* 1137/*
1124 * Early setup to make printk work. 1138 * Early setup to make printk work.
1125 */ 1139 */
@@ -1141,6 +1155,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1141 setup_ioapic_dest(); 1155 setup_ioapic_dest();
1142#endif 1156#endif
1143 check_nmi_watchdog(); 1157 check_nmi_watchdog();
1158 mtrr_aps_init();
1144} 1159}
1145 1160
1146static int __initdata setup_possible_cpus = -1; 1161static int __initdata setup_possible_cpus = -1;
@@ -1318,6 +1333,7 @@ void play_dead_common(void)
1318void native_play_dead(void) 1333void native_play_dead(void)
1319{ 1334{
1320 play_dead_common(); 1335 play_dead_common();
1336 tboot_shutdown(TB_SHUTDOWN_WFS);
1321 wbinvd_halt(); 1337 wbinvd_halt();
1322} 1338}
1323 1339
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
1/*
2 * tboot.c: main implementation of helper functions used by kernel for
3 * runtime support of Intel(R) Trusted Execution Technology
4 *
5 * Copyright (c) 2006-2009, Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 */
21
22#include <linux/dma_remapping.h>
23#include <linux/init_task.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26#include <linux/sched.h>
27#include <linux/init.h>
28#include <linux/dmar.h>
29#include <linux/cpu.h>
30#include <linux/pfn.h>
31#include <linux/mm.h>
32#include <linux/tboot.h>
33
34#include <asm/trampoline.h>
35#include <asm/processor.h>
36#include <asm/bootparam.h>
37#include <asm/pgtable.h>
38#include <asm/pgalloc.h>
39#include <asm/fixmap.h>
40#include <asm/proto.h>
41#include <asm/setup.h>
42#include <asm/e820.h>
43#include <asm/io.h>
44
45#include "acpi/realmode/wakeup.h"
46
47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly;
49
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1
52
53#undef pr_fmt
54#define pr_fmt(fmt) "tboot: " fmt
55
56static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
57
58void __init tboot_probe(void)
59{
60 /* Look for valid page-aligned address for shared page. */
61 if (!boot_params.tboot_addr)
62 return;
63 /*
64 * also verify that it is mapped as we expect it before calling
65 * set_fixmap(), to reduce chance of garbage value causing crash
66 */
67 if (!e820_any_mapped(boot_params.tboot_addr,
68 boot_params.tboot_addr, E820_RESERVED)) {
69 pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
70 return;
71 }
72
73 /* only a natively booted kernel should be using TXT */
74 if (paravirt_enabled()) {
75 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
76 return;
77 }
78
79 /* Map and check for tboot UUID. */
80 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
81 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
82 if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
83 pr_warning("tboot at 0x%llx is invalid\n",
84 boot_params.tboot_addr);
85 tboot = NULL;
86 return;
87 }
88 if (tboot->version < 5) {
89 pr_warning("tboot version is invalid: %u\n", tboot->version);
90 tboot = NULL;
91 return;
92 }
93
94 pr_info("found shared page at phys addr 0x%llx:\n",
95 boot_params.tboot_addr);
96 pr_debug("version: %d\n", tboot->version);
97 pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
98 pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
99 pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
100 pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
101}
102
103static pgd_t *tboot_pg_dir;
104static struct mm_struct tboot_mm = {
105 .mm_rb = RB_ROOT,
106 .pgd = swapper_pg_dir,
107 .mm_users = ATOMIC_INIT(2),
108 .mm_count = ATOMIC_INIT(1),
109 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
110 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
111 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
112 .cpu_vm_mask = CPU_MASK_ALL,
113};
114
115static inline void switch_to_tboot_pt(void)
116{
117 write_cr3(virt_to_phys(tboot_pg_dir));
118}
119
120static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
121 pgprot_t prot)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = pgd_offset(&tboot_mm, vaddr);
129 pud = pud_alloc(&tboot_mm, pgd, vaddr);
130 if (!pud)
131 return -1;
132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
133 if (!pmd)
134 return -1;
135 pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
136 if (!pte)
137 return -1;
138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
139 pte_unmap(pte);
140 return 0;
141}
142
143static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
144 unsigned long nr)
145{
146 /* Reuse the original kernel mapping */
147 tboot_pg_dir = pgd_alloc(&tboot_mm);
148 if (!tboot_pg_dir)
149 return -1;
150
151 for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
152 if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
153 return -1;
154 }
155
156 return 0;
157}
158
159static void tboot_create_trampoline(void)
160{
161 u32 map_base, map_size;
162
163 /* Create identity map for tboot shutdown code. */
164 map_base = PFN_DOWN(tboot->tboot_base);
165 map_size = PFN_UP(tboot->tboot_size);
166 if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
167 panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
168 map_base, map_size);
169}
170
171#ifdef CONFIG_ACPI_SLEEP
172
173static void add_mac_region(phys_addr_t start, unsigned long size)
174{
175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size;
177
178 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE);
181 mr->size = round_up(end, PAGE_SIZE) - mr->start;
182 }
183}
184
185static int tboot_setup_sleep(void)
186{
187 tboot->num_mac_regions = 0;
188
189 /* S3 resume code */
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
191
192#ifdef CONFIG_X86_TRAMPOLINE
193 /* AP trampoline code */
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201
202 return 0;
203}
204
205#else /* no CONFIG_ACPI_SLEEP */
206
207static int tboot_setup_sleep(void)
208{
209 /* S3 shutdown requested, but S3 not supported by the kernel... */
210 BUG();
211 return -1;
212}
213
214#endif
215
216void tboot_shutdown(u32 shutdown_type)
217{
218 void (*shutdown)(void);
219
220 if (!tboot_enabled())
221 return;
222
223 /*
224 * if we're being called before the 1:1 mapping is set up then just
225 * return and let the normal shutdown happen; this should only be
226 * due to very early panic()
227 */
228 if (!tboot_pg_dir)
229 return;
230
231 /* if this is S3 then set regions to MAC */
232 if (shutdown_type == TB_SHUTDOWN_S3)
233 if (tboot_setup_sleep())
234 return;
235
236 tboot->shutdown_type = shutdown_type;
237
238 switch_to_tboot_pt();
239
240 shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
241 shutdown();
242
243 /* should not reach here */
244 while (1)
245 halt();
246}
247
248static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
249{
250#define TB_COPY_GAS(tbg, g) \
251 tbg.space_id = g.space_id; \
252 tbg.bit_width = g.bit_width; \
253 tbg.bit_offset = g.bit_offset; \
254 tbg.access_width = g.access_width; \
255 tbg.address = g.address;
256
257 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
258 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
259 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
260 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
261
262 /*
263 * We need phys addr of waking vector, but can't use virt_to_phys() on
264 * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
265 * addr.
266 */
267 tboot->acpi_sinfo.wakeup_vector = fadt->facs +
268 offsetof(struct acpi_table_facs, firmware_waking_vector);
269}
270
271void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
272{
273 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
274 /* S0,1,2: */ -1, -1, -1,
275 /* S3: */ TB_SHUTDOWN_S3,
276 /* S4: */ TB_SHUTDOWN_S4,
277 /* S5: */ TB_SHUTDOWN_S5 };
278
279 if (!tboot_enabled())
280 return;
281
282 tboot_copy_fadt(&acpi_gbl_FADT);
283 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
284 tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
285 /* we always use the 32b wakeup vector */
286 tboot->acpi_sinfo.vector_width = 32;
287
288 if (sleep_state >= ACPI_S_STATE_COUNT ||
289 acpi_shutdown_map[sleep_state] == -1) {
290 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
291 return;
292 }
293
294 tboot_shutdown(acpi_shutdown_map[sleep_state]);
295}
296
297static atomic_t ap_wfs_count;
298
299static int tboot_wait_for_aps(int num_aps)
300{
301 unsigned long timeout;
302
303 timeout = AP_WAIT_TIMEOUT*HZ;
304 while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
305 timeout) {
306 mdelay(1);
307 timeout--;
308 }
309
310 if (timeout)
311 pr_warning("tboot wait for APs timeout\n");
312
313 return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
314}
315
316static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
317 unsigned long action, void *hcpu)
318{
319 switch (action) {
320 case CPU_DYING:
321 atomic_inc(&ap_wfs_count);
322 if (num_online_cpus() == 1)
323 if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
324 return NOTIFY_BAD;
325 break;
326 }
327 return NOTIFY_OK;
328}
329
330static struct notifier_block tboot_cpu_notifier __cpuinitdata =
331{
332 .notifier_call = tboot_cpu_callback,
333};
334
335static __init int tboot_late_init(void)
336{
337 if (!tboot_enabled())
338 return 0;
339
340 tboot_create_trampoline();
341
342 atomic_set(&ap_wfs_count, 0);
343 register_hotcpu_notifier(&tboot_cpu_notifier);
344 return 0;
345}
346
347late_initcall(tboot_late_init);
348
349/*
350 * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
351 */
352
353#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
354#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
355
356/* # pages for each config regs space - used by fixmap */
357#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
358 TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
359
360/* offsets from pub/priv config space */
361#define TXTCR_HEAP_BASE 0x0300
362#define TXTCR_HEAP_SIZE 0x0308
363
364#define SHA1_SIZE 20
365
366struct sha1_hash {
367 u8 hash[SHA1_SIZE];
368};
369
370struct sinit_mle_data {
371 u32 version; /* currently 6 */
372 struct sha1_hash bios_acm_id;
373 u32 edx_senter_flags;
374 u64 mseg_valid;
375 struct sha1_hash sinit_hash;
376 struct sha1_hash mle_hash;
377 struct sha1_hash stm_hash;
378 struct sha1_hash lcp_policy_hash;
379 u32 lcp_policy_control;
380 u32 rlp_wakeup_addr;
381 u32 reserved;
382 u32 num_mdrs;
383 u32 mdrs_off;
384 u32 num_vtd_dmars;
385 u32 vtd_dmars_off;
386} __packed;
387
388struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
389{
390 void *heap_base, *heap_ptr, *config;
391
392 if (!tboot_enabled())
393 return dmar_tbl;
394
395 /*
396 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
397 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
398 */
399
400 /* map config space in order to get heap addr */
401 config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
402 PAGE_SIZE);
403 if (!config)
404 return NULL;
405
406 /* now map TXT heap */
407 heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
408 *(u64 *)(config + TXTCR_HEAP_SIZE));
409 iounmap(config);
410 if (!heap_base)
411 return NULL;
412
413 /* walk heap to SinitMleData */
414 /* skip BiosData */
415 heap_ptr = heap_base + *(u64 *)heap_base;
416 /* skip OsMleData */
417 heap_ptr += *(u64 *)heap_ptr;
418 /* skip OsSinitData */
419 heap_ptr += *(u64 *)heap_ptr;
420 /* now points to SinitMleDataSize; set to SinitMleData */
421 heap_ptr += sizeof(u64);
422 /* get addr of DMAR table */
423 dmar_tbl = (struct acpi_table_header *)(heap_ptr +
424 ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
425 sizeof(u64));
426
427 /* don't unmap heap because dmar.c needs access to this */
428
429 return dmar_tbl;
430}
431
432int tboot_force_iommu(void)
433{
434 if (!tboot_enabled())
435 return 0;
436
437 if (no_iommu || swiotlb || dmar_disabled)
438 pr_warning("Forcing Intel-IOMMU to enabled\n");
439
440 dmar_disabled = 0;
441#ifdef CONFIG_SWIOTLB
442 swiotlb = 0;
443#endif
444 no_iommu = 0;
445
446 return 1;
447}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c04..0ccb57d5ee35 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,15 +348,12 @@ SECTIONS
348 _end = .; 348 _end = .;
349 } 349 }
350 350
351 /* Sections to be discarded */
352 /DISCARD/ : {
353 *(.exitcall.exit)
354 *(.eh_frame)
355 *(.discard)
356 }
357
358 STABS_DEBUG 351 STABS_DEBUG
359 DWARF_DEBUG 352 DWARF_DEBUG
353
354 /* Sections to be discarded */
355 DISCARDS
356 /DISCARD/ : { *(.eh_frame) }
360} 357}
361 358
362 359