aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/cacheflush.h53
-rw-r--r--arch/x86/include/asm/i387.h8
-rw-r--r--arch/x86/include/asm/io.h3
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/mmzone_32.h43
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/percpu.h8
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/xen/page.h1
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/irq_32.c29
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup_percpu.c367
-rw-r--r--arch/x86/math-emu/fpu_aux.c31
-rw-r--r--arch/x86/mm/init_32.c17
-rw-r--r--arch/x86/mm/init_64.c72
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/mmu.c7
-rw-r--r--arch/x86/xen/smp.c6
30 files changed, 1242 insertions, 313 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf81..31758378bcd2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
138config HAVE_SETUP_PER_CPU_AREA 138config HAVE_SETUP_PER_CPU_AREA
139 def_bool y 139 def_bool y
140 140
141config HAVE_DYNAMIC_PER_CPU_AREA
142 def_bool y
143
141config HAVE_CPUMASK_OF_CPU_MAP 144config HAVE_CPUMASK_OF_CPU_MAP
142 def_bool X86_64_SMP 145 def_bool X86_64_SMP
143 146
@@ -780,6 +783,11 @@ config X86_MCE_AMD
780 Additional support for AMD specific MCE features such as 783 Additional support for AMD specific MCE features such as
781 the DRAM Error Threshold. 784 the DRAM Error Threshold.
782 785
786config X86_MCE_THRESHOLD
787 depends on X86_MCE_AMD || X86_MCE_INTEL
788 bool
789 default y
790
783config X86_MCE_NONFATAL 791config X86_MCE_NONFATAL
784 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 792 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
785 depends on X86_32 && X86_MCE 793 depends on X86_32 && X86_MCE
@@ -1125,7 +1133,7 @@ config NODES_SHIFT
1125 Specify the maximum number of NUMA Nodes available on the target 1133 Specify the maximum number of NUMA Nodes available on the target
1126 system. Increases memory reserved to accomodate various tables. 1134 system. Increases memory reserved to accomodate various tables.
1127 1135
1128config HAVE_ARCH_BOOTMEM_NODE 1136config HAVE_ARCH_BOOTMEM
1129 def_bool y 1137 def_bool y
1130 depends on X86_32 && NUMA 1138 depends on X86_32 && NUMA
1131 1139
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb5..5b301b7ff5f4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6 6
7/* Caches aren't brain-dead on the intel. */ 7/* Caches aren't brain-dead on the intel. */
8#define flush_cache_all() do { } while (0) 8static inline void flush_cache_all(void) { }
9#define flush_cache_mm(mm) do { } while (0) 9static inline void flush_cache_mm(struct mm_struct *mm) { }
10#define flush_cache_dup_mm(mm) do { } while (0) 10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11#define flush_cache_range(vma, start, end) do { } while (0) 11static inline void flush_cache_range(struct vm_area_struct *vma,
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 12 unsigned long start, unsigned long end) { }
13#define flush_dcache_page(page) do { } while (0) 13static inline void flush_cache_page(struct vm_area_struct *vma,
14#define flush_dcache_mmap_lock(mapping) do { } while (0) 14 unsigned long vmaddr, unsigned long pfn) { }
15#define flush_dcache_mmap_unlock(mapping) do { } while (0) 15static inline void flush_dcache_page(struct page *page) { }
16#define flush_icache_range(start, end) do { } while (0) 16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17#define flush_icache_page(vma, pg) do { } while (0) 17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) 18static inline void flush_icache_range(unsigned long start,
19#define flush_cache_vmap(start, end) do { } while (0) 19 unsigned long end) { }
20#define flush_cache_vunmap(start, end) do { } while (0) 20static inline void flush_icache_page(struct vm_area_struct *vma,
21 struct page *page) { }
22static inline void flush_icache_user_range(struct vm_area_struct *vma,
23 struct page *page,
24 unsigned long addr,
25 unsigned long len) { }
26static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
27static inline void flush_cache_vunmap(unsigned long start,
28 unsigned long end) { }
21 29
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ 30static inline void copy_to_user_page(struct vm_area_struct *vma,
23 memcpy((dst), (src), (len)) 31 struct page *page, unsigned long vaddr,
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ 32 void *dst, const void *src,
25 memcpy((dst), (src), (len)) 33 unsigned long len)
34{
35 memcpy(dst, src, len);
36}
37
38static inline void copy_from_user_page(struct vm_area_struct *vma,
39 struct page *page, unsigned long vaddr,
40 void *dst, const void *src,
41 unsigned long len)
42{
43 memcpy(dst, src, len);
44}
26 45
27#define PG_non_WB PG_arch_1 46#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(NonWB, non_WB)
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c9..71c9e5183982 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
172 172
173#else /* CONFIG_X86_32 */ 173#else /* CONFIG_X86_32 */
174 174
175extern void finit(void); 175#ifdef CONFIG_MATH_EMULATION
176extern void finit_task(struct task_struct *tsk);
177#else
178static inline void finit_task(struct task_struct *tsk)
179{
180}
181#endif
176 182
177static inline void tolerant_fwait(void) 183static inline void tolerant_fwait(void)
178{ 184{
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00fc..e5383e3d2f8c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
172 172
173extern void iounmap(volatile void __iomem *addr); 173extern void iounmap(volatile void __iomem *addr);
174 174
175extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
176
177 175
178#ifdef CONFIG_X86_32 176#ifdef CONFIG_X86_32
179# include "io_32.h" 177# include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
198extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); 196extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
199extern void __iomem *early_memremap(unsigned long offset, unsigned long size); 197extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
200extern void early_iounmap(void __iomem *addr, unsigned long size); 198extern void early_iounmap(void __iomem *addr, unsigned long size);
201extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
202 199
203#define IO_SPACE_LIMIT 0xffff 200#define IO_SPACE_LIMIT 0xffff
204 201
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 would need to escape the predefined
102 * Linux defined extended banks first.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..ede6998bd92c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform. 96 (NODE_DATA(0)->bdata)
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \
130 struct pglist_data __maybe_unused \
131 *__alloc_bootmem_node__pgdat = (pgdat); \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
133})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 97#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 98
136#endif /* _ASM_X86_MMZONE_32_H */ 99#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..2dbd2314139e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
77#define MSR_IA32_MC0_ADDR 0x00000402 77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403 78#define MSR_IA32_MC0_MISC 0x00000403
79 79
80/* These are consecutive and not in the normal 4er MCE bank block */
81#define MSR_IA32_MC0_CTL2 0x00000280
82#define CMCI_EN (1ULL << 30)
83#define CMCI_THRESHOLD_MASK 0xffffULL
84
80#define MSR_P6_PERFCTR0 0x000000c1 85#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2 86#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186 87#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d01..8f1d2fbec1d4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
43#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
44 44
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46#include <asm/sections.h>
47
48#define __addr_to_pcpu_ptr(addr) \
49 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
50 + (unsigned long)__per_cpu_start)
51#define __pcpu_ptr_to_addr(ptr) \
52 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
53 - (unsigned long)__per_cpu_start)
46 54
47#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 56#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a6669..d0812e155f1d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
288 return 1; 288 return 1;
289} 289}
290 290
291pmd_t *populate_extra_pmd(unsigned long vaddr);
292pte_t *populate_extra_pte(unsigned long vaddr);
291#endif /* __ASSEMBLY__ */ 293#endif /* __ASSEMBLY__ */
292 294
293#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
164 164
165 165
166xmaddr_t arbitrary_virt_to_machine(void *address); 166xmaddr_t arbitrary_virt_to_machine(void *address);
167unsigned long arbitrary_virt_to_mfn(void *vaddr);
167void make_lowmem_page_readonly(void *vaddr); 168void make_lowmem_page_readonly(void *vaddr);
168void make_lowmem_page_readwrite(void *vaddr); 169void make_lowmem_page_readwrite(void *vaddr);
169 170
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..4c80f1557433 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 414 that might execute the to be patched code.
415 Other CPUs are not running. */ 415 Other CPUs are not running. */
416 stop_nmi(); 416 stop_nmi();
417#ifdef CONFIG_X86_MCE 417
418 stop_mce(); 418 /*
419#endif 419 * Don't stop machine check exceptions while patching.
420 * MCEs only happen when something got corrupted and in this
421 * case we must do something about the corruption.
422 * Ignoring it is worse than a unlikely patching race.
423 * Also machine checks tend to be broadcast and if one CPU
424 * goes into machine check the others follow quickly, so we don't
425 * expect a machine check to cause undue problems during to code
426 * patching.
427 */
420 428
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 429 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 430
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 464 (unsigned long)__smp_locks_end);
457 465
458 restart_nmi(); 466 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 467}
463 468
464/** 469/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c5..30909a258d0f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/mtrr.h> 47#include <asm/mtrr.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/mce.h>
49 50
50unsigned int num_processors; 51unsigned int num_processors;
51 52
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
842 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
843 } 844 }
844#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
845 /* 854 /*
846 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
847 */ 856 */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1241 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1242 1251
1243 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1244} 1259}
1245 1260
1246void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..22590cf688ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 601 if (!data)
602 return -ENOMEM; 602 return -ENOMEM;
603 603
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 604 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 605 per_cpu(drv_data, cpu) = data;
606 606
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..bfbd5323a635 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
316 * Non uncorrected errors are handled by machine_check_poll
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies_relative(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
848 cpu_set(cpu, mce_device_initialized); 1024 cpu_set(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpu_clear(cpu, mce_device_initialized);
873} 1057}
874 1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
876static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies_relative(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 err = mce_init_banks();
1166 if (err)
1167 return err;
1168
909 err = sysdev_class_register(&mce_sysclass); 1169 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1170 if (err)
911 return err; 1171 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd469..c5a32f92d07e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e0..aaa7d9730938 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
@@ -13,6 +15,7 @@
13#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
16 19
17asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
18{ 21{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
25 28
26 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
27 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
28 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
29 32
30 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
31 irq_exit(); 34 irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 return; 88 return;
86} 89}
87 90
91/*
92 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down cycle through all the others and rediscover
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static __cpuinit void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
88void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
89{ 293{
90 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
91} 296}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0a..f2f8540a7f3d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
136#ifdef CONFIG_X86_32 136#ifdef CONFIG_X86_32
137 if (!HAVE_HWFP) { 137 if (!HAVE_HWFP) {
138 memset(tsk->thread.xstate, 0, xstate_size); 138 memset(tsk->thread.xstate, 0, xstate_size);
139 finit(); 139 finit_task(tsk);
140 set_stopped_child_used_math(tsk); 140 set_stopped_child_used_math(tsk);
141 return 0; 141 return 0;
142 } 142 }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b24275..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bbb..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), 216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
217 }, 217 },
218 }, 218 },
219 { /* Handle problems with rebooting on Dell XPS710 */
220 .callback = set_bios_reboot,
221 .ident = "Dell XPS710",
222 .matches = {
223 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
224 DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
225 },
226 },
219 { } 227 { }
220}; 228};
221 229
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff730..c29f301d3885 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
41}; 42};
42EXPORT_SYMBOL(__per_cpu_offset); 43EXPORT_SYMBOL(__per_cpu_offset);
43 44
45/**
46 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
47 *
48 * If NUMA is not configured or there is only one NUMA node available,
49 * there is no reason to consider NUMA. This function determines
50 * whether percpu allocation should consider NUMA or not.
51 *
52 * RETURNS:
53 * true if NUMA should be considered; otherwise, false.
54 */
55static bool __init pcpu_need_numa(void)
56{
57#ifdef CONFIG_NEED_MULTIPLE_NODES
58 pg_data_t *last = NULL;
59 unsigned int cpu;
60
61 for_each_possible_cpu(cpu) {
62 int node = early_cpu_to_node(cpu);
63
64 if (node_online(node) && NODE_DATA(node) &&
65 last && last != NODE_DATA(node))
66 return true;
67
68 last = NODE_DATA(node);
69 }
70#endif
71 return false;
72}
73
74/**
75 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
76 * @cpu: cpu to allocate for
77 * @size: size allocation in bytes
78 * @align: alignment
79 *
80 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
81 * does the right thing for NUMA regardless of the current
82 * configuration.
83 *
84 * RETURNS:
85 * Pointer to the allocated area on success, NULL on failure.
86 */
87static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
88 unsigned long align)
89{
90 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
91#ifdef CONFIG_NEED_MULTIPLE_NODES
92 int node = early_cpu_to_node(cpu);
93 void *ptr;
94
95 if (!node_online(node) || !NODE_DATA(node)) {
96 ptr = __alloc_bootmem_nopanic(size, align, goal);
97 pr_info("cpu %d has no node %d or node-local memory\n",
98 cpu, node);
99 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
100 cpu, size, __pa(ptr));
101 } else {
102 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
103 size, align, goal);
104 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
105 "%016lx\n", cpu, size, node, __pa(ptr));
106 }
107 return ptr;
108#else
109 return __alloc_bootmem_nopanic(size, align, goal);
110#endif
111}
112
113/*
114 * Remap allocator
115 *
116 * This allocator uses PMD page as unit. A PMD page is allocated for
117 * each cpu and each is remapped into vmalloc area using PMD mapping.
118 * As PMD page is quite large, only part of it is used for the first
119 * chunk. Unused part is returned to the bootmem allocator.
120 *
121 * So, the PMD pages are mapped twice - once to the physical mapping
122 * and to the vmalloc area for the first percpu chunk. The double
123 * mapping does add one more PMD TLB entry pressure but still is much
124 * better than only using 4k mappings while still being NUMA friendly.
125 */
126#ifdef CONFIG_NEED_MULTIPLE_NODES
127static size_t pcpur_size __initdata;
128static void **pcpur_ptrs __initdata;
129
130static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
131{
132 size_t off = (size_t)pageno << PAGE_SHIFT;
133
134 if (off >= pcpur_size)
135 return NULL;
136
137 return virt_to_page(pcpur_ptrs[cpu] + off);
138}
139
140static ssize_t __init setup_pcpu_remap(size_t static_size)
141{
142 static struct vm_struct vm;
143 pg_data_t *last;
144 size_t ptrs_size;
145 unsigned int cpu;
146 ssize_t ret;
147
148 /*
149 * If large page isn't supported, there's no benefit in doing
150 * this. Also, on non-NUMA, embedding is better.
151 */
152 if (!cpu_has_pse || pcpu_need_numa())
153 return -EINVAL;
154
155 last = NULL;
156 for_each_possible_cpu(cpu) {
157 int node = early_cpu_to_node(cpu);
158
159 if (node_online(node) && NODE_DATA(node) &&
160 last && last != NODE_DATA(node))
161 goto proceed;
162
163 last = NODE_DATA(node);
164 }
165 return -EINVAL;
166
167proceed:
168 /*
169 * Currently supports only single page. Supporting multiple
170 * pages won't be too difficult if it ever becomes necessary.
171 */
172 pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178
179 /* allocate pointer array and alloc large pages */
180 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
181 pcpur_ptrs = alloc_bootmem(ptrs_size);
182
183 for_each_possible_cpu(cpu) {
184 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
185 if (!pcpur_ptrs[cpu])
186 goto enomem;
187
188 /*
189 * Only use pcpur_size bytes and give back the rest.
190 *
191 * Ingo: The 2MB up-rounding bootmem is needed to make
192 * sure the partial 2MB page is still fully RAM - it's
193 * not well-specified to have a PAT-incompatible area
194 * (unmapped RAM, device memory, etc.) in that hole.
195 */
196 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
197 PMD_SIZE - pcpur_size);
198
199 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
200 }
201
202 /* allocate address and map */
203 vm.flags = VM_ALLOC;
204 vm.size = num_possible_cpus() * PMD_SIZE;
205 vm_area_register_early(&vm, PMD_SIZE);
206
207 for_each_possible_cpu(cpu) {
208 pmd_t *pmd;
209
210 pmd = populate_extra_pmd((unsigned long)vm.addr
211 + cpu * PMD_SIZE);
212 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
213 PAGE_KERNEL_LARGE));
214 }
215
216 /* we're ready, commit */
217 pr_info("PERCPU: Remapped at %p with large pages, static data "
218 "%zu bytes\n", vm.addr, static_size);
219
220 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
221 pcpur_size - static_size, vm.addr, NULL);
222 goto out_free_ar;
223
224enomem:
225 for_each_possible_cpu(cpu)
226 if (pcpur_ptrs[cpu])
227 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
228 ret = -ENOMEM;
229out_free_ar:
230 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
231 return ret;
232}
233#else
234static ssize_t __init setup_pcpu_remap(size_t static_size)
235{
236 return -EINVAL;
237}
238#endif
239
240/*
241 * Embedding allocator
242 *
243 * The first chunk is sized to just contain the static area plus
244 * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
245 * bootmem allocator and used as-is without being mapped into vmalloc
246 * area. This enables the first chunk to piggy back on the linear
247 * physical PMD mapping and doesn't add any additional pressure to
248 * TLB.
249 */
250static void *pcpue_ptr __initdata;
251static size_t pcpue_unit_size __initdata;
252
253static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
254{
255 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
256 + ((size_t)pageno << PAGE_SHIFT));
257}
258
259static ssize_t __init setup_pcpu_embed(size_t static_size)
260{
261 unsigned int cpu;
262
263 /*
264 * If large page isn't supported, there's no benefit in doing
265 * this. Also, embedding allocation doesn't play well with
266 * NUMA.
267 */
268 if (!cpu_has_pse || pcpu_need_numa())
269 return -EINVAL;
270
271 /* allocate and copy */
272 pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
273 pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
274 pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
275 PAGE_SIZE);
276 if (!pcpue_ptr)
277 return -ENOMEM;
278
279 for_each_possible_cpu(cpu)
280 memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
281 static_size);
282
283 /* we're ready, commit */
284 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
285 pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
286
287 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
288 pcpue_unit_size,
289 pcpue_unit_size - static_size, pcpue_ptr,
290 NULL);
291}
292
293/*
294 * 4k page allocator
295 *
296 * This is the basic allocator. Static percpu area is allocated
297 * page-by-page and most of initialization is done by the generic
298 * setup function.
299 */
300static struct page **pcpu4k_pages __initdata;
301static int pcpu4k_nr_static_pages __initdata;
302
303static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
304{
305 if (pageno < pcpu4k_nr_static_pages)
306 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
307 return NULL;
308}
309
310static void __init pcpu4k_populate_pte(unsigned long addr)
311{
312 populate_extra_pte(addr);
313}
314
315static ssize_t __init setup_pcpu_4k(size_t static_size)
316{
317 size_t pages_size;
318 unsigned int cpu;
319 int i, j;
320 ssize_t ret;
321
322 pcpu4k_nr_static_pages = PFN_UP(static_size);
323
324 /* unaligned allocations can't be freed, round up to page size */
325 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
326 * sizeof(pcpu4k_pages[0]));
327 pcpu4k_pages = alloc_bootmem(pages_size);
328
329 /* allocate and copy */
330 j = 0;
331 for_each_possible_cpu(cpu)
332 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
333 void *ptr;
334
335 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
336 if (!ptr)
337 goto enomem;
338
339 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
340 pcpu4k_pages[j++] = virt_to_page(ptr);
341 }
342
343 /* we're ready, commit */
344 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
345 pcpu4k_nr_static_pages, static_size);
346
347 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
348 pcpu4k_populate_pte);
349 goto out_free_ar;
350
351enomem:
352 while (--j >= 0)
353 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
354 ret = -ENOMEM;
355out_free_ar:
356 free_bootmem(__pa(pcpu4k_pages), pages_size);
357 return ret;
358}
359
44static inline void setup_percpu_segment(int cpu) 360static inline void setup_percpu_segment(int cpu)
45{ 361{
46#ifdef CONFIG_X86_32 362#ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
61 */ 377 */
62void __init setup_per_cpu_areas(void) 378void __init setup_per_cpu_areas(void)
63{ 379{
64 ssize_t size; 380 size_t static_size = __per_cpu_end - __per_cpu_start;
65 char *ptr; 381 unsigned int cpu;
66 int cpu; 382 unsigned long delta;
67 383 size_t pcpu_unit_size;
68 /* Copy section for each CPU (we discard the original) */ 384 ssize_t ret;
69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
70 385
71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 386 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 387 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
73 388
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 389 /*
390 * Allocate percpu area. If PSE is supported, try to make use
391 * of large page mappings. Please read comments on top of
392 * each allocator for details.
393 */
394 ret = setup_pcpu_remap(static_size);
395 if (ret < 0)
396 ret = setup_pcpu_embed(static_size);
397 if (ret < 0)
398 ret = setup_pcpu_4k(static_size);
399 if (ret < 0)
400 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
401 static_size, ret);
75 402
76 for_each_possible_cpu(cpu) { 403 pcpu_unit_size = ret;
77#ifndef CONFIG_NEED_MULTIPLE_NODES
78 ptr = alloc_bootmem_pages(size);
79#else
80 int node = early_cpu_to_node(cpu);
81 if (!node_online(node) || !NODE_DATA(node)) {
82 ptr = alloc_bootmem_pages(size);
83 pr_info("cpu %d has no node %d or node-local memory\n",
84 cpu, node);
85 pr_debug("per cpu data for cpu%d at %016lx\n",
86 cpu, __pa(ptr));
87 } else {
88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
90 cpu, node, __pa(ptr));
91 }
92#endif
93 404
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); 405 /* alrighty, percpu areas up and running */
95 per_cpu_offset(cpu) = ptr - __per_cpu_start; 406 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
407 for_each_possible_cpu(cpu) {
408 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 409 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu; 410 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu); 411 setup_percpu_segment(cpu);
@@ -125,8 +438,6 @@ void __init setup_per_cpu_areas(void)
125 */ 438 */
126 if (cpu == boot_cpu_id) 439 if (cpu == boot_cpu_id)
127 switch_to_new_gdt(cpu); 440 switch_to_new_gdt(cpu);
128
129 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
130 } 441 }
131 442
132 /* indicate the early static arrays will soon be gone */ 443 /* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce547..aa0987088774 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit_task(struct task_struct *tsk)
34{ 34{
35 control_word = 0x037f; 35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 partial_status = 0; 36 struct address *oaddr, *iaddr;
37 top = 0; /* We don't keep top in the status word internally. */ 37 soft->cwd = 0x037f;
38 fpu_tag_word = 0xffff; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */
40 soft->twd = 0xffff;
39 /* The behaviour is different from that detailed in 41 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 42 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 43 oaddr = (struct address *)&soft->foo;
42 operand_address.selector = 0; 44 oaddr->offset = 0;
43 instruction_address.offset = 0; 45 oaddr->selector = 0;
44 instruction_address.selector = 0; 46 iaddr = (struct address *)&soft->fip;
45 instruction_address.opcode = 0; 47 iaddr->offset = 0;
46 no_ip_update = 1; 48 iaddr->selector = 0;
49 iaddr->opcode = 0;
50 soft->no_update = 1;
51}
52
53void finit(void)
54{
55 finit_task(current);
47} 56}
48 57
49/* 58/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d57dfffb0213..2966c6b8d304 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -131,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
131 return pte_offset_kernel(pmd, 0); 131 return pte_offset_kernel(pmd, 0);
132} 132}
133 133
134pmd_t * __init populate_extra_pmd(unsigned long vaddr)
135{
136 int pgd_idx = pgd_index(vaddr);
137 int pmd_idx = pmd_index(vaddr);
138
139 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
140}
141
142pte_t * __init populate_extra_pte(unsigned long vaddr)
143{
144 int pte_idx = pte_index(vaddr);
145 pmd_t *pmd;
146
147 pmd = populate_extra_pmd(vaddr);
148 return one_page_table_init(pmd) + pte_idx;
149}
150
134static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 151static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
135 unsigned long vaddr, pte_t *lastpte) 152 unsigned long vaddr, pte_t *lastpte)
136{ 153{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7dd7ce49d69b..8a853bc3b287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -161,34 +161,51 @@ static __ref void *spp_getpage(void)
161 return ptr; 161 return ptr;
162} 162}
163 163
164void 164static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
165set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
166{ 165{
167 pud_t *pud; 166 if (pgd_none(*pgd)) {
168 pmd_t *pmd; 167 pud_t *pud = (pud_t *)spp_getpage();
169 pte_t *pte; 168 pgd_populate(&init_mm, pgd, pud);
169 if (pud != pud_offset(pgd, 0))
170 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
171 pud, pud_offset(pgd, 0));
172 }
173 return pud_offset(pgd, vaddr);
174}
170 175
171 pud = pud_page + pud_index(vaddr); 176static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
177{
172 if (pud_none(*pud)) { 178 if (pud_none(*pud)) {
173 pmd = (pmd_t *) spp_getpage(); 179 pmd_t *pmd = (pmd_t *) spp_getpage();
174 pud_populate(&init_mm, pud, pmd); 180 pud_populate(&init_mm, pud, pmd);
175 if (pmd != pmd_offset(pud, 0)) { 181 if (pmd != pmd_offset(pud, 0))
176 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 182 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
177 pmd, pmd_offset(pud, 0)); 183 pmd, pmd_offset(pud, 0));
178 return;
179 }
180 } 184 }
181 pmd = pmd_offset(pud, vaddr); 185 return pmd_offset(pud, vaddr);
186}
187
188static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
189{
182 if (pmd_none(*pmd)) { 190 if (pmd_none(*pmd)) {
183 pte = (pte_t *) spp_getpage(); 191 pte_t *pte = (pte_t *) spp_getpage();
184 pmd_populate_kernel(&init_mm, pmd, pte); 192 pmd_populate_kernel(&init_mm, pmd, pte);
185 if (pte != pte_offset_kernel(pmd, 0)) { 193 if (pte != pte_offset_kernel(pmd, 0))
186 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 194 printk(KERN_ERR "PAGETABLE BUG #02!\n");
187 return;
188 }
189 } 195 }
196 return pte_offset_kernel(pmd, vaddr);
197}
198
199void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
200{
201 pud_t *pud;
202 pmd_t *pmd;
203 pte_t *pte;
204
205 pud = pud_page + pud_index(vaddr);
206 pmd = fill_pmd(pud, vaddr);
207 pte = fill_pte(pmd, vaddr);
190 208
191 pte = pte_offset_kernel(pmd, vaddr);
192 set_pte(pte, new_pte); 209 set_pte(pte, new_pte);
193 210
194 /* 211 /*
@@ -198,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
198 __flush_tlb_one(vaddr); 215 __flush_tlb_one(vaddr);
199} 216}
200 217
201void 218void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
202set_pte_vaddr(unsigned long vaddr, pte_t pteval)
203{ 219{
204 pgd_t *pgd; 220 pgd_t *pgd;
205 pud_t *pud_page; 221 pud_t *pud_page;
@@ -216,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
216 set_pte_vaddr_pud(pud_page, vaddr, pteval); 232 set_pte_vaddr_pud(pud_page, vaddr, pteval);
217} 233}
218 234
235pmd_t * __init populate_extra_pmd(unsigned long vaddr)
236{
237 pgd_t *pgd;
238 pud_t *pud;
239
240 pgd = pgd_offset_k(vaddr);
241 pud = fill_pud(pgd, vaddr);
242 return fill_pmd(pud, vaddr);
243}
244
245pte_t * __init populate_extra_pte(unsigned long vaddr)
246{
247 pmd_t *pmd;
248
249 pmd = populate_extra_pmd(vaddr);
250 return fill_pte(pmd, vaddr);
251}
252
219/* 253/*
220 * Create large page table mappings for a range of physical addresses. 254 * Create large page table mappings for a range of physical addresses.
221 */ 255 */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7fd..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
103 103
104 vcpup = &per_cpu(xen_vcpu_info, cpu); 104 vcpup = &per_cpu(xen_vcpu_info, cpu);
105 105
106 info.mfn = virt_to_mfn(vcpup); 106 info.mfn = arbitrary_virt_to_mfn(vcpup);
107 info.offset = offset_in_page(vcpup); 107 info.offset = offset_in_page(vcpup);
108 108
109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", 109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
301 frames = mcs.args; 301 frames = mcs.args;
302 302
303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
304 frames[f] = virt_to_mfn(va); 304 frames[f] = arbitrary_virt_to_mfn((void *)va);
305
305 make_lowmem_page_readonly((void *)va); 306 make_lowmem_page_readonly((void *)va);
307 make_lowmem_page_readonly(mfn_to_virt(frames[f]));
306 } 308 }
307 309
308 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); 310 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
314 unsigned int cpu, unsigned int i) 316 unsigned int cpu, unsigned int i)
315{ 317{
316 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 318 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
317 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 319 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
318 struct multicall_space mc = __xen_mc_entry(0); 320 struct multicall_space mc = __xen_mc_entry(0);
319 321
320 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 322 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
488 break; 490 break;
489 491
490 default: { 492 default: {
491 xmaddr_t maddr = virt_to_machine(&dt[entry]); 493 xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
492 494
493 xen_mc_flush(); 495 xen_mc_flush();
494 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 496 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c2..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
276 p2m_top[topidx][idx] = mfn; 276 p2m_top[topidx][idx] = mfn;
277} 277}
278 278
279unsigned long arbitrary_virt_to_mfn(void *vaddr)
280{
281 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
282
283 return PFN_DOWN(maddr.maddr);
284}
285
279xmaddr_t arbitrary_virt_to_machine(void *vaddr) 286xmaddr_t arbitrary_virt_to_machine(void *vaddr)
280{ 287{
281 unsigned long address = (unsigned long)vaddr; 288 unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815d..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 219{
220 struct vcpu_guest_context *ctxt; 220 struct vcpu_guest_context *ctxt;
221 struct desc_struct *gdt; 221 struct desc_struct *gdt;
222 unsigned long gdt_mfn;
222 223
223 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 224 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
224 return 0; 225 return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
248 ctxt->ldt_ents = 0; 249 ctxt->ldt_ents = 0;
249 250
250 BUG_ON((unsigned long)gdt & ~PAGE_MASK); 251 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
252
253 gdt_mfn = arbitrary_virt_to_mfn(gdt);
251 make_lowmem_page_readonly(gdt); 254 make_lowmem_page_readonly(gdt);
255 make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
252 256
253 ctxt->gdt_frames[0] = virt_to_mfn(gdt); 257 ctxt->gdt_frames[0] = gdt_mfn;
254 ctxt->gdt_ents = GDT_ENTRIES; 258 ctxt->gdt_ents = GDT_ENTRIES;
255 259
256 ctxt->user_regs.cs = __KERNEL_CS; 260 ctxt->user_regs.cs = __KERNEL_CS;