Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig.cpu                       |   17
-rw-r--r--  arch/x86/include/asm/apic.h                |   13
-rwxr-xr-x  arch/x86/include/asm/cpu_debug.h           |   33
-rw-r--r--  arch/x86/include/asm/dmi.h                 |   14
-rw-r--r--  arch/x86/include/asm/io_apic.h             |    5
-rw-r--r--  arch/x86/include/asm/irq_remapping.h       |    2
-rw-r--r--  arch/x86/include/asm/msidef.h              |    1
-rw-r--r--  arch/x86/include/asm/page_32_types.h       |    5
-rw-r--r--  arch/x86/include/asm/paravirt.h            |   19
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h      |    7
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h      |   17
-rw-r--r--  arch/x86/include/asm/pgtable.h             |    2
-rw-r--r--  arch/x86/include/asm/pgtable_32.h          |    3
-rw-r--r--  arch/x86/include/asm/sections.h            |    7
-rw-r--r--  arch/x86/include/asm/setup.h               |   37
-rw-r--r--  arch/x86/kernel/apic/apic.c                |   20
-rw-r--r--  arch/x86/kernel/apic/io_apic.c             |  268
-rw-r--r--  arch/x86/kernel/apic/probe_64.c            |    7
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c      |    6
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c         |    6
-rw-r--r--  arch/x86/kernel/check.c                    |    6
-rw-r--r--  arch/x86/kernel/cpu/Makefile               |    3
-rw-r--r--  arch/x86/kernel/cpu/centaur.c              |   34
-rw-r--r--  arch/x86/kernel/cpu/centaur_64.c           |   37
-rwxr-xr-x  arch/x86/kernel/cpu/cpu_debug.c            |  150
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c    |   40
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c  |    2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile          |    2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c         | 1101
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c         |  202
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c            | 1069
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h            |    4
-rw-r--r--  arch/x86/kernel/e820.c                     |   79
-rw-r--r--  arch/x86/kernel/head32.c                   |    5
-rw-r--r--  arch/x86/kernel/head64.c                   |    2
-rw-r--r--  arch/x86/kernel/head_32.S                  |   76
-rw-r--r--  arch/x86/kernel/kprobes.c                  |    3
-rw-r--r--  arch/x86/kernel/kvm.c                      |    7
-rw-r--r--  arch/x86/kernel/mpparse.c                  |  261
-rw-r--r--  arch/x86/kernel/paravirt.c                 |    1
-rw-r--r--  arch/x86/kernel/process.c                  |    5
-rw-r--r--  arch/x86/kernel/setup.c                    |   54
-rw-r--r--  arch/x86/kernel/tlb_uv.c                   |    3
-rw-r--r--  arch/x86/kernel/tsc.c                      |  110
-rw-r--r--  arch/x86/kernel/vmi_32.c                   |    6
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S           |   21
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S           |   94
-rw-r--r--  arch/x86/lguest/boot.c                     |    8
-rw-r--r--  arch/x86/mm/iomap_32.c                     |    1
-rw-r--r--  arch/x86/mm/pageattr.c                     |    5
-rw-r--r--  arch/x86/mm/pgtable_32.c                   |    2
-rw-r--r--  arch/x86/mm/tlb.c                          |    5
-rw-r--r--  arch/x86/xen/mmu.c                         |    7
53 files changed, 2063 insertions, 1831 deletions
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582a..924e156a85ab 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
456 456
457 If unsure, say N. 457 If unsure, say N.
458 458
459config CPU_SUP_CENTAUR_32 459config CPU_SUP_CENTAUR
460 default y 460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT 461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 ---help---
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 ---help--- 462 ---help---
478 This enables detection, tunings and quirks for Centaur processors 463 This enables detection, tunings and quirks for Centaur processors
479 464
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 394d177d721b..00f5962d82d0 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -108,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#ifdef CONFIG_X86_X2APIC 110#ifdef CONFIG_X86_X2APIC
111/*
112 * Make previous memory operations globally visible before
113 * sending the IPI through x2apic wrmsr. We need a serializing instruction or
114 * mfence for this.
115 */
116static inline void x2apic_wrmsr_fence(void)
117{
118 asm volatile("mfence" : : : "memory");
119}
120
111static inline void native_apic_msr_write(u32 reg, u32 v) 121static inline void native_apic_msr_write(u32 reg, u32 v)
112{ 122{
113 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || 123 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -184,6 +194,9 @@ static inline int x2apic_enabled(void)
184{ 194{
185 return 0; 195 return 0;
186} 196}
197
198#define x2apic 0
199
187#endif 200#endif
188 201
189extern int get_physical_broadcast(void); 202extern int get_physical_broadcast(void);
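
Note: x2apic_wrmsr_fence() exists because x2apic IPIs are sent via wrmsr, and writes to the x2apic MSR range, unlike the uncached MMIO writes of xapic mode, are not ordered against earlier stores; the mfence makes prior memory operations globally visible before the ICR write. A minimal sketch of the intended call pattern, borrowing the __x2apic_send_IPI_dest()/x86_cpu_to_apicid names from the x2apic_phys callers later in this diff (the wrapper name is illustrative only):

	static void x2apic_send_ipi_sketch(int cpu, int vector)
	{
		/* order all prior stores before the MSR-based IPI */
		x2apic_wrmsr_fence();
		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, cpu),
				       vector, APIC_DEST_PHYSICAL);
	}
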
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 56f1635e4617..222802029fa6 100755
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -33,6 +33,8 @@ enum cpu_debug_bit {
33 CPU_VMX_BIT, /* VMX */ 33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */ 34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */ 35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
36 CPU_SMM_BIT, /* System mgmt mode */ 38 CPU_SMM_BIT, /* System mgmt mode */
37 CPU_SVM_BIT, /*Secure Virtual Machine*/ 39 CPU_SVM_BIT, /*Secure Virtual Machine*/
38 CPU_OSVM_BIT, /* OS-Visible Workaround*/ 40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
@@ -69,6 +71,8 @@ enum cpu_debug_bit {
69#define CPU_VMX (1 << CPU_VMX_BIT) 71#define CPU_VMX (1 << CPU_VMX_BIT)
70#define CPU_CALL (1 << CPU_CALL_BIT) 72#define CPU_CALL (1 << CPU_CALL_BIT)
71#define CPU_BASE (1 << CPU_BASE_BIT) 73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
72#define CPU_SMM (1 << CPU_SMM_BIT) 76#define CPU_SMM (1 << CPU_SMM_BIT)
73#define CPU_SVM (1 << CPU_SVM_BIT) 77#define CPU_SVM (1 << CPU_SVM_BIT)
74#define CPU_OSVM (1 << CPU_OSVM_BIT) 78#define CPU_OSVM (1 << CPU_OSVM_BIT)
@@ -123,10 +127,15 @@ enum cpu_processor_bit {
123 CPU_INTEL_ATOM_BIT, 127 CPU_INTEL_ATOM_BIT,
124 CPU_INTEL_XEON_P4_BIT, 128 CPU_INTEL_XEON_P4_BIT,
125 CPU_INTEL_XEON_MP_BIT, 129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
126}; 137};
127 138
128#define CPU_ALL (~0) /* Select all CPUs */
129
130#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT) 139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
131#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT) 140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
132#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT) 141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
@@ -156,9 +165,27 @@ enum cpu_processor_bit {
156#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT) 165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
157#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE) 166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
158 167
159/* Select all Intel CPUs*/ 168/* Select all supported Intel CPUs */
160#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE) 169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
161 170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188
162#define MAX_CPU_FILES 512 189#define MAX_CPU_FILES 512
163 190
164struct cpu_private { 191struct cpu_private {
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc0..aa32f7e6c197 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -2,21 +2,11 @@
2#define _ASM_X86_DMI_H 2#define _ASM_X86_DMI_H
3 3
4#include <asm/io.h> 4#include <asm/io.h>
5#include <asm/setup.h>
5 6
6#define DMI_MAX_DATA 2048
7
8extern int dmi_alloc_index;
9extern char dmi_alloc_data[DMI_MAX_DATA];
10
11/* This is so early that there is no good way to allocate dynamic memory.
12 Allocate data in an BSS array. */
13static inline void *dmi_alloc(unsigned len) 7static inline void *dmi_alloc(unsigned len)
14{ 8{
15 int idx = dmi_alloc_index; 9 return extend_brk(len, sizeof(int));
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20} 10}
21 11
22/* Use early IO mappings for DMI because it's initialized early */ 12/* Use early IO mappings for DMI because it's initialized early */
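
Note: dmi_alloc() now draws from the early brk allocator instead of a fixed 2 KB BSS array, so early DMI parsing is no longer capped at DMI_MAX_DATA. A hedged sketch of how it pairs with the RESERVE_BRK() machinery added in asm/setup.h below (the 64 KB reservation size here is illustrative, not taken from this diff):

	/* file scope: set aside brk space for early DMI allocations */
	RESERVE_BRK(dmi_alloc, 65536);

	/* during early boot, each call carves an int-aligned chunk: */
	void *buf = dmi_alloc(len);	/* == extend_brk(len, sizeof(int)) */
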
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b7..373cc2bbcad2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
163 163
164#ifdef CONFIG_X86_64 164#ifdef CONFIG_X86_64
165extern int save_mask_IO_APIC_setup(void); 165extern int save_IO_APIC_setup(void);
166extern void mask_IO_APIC_setup(void);
166extern void restore_IO_APIC_setup(void); 167extern void restore_IO_APIC_setup(void);
167extern void reinit_intr_remapped_IO_APIC(int); 168extern void reinit_intr_remapped_IO_APIC(int);
168#endif 169#endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
172extern int setup_ioapic_entry(int apic, int irq, 173extern int setup_ioapic_entry(int apic, int irq,
173 struct IO_APIC_route_entry *entry, 174 struct IO_APIC_route_entry *entry,
174 unsigned int destination, int trigger, 175 unsigned int destination, int trigger,
175 int polarity, int vector); 176 int polarity, int vector, int pin);
176extern void ioapic_write_entry(int apic, int pin, 177extern void ioapic_write_entry(int apic, int pin,
177 struct IO_APIC_route_entry e); 178 struct IO_APIC_route_entry e);
178#else /* !CONFIG_X86_IO_APIC */ 179#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588dbf..0396760fccb8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7 5
8#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f13..4cc48af23fef 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ 48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK) 49 MSI_ADDR_DEST_ID_MASK)
50#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
50 51
51#define MSI_ADDR_IR_EXT_INT (1 << 4) 52#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3) 53#define MSI_ADDR_IR_SHV (1 << 3)
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e41..0f915ae649a7 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
39#define __VIRTUAL_MASK_SHIFT 32 39#define __VIRTUAL_MASK_SHIFT 32
40#endif /* CONFIG_X86_PAE */ 40#endif /* CONFIG_X86_PAE */
41 41
42/*
 43 * Kernel image size is limited to 512 MB (see arch/x86/kernel/head_32.S)
44 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
46
42#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
43 48
44/* 49/*
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc9712..7727aa8b7dda 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
317#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
318#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
320 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
321 pte_t *ptep, pte_t pte);
322 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 320 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
323 pte_t *ptep); 321 pte_t *ptep);
324 void (*pmd_clear)(pmd_t *pmdp); 322 void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
389 387
390#define paravirt_type(op) \ 388#define paravirt_type(op) \
391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ 389 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
392 [paravirt_opptr] "m" (op) 390 [paravirt_opptr] "i" (&(op))
393#define paravirt_clobber(clobber) \ 391#define paravirt_clobber(clobber) \
394 [paravirt_clobber] "i" (clobber) 392 [paravirt_clobber] "i" (clobber)
395 393
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
443 * offset into the paravirt_patch_template structure, and can therefore be 441 * offset into the paravirt_patch_template structure, and can therefore be
444 * freely converted back into a structure offset. 442 * freely converted back into a structure offset.
445 */ 443 */
446#define PARAVIRT_CALL "call *%[paravirt_opptr];" 444#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
447 445
448/* 446/*
449 * These macros are intended to wrap calls through one of the paravirt 447 * These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1365 pte.pte, pte.pte >> 32); 1363 pte.pte, pte.pte >> 32);
1366} 1364}
1367 1365
1368static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1369 pte_t *ptep, pte_t pte)
1370{
1371 /* 5 arg words */
1372 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
1373}
1374
1375static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1366static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1376 pte_t *ptep) 1367 pte_t *ptep)
1377{ 1368{
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1388 set_pte(ptep, pte); 1379 set_pte(ptep, pte);
1389} 1380}
1390 1381
1391static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1392 pte_t *ptep, pte_t pte)
1393{
1394 set_pte(ptep, pte);
1395}
1396
1397static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1382static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1398 pte_t *ptep) 1383 pte_t *ptep)
1399{ 1384{
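
Note: the "m" (op) -> "i" (&(op)) operand change works together with the new %c modifier in PARAVIRT_CALL. A hedged reading, not spelled out in the hunk itself: %c prints the constant pointer without immediate-operand syntax, so the indirect call always goes through an absolute slot address, whereas a plain "m" operand left the compiler free to pick a stack-pointer-relative address, which breaks once the surrounding asm pushes registers before the call. Illustrative shapes of the generated code (the offset name is made up):

	/* old, "m"(op): compiler may choose an %esp-relative operand */
	call *36(%esp)		/* wrong after the asm has pushed to the stack */

	/* new, "i"(&(op)) + %c: always an absolute slot address */
	call *(pv_cpu_ops+PV_CPU_OPS_IRET)
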
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index c1774ac9da7a..2334982b339e 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
26 native_set_pte(ptep, pte); 26 native_set_pte(ptep, pte);
27} 27}
28 28
29static inline void native_set_pte_present(struct mm_struct *mm,
30 unsigned long addr,
31 pte_t *ptep, pte_t pte)
32{
33 native_set_pte(ptep, pte);
34}
35
36static inline void native_pmd_clear(pmd_t *pmdp) 29static inline void native_pmd_clear(pmd_t *pmdp)
37{ 30{
38 native_set_pmd(pmdp, __pmd(0)); 31 native_set_pmd(pmdp, __pmd(0));
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 3f13cdf61156..177b0165ea01 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
31 ptep->pte_low = pte.pte_low; 31 ptep->pte_low = pte.pte_low;
32} 32}
33 33
34/*
35 * Since this is only called on user PTEs, and the page fault handler
36 * must handle the already racy situation of simultaneous page faults,
37 * we are justified in merely clearing the PTE present bit, followed
38 * by a set. The ordering here is important.
39 */
40static inline void native_set_pte_present(struct mm_struct *mm,
41 unsigned long addr,
42 pte_t *ptep, pte_t pte)
43{
44 ptep->pte_low = 0;
45 smp_wmb();
46 ptep->pte_high = pte.pte_high;
47 smp_wmb();
48 ptep->pte_low = pte.pte_low;
49}
50
51static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 34static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
52{ 35{
53 set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); 36 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d0812e155f1d..29d96d168bc0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
31#define set_pte(ptep, pte) native_set_pte(ptep, pte) 31#define set_pte(ptep, pte) native_set_pte(ptep, pte)
32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
33 33
34#define set_pte_present(mm, addr, ptep, pte) \
35 native_set_pte_present(mm, addr, ptep, pte)
36#define set_pte_atomic(ptep, pte) \ 34#define set_pte_atomic(ptep, pte) \
37 native_set_pte_atomic(ptep, pte) 35 native_set_pte_atomic(ptep, pte)
38 36
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632f..31bd120cf2a2 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
42 */ 42 */
43#undef TEST_ACCESS_OK 43#undef TEST_ACCESS_OK
44 44
45/* The boot page tables (all created as a single array) */
46extern unsigned long pg0[];
47
48#ifdef CONFIG_X86_PAE 45#ifdef CONFIG_X86_PAE
49# include <asm/pgtable-3level.h> 46# include <asm/pgtable-3level.h>
50#else 47#else
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388f..1b7ee5d673c2 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
1#ifndef _ASM_X86_SECTIONS_H
2#define _ASM_X86_SECTIONS_H
3
1#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5
6extern char __brk_base[], __brk_limit[];
7
8#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 05c6f6b11fd5..fbf0521eeed8 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,20 +100,51 @@ extern struct boot_params boot_params;
100 */ 100 */
101#define LOWMEMSIZE() (0x9f000) 101#define LOWMEMSIZE() (0x9f000)
102 102
103/* exceedingly early brk-like allocator */
104extern unsigned long _brk_end;
105void *extend_brk(size_t size, size_t align);
106
107/*
108 * Reserve space in the brk section. The name must be unique within
109 * the file, and somewhat descriptive. The size is in bytes. Must be
110 * used at file scope.
111 *
112 * (This uses a temp function to wrap the asm so we can pass it the
113 * size parameter; otherwise we wouldn't be able to. We can't use a
114 * "section" attribute on a normal variable because it always ends up
 115 * being @progbits, which allocates space in the vmlinux
116 * executable.)
117 */
118#define RESERVE_BRK(name,sz) \
119 static void __section(.discard) __used \
120 __brk_reservation_fn_##name##__(void) { \
121 asm volatile ( \
122 ".pushsection .brk_reservation,\"aw\",@nobits;" \
123 ".brk." #name ":" \
124 " 1:.skip %c0;" \
125 " .size .brk." #name ", . - 1b;" \
126 " .popsection" \
127 : : "i" (sz)); \
128 }
129
103#ifdef __i386__ 130#ifdef __i386__
104 131
105void __init i386_start_kernel(void); 132void __init i386_start_kernel(void);
106extern void probe_roms(void); 133extern void probe_roms(void);
107 134
108extern unsigned long init_pg_tables_start;
109extern unsigned long init_pg_tables_end;
110
111#else 135#else
112void __init x86_64_start_kernel(char *real_mode); 136void __init x86_64_start_kernel(char *real_mode);
113void __init x86_64_start_reservations(char *real_mode_data); 137void __init x86_64_start_reservations(char *real_mode_data);
114 138
115#endif /* __i386__ */ 139#endif /* __i386__ */
116#endif /* _SETUP */ 140#endif /* _SETUP */
141#else
142#define RESERVE_BRK(name,sz) \
143 .pushsection .brk_reservation,"aw",@nobits; \
144.brk.name: \
1451: .skip sz; \
146 .size .brk.name,.-1b; \
147 .popsection
117#endif /* __ASSEMBLY__ */ 148#endif /* __ASSEMBLY__ */
118#endif /* __KERNEL__ */ 149#endif /* __KERNEL__ */
119 150
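
Note: a hedged usage sketch of the brk API above. A file reserves space at link time with RESERVE_BRK(), which lands in the @nobits .brk_reservation section and so occupies no space in the vmlinux image, and later claims memory from the region with extend_brk(). All names and the size below are illustrative:

	#include <asm/setup.h>

	#define EARLY_TABLE_SIZE 4096

	RESERVE_BRK(early_table, EARLY_TABLE_SIZE);	/* link-time reservation */

	static void * __init alloc_early_table(void)
	{
		/* int-aligned carve-out; must run before the brk area is
		 * handed over to the general-purpose allocators */
		return extend_brk(EARLY_TABLE_SIZE, sizeof(int));
	}
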
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 30909a258d0f..85eb8e100818 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -809,7 +809,7 @@ void clear_local_APIC(void)
809 u32 v; 809 u32 v;
810 810
811 /* APIC hasn't been mapped yet */ 811 /* APIC hasn't been mapped yet */
812 if (!apic_phys) 812 if (!x2apic && !apic_phys)
813 return; 813 return;
814 814
815 maxlvt = lapic_get_maxlvt(); 815 maxlvt = lapic_get_maxlvt();
@@ -1334,15 +1334,16 @@ void __init enable_IR_x2apic(void)
1334 return; 1334 return;
1335 } 1335 }
1336 1336
1337 local_irq_save(flags); 1337 ret = save_IO_APIC_setup();
1338 mask_8259A();
1339
1340 ret = save_mask_IO_APIC_setup();
1341 if (ret) { 1338 if (ret) {
1342 pr_info("Saving IO-APIC state failed: %d\n", ret); 1339 pr_info("Saving IO-APIC state failed: %d\n", ret);
1343 goto end; 1340 goto end;
1344 } 1341 }
1345 1342
1343 local_irq_save(flags);
1344 mask_IO_APIC_setup();
1345 mask_8259A();
1346
1346 ret = enable_intr_remapping(1); 1347 ret = enable_intr_remapping(1);
1347 1348
1348 if (ret && x2apic_preenabled) { 1349 if (ret && x2apic_preenabled) {
@@ -1367,10 +1368,10 @@ end_restore:
1367 else 1368 else
1368 reinit_intr_remapped_IO_APIC(x2apic_preenabled); 1369 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1369 1370
1370end:
1371 unmask_8259A(); 1371 unmask_8259A();
1372 local_irq_restore(flags); 1372 local_irq_restore(flags);
1373 1373
1374end:
1374 if (!ret) { 1375 if (!ret) {
1375 if (!x2apic_preenabled) 1376 if (!x2apic_preenabled)
1376 pr_info("Enabled x2apic and interrupt-remapping\n"); 1377 pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1523,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
1523 */ 1524 */
1524void __init init_apic_mappings(void) 1525void __init init_apic_mappings(void)
1525{ 1526{
1526#ifdef CONFIG_X86_X2APIC
1527 if (x2apic) { 1527 if (x2apic) {
1528 boot_cpu_physical_apicid = read_apic_id(); 1528 boot_cpu_physical_apicid = read_apic_id();
1529 return; 1529 return;
1530 } 1530 }
1531#endif
1532 1531
1533 /* 1532 /*
1534 * If no local APIC can be found then set up a fake all 1533 * If no local APIC can be found then set up a fake all
@@ -1972,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
1972 1971
1973 local_irq_save(flags); 1972 local_irq_save(flags);
1974 1973
1975#ifdef CONFIG_X86_X2APIC
1976 if (x2apic) 1974 if (x2apic)
1977 enable_x2apic(); 1975 enable_x2apic();
1978 else 1976 else {
1979#endif
1980 {
1981 /* 1977 /*
1982 * Make sure the APICBASE points to the right address 1978 * Make sure the APICBASE points to the right address
1983 * 1979 *
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc4..42cdc78427a2 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
389 unsigned int index; 389 unsigned int index;
390 unsigned int unused[3]; 390 unsigned int unused[3];
391 unsigned int data; 391 unsigned int data;
392 unsigned int unused2[11];
393 unsigned int eoi;
392}; 394};
393 395
394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 396static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 399 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
398} 400}
399 401
402static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
403{
404 struct io_apic __iomem *io_apic = io_apic_base(apic);
405 writel(vector, &io_apic->eoi);
406}
407
400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 408static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
401{ 409{
402 struct io_apic __iomem *io_apic = io_apic_base(apic); 410 struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
546 554
547 apic = entry->apic; 555 apic = entry->apic;
548 pin = entry->pin; 556 pin = entry->pin;
549#ifdef CONFIG_INTR_REMAP
550 /* 557 /*
551 * With interrupt-remapping, destination information comes 558 * With interrupt-remapping, destination information comes
552 * from interrupt-remapping table entry. 559 * from interrupt-remapping table entry.
553 */ 560 */
554 if (!irq_remapped(irq)) 561 if (!irq_remapped(irq))
555 io_apic_write(apic, 0x11 + pin*2, dest); 562 io_apic_write(apic, 0x11 + pin*2, dest);
556#else
557 io_apic_write(apic, 0x11 + pin*2, dest);
558#endif
559 reg = io_apic_read(apic, 0x10 + pin*2); 563 reg = io_apic_read(apic, 0x10 + pin*2);
560 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
561 reg |= vector; 565 reg |= vector;
@@ -849,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup);
849static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; 853static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
850 854
851/* 855/*
852 * Saves and masks all the unmasked IO-APIC RTE's 856 * Saves all the IO-APIC RTE's
853 */ 857 */
854int save_mask_IO_APIC_setup(void) 858int save_IO_APIC_setup(void)
855{ 859{
856 union IO_APIC_reg_01 reg_01; 860 union IO_APIC_reg_01 reg_01;
857 unsigned long flags; 861 unsigned long flags;
@@ -876,16 +880,9 @@ int save_mask_IO_APIC_setup(void)
876 } 880 }
877 881
878 for (apic = 0; apic < nr_ioapics; apic++) 882 for (apic = 0; apic < nr_ioapics; apic++)
879 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 883 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
880 struct IO_APIC_route_entry entry; 884 early_ioapic_entries[apic][pin] =
881
882 entry = early_ioapic_entries[apic][pin] =
883 ioapic_read_entry(apic, pin); 885 ioapic_read_entry(apic, pin);
884 if (!entry.mask) {
885 entry.mask = 1;
886 ioapic_write_entry(apic, pin, entry);
887 }
888 }
889 886
890 return 0; 887 return 0;
891 888
@@ -898,6 +895,25 @@ nomem:
898 return -ENOMEM; 895 return -ENOMEM;
899} 896}
900 897
898void mask_IO_APIC_setup(void)
899{
900 int apic, pin;
901
902 for (apic = 0; apic < nr_ioapics; apic++) {
903 if (!early_ioapic_entries[apic])
904 break;
905 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
906 struct IO_APIC_route_entry entry;
907
908 entry = early_ioapic_entries[apic][pin];
909 if (!entry.mask) {
910 entry.mask = 1;
911 ioapic_write_entry(apic, pin, entry);
912 }
913 }
914 }
915}
916
901void restore_IO_APIC_setup(void) 917void restore_IO_APIC_setup(void)
902{ 918{
903 int apic, pin; 919 int apic, pin;
@@ -1411,9 +1427,8 @@ void __setup_vector_irq(int cpu)
1411} 1427}
1412 1428
1413static struct irq_chip ioapic_chip; 1429static struct irq_chip ioapic_chip;
1414#ifdef CONFIG_INTR_REMAP
1415static struct irq_chip ir_ioapic_chip; 1430static struct irq_chip ir_ioapic_chip;
1416#endif 1431static struct irq_chip msi_ir_chip;
1417 1432
1418#define IOAPIC_AUTO -1 1433#define IOAPIC_AUTO -1
1419#define IOAPIC_EDGE 0 1434#define IOAPIC_EDGE 0
@@ -1452,7 +1467,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1452 else 1467 else
1453 desc->status &= ~IRQ_LEVEL; 1468 desc->status &= ~IRQ_LEVEL;
1454 1469
1455#ifdef CONFIG_INTR_REMAP
1456 if (irq_remapped(irq)) { 1470 if (irq_remapped(irq)) {
1457 desc->status |= IRQ_MOVE_PCNTXT; 1471 desc->status |= IRQ_MOVE_PCNTXT;
1458 if (trigger) 1472 if (trigger)
@@ -1464,7 +1478,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1464 handle_edge_irq, "edge"); 1478 handle_edge_irq, "edge");
1465 return; 1479 return;
1466 } 1480 }
1467#endif 1481
1468 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1482 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1469 trigger == IOAPIC_LEVEL) 1483 trigger == IOAPIC_LEVEL)
1470 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1484 set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1492,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1478int setup_ioapic_entry(int apic_id, int irq, 1492int setup_ioapic_entry(int apic_id, int irq,
1479 struct IO_APIC_route_entry *entry, 1493 struct IO_APIC_route_entry *entry,
1480 unsigned int destination, int trigger, 1494 unsigned int destination, int trigger,
1481 int polarity, int vector) 1495 int polarity, int vector, int pin)
1482{ 1496{
1483 /* 1497 /*
1484 * add it to the IO-APIC irq-routing table: 1498 * add it to the IO-APIC irq-routing table:
1485 */ 1499 */
1486 memset(entry,0,sizeof(*entry)); 1500 memset(entry,0,sizeof(*entry));
1487 1501
1488#ifdef CONFIG_INTR_REMAP
1489 if (intr_remapping_enabled) { 1502 if (intr_remapping_enabled) {
1490 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); 1503 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1491 struct irte irte; 1504 struct irte irte;
@@ -1504,7 +1517,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1504 1517
1505 irte.present = 1; 1518 irte.present = 1;
1506 irte.dst_mode = apic->irq_dest_mode; 1519 irte.dst_mode = apic->irq_dest_mode;
1507 irte.trigger_mode = trigger; 1520 /*
1521 * Trigger mode in the IRTE will always be edge, and the
 1522 * actual level or edge trigger will be set up in the IO-APIC
 1523 * RTE. This will help simplify level triggered irq migration.
 1524 * For more details, see the comments above explaining IO-APIC
1525 * irq migration in the presence of interrupt-remapping.
1526 */
1527 irte.trigger_mode = 0;
1508 irte.dlvry_mode = apic->irq_delivery_mode; 1528 irte.dlvry_mode = apic->irq_delivery_mode;
1509 irte.vector = vector; 1529 irte.vector = vector;
1510 irte.dest_id = IRTE_DEST(destination); 1530 irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1535,21 @@ int setup_ioapic_entry(int apic_id, int irq,
1515 ir_entry->zero = 0; 1535 ir_entry->zero = 0;
1516 ir_entry->format = 1; 1536 ir_entry->format = 1;
1517 ir_entry->index = (index & 0x7fff); 1537 ir_entry->index = (index & 0x7fff);
1518 } else 1538 /*
1519#endif 1539 * IO-APIC RTE will be configured with virtual vector.
1520 { 1540 * irq handler will do the explicit EOI to the io-apic.
1541 */
1542 ir_entry->vector = pin;
1543 } else {
1521 entry->delivery_mode = apic->irq_delivery_mode; 1544 entry->delivery_mode = apic->irq_delivery_mode;
1522 entry->dest_mode = apic->irq_dest_mode; 1545 entry->dest_mode = apic->irq_dest_mode;
1523 entry->dest = destination; 1546 entry->dest = destination;
1547 entry->vector = vector;
1524 } 1548 }
1525 1549
1526 entry->mask = 0; /* enable IRQ */ 1550 entry->mask = 0; /* enable IRQ */
1527 entry->trigger = trigger; 1551 entry->trigger = trigger;
1528 entry->polarity = polarity; 1552 entry->polarity = polarity;
1529 entry->vector = vector;
1530 1553
1531 /* Mask level triggered irqs. 1554 /* Mask level triggered irqs.
1532 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1555 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1584,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1561 1584
1562 1585
1563 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1586 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1564 dest, trigger, polarity, cfg->vector)) { 1587 dest, trigger, polarity, cfg->vector, pin)) {
1565 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1588 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1566 mp_ioapics[apic_id].apicid, pin); 1589 mp_ioapics[apic_id].apicid, pin);
1567 __clear_irq_vector(irq, cfg); 1590 __clear_irq_vector(irq, cfg);
@@ -1642,10 +1665,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1642{ 1665{
1643 struct IO_APIC_route_entry entry; 1666 struct IO_APIC_route_entry entry;
1644 1667
1645#ifdef CONFIG_INTR_REMAP
1646 if (intr_remapping_enabled) 1668 if (intr_remapping_enabled)
1647 return; 1669 return;
1648#endif
1649 1670
1650 memset(&entry, 0, sizeof(entry)); 1671 memset(&entry, 0, sizeof(entry));
1651 1672
@@ -2040,8 +2061,13 @@ void disable_IO_APIC(void)
2040 * If the i8259 is routed through an IOAPIC 2061 * If the i8259 is routed through an IOAPIC
2041 * Put that IOAPIC in virtual wire mode 2062 * Put that IOAPIC in virtual wire mode
2042 * so legacy interrupts can be delivered. 2063 * so legacy interrupts can be delivered.
2064 *
2065 * With interrupt-remapping, for now we will use virtual wire A mode,
 2066 * as virtual wire B is a little more complex (we would need to configure
 2067 * both the IOAPIC RTE as well as the interrupt-remapping table entry).
 2068 * As this gets called during a crash dump, keep it simple for now.
2043 */ 2069 */
2044 if (ioapic_i8259.pin != -1) { 2070 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
2045 struct IO_APIC_route_entry entry; 2071 struct IO_APIC_route_entry entry;
2046 2072
2047 memset(&entry, 0, sizeof(entry)); 2073 memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2087,10 @@ void disable_IO_APIC(void)
2061 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 2087 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2062 } 2088 }
2063 2089
2064 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 2090 /*
2091 * Use virtual wire A mode when interrupt remapping is enabled.
2092 */
2093 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
2065} 2094}
2066 2095
2067#ifdef CONFIG_X86_32 2096#ifdef CONFIG_X86_32
@@ -2303,37 +2332,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
2303#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2304 2333
2305#ifdef CONFIG_INTR_REMAP 2334#ifdef CONFIG_INTR_REMAP
2306static void ir_irq_migration(struct work_struct *work);
2307
2308static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2309 2335
2310/* 2336/*
2311 * Migrate the IO-APIC irq in the presence of intr-remapping. 2337 * Migrate the IO-APIC irq in the presence of intr-remapping.
2312 * 2338 *
2313 * For edge triggered, irq migration is a simple atomic update(of vector 2339 * For both level and edge triggered, irq migration is a simple atomic
 2314 * and cpu destination) of IRTE and flush the hardware cache. 2340 * update (of vector and cpu destination) of the IRTE and a flush of the hardware cache.
2315 *
2316 * For level triggered, we need to modify the io-apic RTE aswell with the update
2317 * vector information, along with modifying IRTE with vector and destination.
2318 * So irq migration for level triggered is little bit more complex compared to
2319 * edge triggered migration. But the good news is, we use the same algorithm
2320 * for level triggered migration as we have today, only difference being,
2321 * we now initiate the irq migration from process context instead of the
2322 * interrupt context.
2323 * 2341 *
2324 * In future, when we do a directed EOI (combined with cpu EOI broadcast 2342 * For level triggered, we eliminate the io-apic RTE modification (with the
 2325 * suppression) to the IO-APIC, level triggered irq migration will also be 2343 * updated vector information) by using a virtual vector (the io-apic pin number).
 2326 * as simple as edge triggered migration and we can do the irq migration 2344 * The real vector that is used to interrupt the cpu comes from
 2327 * with a simple atomic update to IO-APIC RTE. 2345 * the interrupt-remapping table entry.
2328 */ 2346 */
2329static void 2347static void
2330migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2348migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2331{ 2349{
2332 struct irq_cfg *cfg; 2350 struct irq_cfg *cfg;
2333 struct irte irte; 2351 struct irte irte;
2334 int modify_ioapic_rte;
2335 unsigned int dest; 2352 unsigned int dest;
2336 unsigned long flags;
2337 unsigned int irq; 2353 unsigned int irq;
2338 2354
2339 if (!cpumask_intersects(mask, cpu_online_mask)) 2355 if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2351 2367
2352 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2368 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2353 2369
2354 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2355 if (modify_ioapic_rte) {
2356 spin_lock_irqsave(&ioapic_lock, flags);
2357 __target_IO_APIC_irq(irq, dest, cfg);
2358 spin_unlock_irqrestore(&ioapic_lock, flags);
2359 }
2360
2361 irte.vector = cfg->vector; 2370 irte.vector = cfg->vector;
2362 irte.dest_id = IRTE_DEST(dest); 2371 irte.dest_id = IRTE_DEST(dest);
2363 2372
@@ -2372,73 +2381,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2372 cpumask_copy(desc->affinity, mask); 2381 cpumask_copy(desc->affinity, mask);
2373} 2382}
2374 2383
2375static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2376{
2377 int ret = -1;
2378 struct irq_cfg *cfg = desc->chip_data;
2379
2380 mask_IO_APIC_irq_desc(desc);
2381
2382 if (io_apic_level_ack_pending(cfg)) {
2383 /*
2384 * Interrupt in progress. Migrating irq now will change the
2385 * vector information in the IO-APIC RTE and that will confuse
2386 * the EOI broadcast performed by cpu.
2387 * So, delay the irq migration to the next instance.
2388 */
2389 schedule_delayed_work(&ir_migration_work, 1);
2390 goto unmask;
2391 }
2392
2393 /* everthing is clear. we have right of way */
2394 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2395
2396 ret = 0;
2397 desc->status &= ~IRQ_MOVE_PENDING;
2398 cpumask_clear(desc->pending_mask);
2399
2400unmask:
2401 unmask_IO_APIC_irq_desc(desc);
2402
2403 return ret;
2404}
2405
2406static void ir_irq_migration(struct work_struct *work)
2407{
2408 unsigned int irq;
2409 struct irq_desc *desc;
2410
2411 for_each_irq_desc(irq, desc) {
2412 if (desc->status & IRQ_MOVE_PENDING) {
2413 unsigned long flags;
2414
2415 spin_lock_irqsave(&desc->lock, flags);
2416 if (!desc->chip->set_affinity ||
2417 !(desc->status & IRQ_MOVE_PENDING)) {
2418 desc->status &= ~IRQ_MOVE_PENDING;
2419 spin_unlock_irqrestore(&desc->lock, flags);
2420 continue;
2421 }
2422
2423 desc->chip->set_affinity(irq, desc->pending_mask);
2424 spin_unlock_irqrestore(&desc->lock, flags);
2425 }
2426 }
2427}
2428
2429/* 2384/*
2430 * Migrates the IRQ destination in the process context. 2385 * Migrates the IRQ destination in the process context.
2431 */ 2386 */
2432static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2387static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2433 const struct cpumask *mask) 2388 const struct cpumask *mask)
2434{ 2389{
2435 if (desc->status & IRQ_LEVEL) {
2436 desc->status |= IRQ_MOVE_PENDING;
2437 cpumask_copy(desc->pending_mask, mask);
2438 migrate_irq_remapped_level_desc(desc);
2439 return;
2440 }
2441
2442 migrate_ioapic_irq_desc(desc, mask); 2390 migrate_ioapic_irq_desc(desc, mask);
2443} 2391}
2444static void set_ir_ioapic_affinity_irq(unsigned int irq, 2392static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2396,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2448 2396
2449 set_ir_ioapic_affinity_irq_desc(desc, mask); 2397 set_ir_ioapic_affinity_irq_desc(desc, mask);
2450} 2398}
2399#else
2400static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2401 const struct cpumask *mask)
2402{
2403}
2451#endif 2404#endif
2452 2405
2453asmlinkage void smp_irq_move_cleanup_interrupt(void) 2406asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2461 me = smp_processor_id(); 2414 me = smp_processor_id();
2462 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2415 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2463 unsigned int irq; 2416 unsigned int irq;
2417 unsigned int irr;
2464 struct irq_desc *desc; 2418 struct irq_desc *desc;
2465 struct irq_cfg *cfg; 2419 struct irq_cfg *cfg;
2466 irq = __get_cpu_var(vector_irq)[vector]; 2420 irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2480 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2434 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2481 goto unlock; 2435 goto unlock;
2482 2436
2437 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
2438 /*
2439 * Check if the vector that needs to be cleanedup is
2440 * registered at the cpu's IRR. If so, then this is not
2441 * the best time to clean it up. Lets clean it up in the
2442 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
2443 * to myself.
2444 */
2445 if (irr & (1 << (vector % 32))) {
2446 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2447 goto unlock;
2448 }
2483 __get_cpu_var(vector_irq)[vector] = -1; 2449 __get_cpu_var(vector_irq)[vector] = -1;
2484 cfg->move_cleanup_count--; 2450 cfg->move_cleanup_count--;
2485unlock: 2451unlock:
@@ -2529,9 +2495,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
2529#endif 2495#endif
2530 2496
2531#ifdef CONFIG_INTR_REMAP 2497#ifdef CONFIG_INTR_REMAP
2498static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2499{
2500 int apic, pin;
2501 struct irq_pin_list *entry;
2502
2503 entry = cfg->irq_2_pin;
2504 for (;;) {
2505
2506 if (!entry)
2507 break;
2508
2509 apic = entry->apic;
2510 pin = entry->pin;
2511 io_apic_eoi(apic, pin);
2512 entry = entry->next;
2513 }
2514}
2515
2516static void
2517eoi_ioapic_irq(struct irq_desc *desc)
2518{
2519 struct irq_cfg *cfg;
2520 unsigned long flags;
2521 unsigned int irq;
2522
2523 irq = desc->irq;
2524 cfg = desc->chip_data;
2525
2526 spin_lock_irqsave(&ioapic_lock, flags);
2527 __eoi_ioapic_irq(irq, cfg);
2528 spin_unlock_irqrestore(&ioapic_lock, flags);
2529}
2530
2532static void ack_x2apic_level(unsigned int irq) 2531static void ack_x2apic_level(unsigned int irq)
2533{ 2532{
2533 struct irq_desc *desc = irq_to_desc(irq);
2534 ack_x2APIC_irq(); 2534 ack_x2APIC_irq();
2535 eoi_ioapic_irq(desc);
2535} 2536}
2536 2537
2537static void ack_x2apic_edge(unsigned int irq) 2538static void ack_x2apic_edge(unsigned int irq)
@@ -2901,10 +2902,8 @@ static inline void __init check_timer(void)
2901 * 8259A. 2902 * 8259A.
2902 */ 2903 */
2903 if (pin1 == -1) { 2904 if (pin1 == -1) {
2904#ifdef CONFIG_INTR_REMAP
2905 if (intr_remapping_enabled) 2905 if (intr_remapping_enabled)
2906 panic("BIOS bug: timer not connected to IO-APIC"); 2906 panic("BIOS bug: timer not connected to IO-APIC");
2907#endif
2908 pin1 = pin2; 2907 pin1 = pin2;
2909 apic1 = apic2; 2908 apic1 = apic2;
2910 no_pin1 = 1; 2909 no_pin1 = 1;
@@ -2940,10 +2939,8 @@ static inline void __init check_timer(void)
2940 clear_IO_APIC_pin(0, pin1); 2939 clear_IO_APIC_pin(0, pin1);
2941 goto out; 2940 goto out;
2942 } 2941 }
2943#ifdef CONFIG_INTR_REMAP
2944 if (intr_remapping_enabled) 2942 if (intr_remapping_enabled)
2945 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2943 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2946#endif
2947 local_irq_disable(); 2944 local_irq_disable();
2948 clear_IO_APIC_pin(apic1, pin1); 2945 clear_IO_APIC_pin(apic1, pin1);
2949 if (!no_pin1) 2946 if (!no_pin1)
@@ -3237,9 +3234,7 @@ void destroy_irq(unsigned int irq)
3237 if (desc) 3234 if (desc)
3238 desc->chip_data = cfg; 3235 desc->chip_data = cfg;
3239 3236
3240#ifdef CONFIG_INTR_REMAP
3241 free_irte(irq); 3237 free_irte(irq);
3242#endif
3243 spin_lock_irqsave(&vector_lock, flags); 3238 spin_lock_irqsave(&vector_lock, flags);
3244 __clear_irq_vector(irq, cfg); 3239 __clear_irq_vector(irq, cfg);
3245 spin_unlock_irqrestore(&vector_lock, flags); 3240 spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3260,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3265 3260
3266 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3261 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3267 3262
3268#ifdef CONFIG_INTR_REMAP
3269 if (irq_remapped(irq)) { 3263 if (irq_remapped(irq)) {
3270 struct irte irte; 3264 struct irte irte;
3271 int ir_index; 3265 int ir_index;
@@ -3291,10 +3285,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3291 MSI_ADDR_IR_SHV | 3285 MSI_ADDR_IR_SHV |
3292 MSI_ADDR_IR_INDEX1(ir_index) | 3286 MSI_ADDR_IR_INDEX1(ir_index) |
3293 MSI_ADDR_IR_INDEX2(ir_index); 3287 MSI_ADDR_IR_INDEX2(ir_index);
3294 } else 3288 } else {
3295#endif 3289 if (x2apic_enabled())
3296 { 3290 msg->address_hi = MSI_ADDR_BASE_HI |
3297 msg->address_hi = MSI_ADDR_BASE_HI; 3291 MSI_ADDR_EXT_DEST_ID(dest);
3292 else
3293 msg->address_hi = MSI_ADDR_BASE_HI;
3294
3298 msg->address_lo = 3295 msg->address_lo =
3299 MSI_ADDR_BASE_LO | 3296 MSI_ADDR_BASE_LO |
3300 ((apic->irq_dest_mode == 0) ? 3297 ((apic->irq_dest_mode == 0) ?
@@ -3405,6 +3402,7 @@ static struct irq_chip msi_ir_chip = {
3405#endif 3402#endif
3406 .retrigger = ioapic_retrigger_irq, 3403 .retrigger = ioapic_retrigger_irq,
3407}; 3404};
3405#endif
3408 3406
3409/* 3407/*
3410 * Map the PCI dev to the corresponding remapping hardware unit 3408 * Map the PCI dev to the corresponding remapping hardware unit
@@ -3432,7 +3430,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3432 } 3430 }
3433 return index; 3431 return index;
3434} 3432}
3435#endif
3436 3433
3437static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3434static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3438{ 3435{
@@ -3446,7 +3443,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3446 set_irq_msi(irq, msidesc); 3443 set_irq_msi(irq, msidesc);
3447 write_msi_msg(irq, &msg); 3444 write_msi_msg(irq, &msg);
3448 3445
3449#ifdef CONFIG_INTR_REMAP
3450 if (irq_remapped(irq)) { 3446 if (irq_remapped(irq)) {
3451 struct irq_desc *desc = irq_to_desc(irq); 3447 struct irq_desc *desc = irq_to_desc(irq);
3452 /* 3448 /*
@@ -3455,7 +3451,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3455 desc->status |= IRQ_MOVE_PCNTXT; 3451 desc->status |= IRQ_MOVE_PCNTXT;
3456 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3452 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3457 } else 3453 } else
3458#endif
3459 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3454 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3460 3455
3461 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3456 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3464,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3469 int ret, sub_handle; 3464 int ret, sub_handle;
3470 struct msi_desc *msidesc; 3465 struct msi_desc *msidesc;
3471 unsigned int irq_want; 3466 unsigned int irq_want;
3472
3473#ifdef CONFIG_INTR_REMAP
3474 struct intel_iommu *iommu = 0; 3467 struct intel_iommu *iommu = 0;
3475 int index = 0; 3468 int index = 0;
3476#endif
3477 3469
3478 irq_want = nr_irqs_gsi; 3470 irq_want = nr_irqs_gsi;
3479 sub_handle = 0; 3471 sub_handle = 0;
@@ -3482,7 +3474,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3482 if (irq == 0) 3474 if (irq == 0)
3483 return -1; 3475 return -1;
3484 irq_want = irq + 1; 3476 irq_want = irq + 1;
3485#ifdef CONFIG_INTR_REMAP
3486 if (!intr_remapping_enabled) 3477 if (!intr_remapping_enabled)
3487 goto no_ir; 3478 goto no_ir;
3488 3479
@@ -3510,7 +3501,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3510 set_irte_irq(irq, iommu, index, sub_handle); 3501 set_irte_irq(irq, iommu, index, sub_handle);
3511 } 3502 }
3512no_ir: 3503no_ir:
3513#endif
3514 ret = setup_msi_irq(dev, msidesc, irq); 3504 ret = setup_msi_irq(dev, msidesc, irq);
3515 if (ret < 0) 3505 if (ret < 0)
3516 goto error; 3506 goto error;
@@ -3528,7 +3518,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3528 destroy_irq(irq); 3518 destroy_irq(irq);
3529} 3519}
3530 3520
3531#ifdef CONFIG_DMAR 3521#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3532#ifdef CONFIG_SMP 3522#ifdef CONFIG_SMP
3533static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3523static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3534{ 3524{
@@ -4045,11 +4035,9 @@ void __init setup_ioapic_dest(void)
4045 else 4035 else
4046 mask = apic->target_cpus(); 4036 mask = apic->target_cpus();
4047 4037
4048#ifdef CONFIG_INTR_REMAP
4049 if (intr_remapping_enabled) 4038 if (intr_remapping_enabled)
4050 set_ir_ioapic_affinity_irq_desc(desc, mask); 4039 set_ir_ioapic_affinity_irq_desc(desc, mask);
4051 else 4040 else
4052#endif
4053 set_ioapic_affinity_irq_desc(desc, mask); 4041 set_ioapic_affinity_irq_desc(desc, mask);
4054 } 4042 }
4055 4043
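
Note: in the remapped case the IO-APIC RTE's vector field now carries the pin number (a "virtual vector") while the real vector lives in the IRTE, so the CPU's vector-matched EOI broadcast can no longer retire the RTE; instead the irq handler EOIs each pin explicitly through the IO-APIC EOI register (the new eoi field, which the widened struct io_apic places at register offset 0x40). A minimal sketch of the resulting level-irq ack sequence, mirroring ack_x2apic_level() above (the function name is illustrative):

	static void ack_remapped_level_irq(unsigned int irq)
	{
		struct irq_desc *desc = irq_to_desc(irq);

		ack_x2APIC_irq();	/* EOI the local APIC */
		eoi_ioapic_irq(desc);	/* directed EOI: io_apic_eoi() writes the
					 * pin number to each IO-APIC's EOI reg */
	}
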
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8d7748efe6a8..1783652bb0e5 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -68,6 +68,13 @@ void __init default_setup_apic_routing(void)
68 apic = &apic_physflat; 68 apic = &apic_physflat;
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
70 } 70 }
71
72 /*
73 * Now that apic routing model is selected, configure the
74 * fault handling for intr remapping.
75 */
76 if (intr_remapping_enabled)
77 enable_drhd_fault_handling();
71} 78}
72 79
73/* Same for both flat and physical. */ 80/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8fb87b6dd633..4a903e2f0d17 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57 unsigned long query_cpu; 57 unsigned long query_cpu;
58 unsigned long flags; 58 unsigned long flags;
59 59
60 x2apic_wrmsr_fence();
61
60 local_irq_save(flags); 62 local_irq_save(flags);
61 for_each_cpu(query_cpu, mask) { 63 for_each_cpu(query_cpu, mask) {
62 __x2apic_send_IPI_dest( 64 __x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu == this_cpu) 82 if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
90 unsigned long query_cpu; 94 unsigned long query_cpu;
91 unsigned long flags; 95 unsigned long flags;
92 96
97 x2apic_wrmsr_fence();
98
93 local_irq_save(flags); 99 local_irq_save(flags);
94 for_each_online_cpu(query_cpu) { 100 for_each_online_cpu(query_cpu) {
95 if (query_cpu == this_cpu) 101 if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 23625b9f98b2..a284359627e7 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags; 59 unsigned long flags;
60 60
61 x2apic_wrmsr_fence();
62
61 local_irq_save(flags); 63 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask) {
63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 65 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu) 82 if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
89 unsigned long query_cpu; 93 unsigned long query_cpu;
90 unsigned long flags; 94 unsigned long flags;
91 95
96 x2apic_wrmsr_fence();
97
92 local_irq_save(flags); 98 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) { 99 for_each_online_cpu(query_cpu) {
94 if (query_cpu == this_cpu) 100 if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index b617b1164f1e..fc999e6fc46a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -86,12 +86,12 @@ void __init setup_bios_corruption_check(void)
86 if (!(addr + 1)) 86 if (!(addr + 1))
87 break; 87 break;
88 88
89 if (addr >= corruption_check_size)
90 break;
91
89 if ((addr + size) > corruption_check_size) 92 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr; 93 size = corruption_check_size - addr;
91 94
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr; 96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 97 scan_areas[num_scan_areas].size = size;
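
Note: the reordered checks above close an unsigned-underflow hole. Previously, a scan region starting beyond corruption_check_size made the clamp "corruption_check_size - addr" wrap to a huge value, sailing past the size == 0 break. The fixed clamping logic in isolation (names as in the hunk above):

	if (addr >= corruption_check_size)
		break;					/* region starts past the limit */
	if ((addr + size) > corruption_check_size)
		size = corruption_check_size - addr;	/* safe: addr < limit here */
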
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d4356f8b7522..4e242f9a06e4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,8 +19,7 @@ obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
20obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
22obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 22obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
24obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
25obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
26 25
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 983e0830f0da..c95e831bb095 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
1#include <linux/bitops.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/init.h> 3#include <linux/init.h>
3#include <linux/bitops.h>
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/msr.h>
7#include <asm/e820.h> 6#include <asm/e820.h>
8#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "cpu.h" 10#include "cpu.h"
11 11
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
276 */ 276 */
277 c->x86_capability[5] = cpuid_edx(0xC0000001); 277 c->x86_capability[5] = cpuid_edx(0xC0000001);
278 } 278 }
279 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
288 /* Before Nehemiah, the C3's had 3dNOW! */ 288 /* Before Nehemiah, the C3's had 3dNOW! */
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291#endif
292 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
293 c->x86_cache_alignment = c->x86_clflush_size * 2;
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 }
291 296
292 display_cacheinfo(c); 297 display_cacheinfo(c);
293} 298}
@@ -316,16 +321,25 @@ enum {
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) 321static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{ 322{
318 switch (c->x86) { 323 switch (c->x86) {
324#ifdef CONFIG_X86_32
319 case 5: 325 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */ 326 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); 327 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break; 328 break;
329#endif
330 case 6:
331 if (c->x86_model >= 0xf)
332 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
333 break;
323 } 334 }
335#ifdef CONFIG_X86_64
336 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
337#endif
324} 338}
325 339
326static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 340static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
327{ 341{
328 342#ifdef CONFIG_X86_32
329 char *name; 343 char *name;
330 u32 fcr_set = 0; 344 u32 fcr_set = 0;
331 u32 fcr_clr = 0; 345 u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
337 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 351 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
338 */ 352 */
339 clear_cpu_cap(c, 0*32+31); 353 clear_cpu_cap(c, 0*32+31);
340 354#endif
355 early_init_centaur(c);
341 switch (c->x86) { 356 switch (c->x86) {
357#ifdef CONFIG_X86_32
342 case 5: 358 case 5:
343 switch (c->x86_model) { 359 switch (c->x86_model) {
344 case 4: 360 case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
442 } 458 }
443 sprintf(c->x86_model_id, "WinChip %s", name); 459 sprintf(c->x86_model_id, "WinChip %s", name);
444 break; 460 break;
445 461#endif
446 case 6: 462 case 6:
447 init_c3(c); 463 init_c3(c);
448 break; 464 break;
449 } 465 }
466#ifdef CONFIG_X86_64
467 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
468#endif
450} 469}
451 470
452static unsigned int __cpuinit 471static unsigned int __cpuinit
453centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) 472centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
454{ 473{
474#ifdef CONFIG_X86_32
455 /* VIA C3 CPUs (670-68F) need further shifting. */ 475 /* VIA C3 CPUs (670-68F) need further shifting. */
456 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) 476 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
457 size >>= 8; 477 size >>= 8;
@@ -464,7 +484,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
464 if ((c->x86 == 6) && (c->x86_model == 9) && 484 if ((c->x86 == 6) && (c->x86_model == 9) &&
465 (c->x86_mask == 1) && (size == 65)) 485 (c->x86_mask == 1) && (size == 65))
466 size -= 1; 486 size -= 1;
467 487#endif
468 return size; 488 return size;
469} 489}
470 490
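The two Centaur files could be merged because everything WinChip- and C3-FCR-specific already made sense only on 32-bit, so the files differed mainly in which capability bits they set. A reduced sketch of the resulting single-file pattern; the struct is a stand-in for the kernel's cpuinfo_x86 and the CONFIG macro is defined arbitrarily so the sketch compiles on its own:

#include <stdio.h>

#define CONFIG_X86_64 1    /* pretend we are building the 64-bit kernel */

struct cpuinfo_x86 {
    unsigned x86, x86_model;
    unsigned x86_clflush_size, x86_cache_alignment;
};

static void init_centaur_sketch(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
    /* WinChip (family 5) and C3 FCR quirks compile only here. */
#endif
    /* Family 6, model >= 0xf handling is shared by both builds. */
    if (c->x86 == 6 && c->x86_model >= 0xf)
        c->x86_cache_alignment = c->x86_clflush_size * 2;
#ifdef CONFIG_X86_64
    /* 64-bit-only capability bits (SYSENTER32, LFENCE_RDTSC) go here. */
#endif
}

int main(void)
{
    struct cpuinfo_x86 c = { .x86 = 6, .x86_model = 0xf, .x86_clflush_size = 64 };

    init_centaur_sketch(&c);
    printf("cache alignment: %u\n", c.x86_cache_alignment);
    return 0;
}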
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index 51b09c48c9c7..000000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static const struct cpu_dev centaur_cpu_dev __cpuinitconst = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 21c0cf8ced18..46e29ab96c6a 100755
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -64,6 +64,8 @@ static struct cpu_debug_base cpu_base[] = {
64 { "vmx", CPU_VMX, 0 }, 64 { "vmx", CPU_VMX, 0 },
65 { "call", CPU_CALL, 0 }, 65 { "call", CPU_CALL, 0 },
66 { "base", CPU_BASE, 0 }, 66 { "base", CPU_BASE, 0 },
67 { "ver", CPU_VER, 0 },
68 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 }, 69 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 }, 70 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 }, 71 { "osvm", CPU_OSVM, 0 },
@@ -177,54 +179,59 @@ static struct cpu_debug_range cpu_intel_range[] = {
177 179
178/* AMD Registers Range */ 180/* AMD Registers Range */
179static struct cpu_debug_range cpu_amd_range[] = { 181static struct cpu_debug_range cpu_amd_range[] = {
180 { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, }, 182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
181 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, }, 183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
182 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, }, 184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
183 185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
184 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, }, 186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
185 { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, }, 187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
186 { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, }, 188
187 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, }, 189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
188 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, }, 190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
189 191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
190 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, }, 192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
191 { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, }, 193
192 { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, }, 194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
193 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, }, 195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
194 { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, }, 196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, }, 197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
196 198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
197 { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, }, 199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
198 200
199 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, }, 201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
200 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, }, 202
201 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, }, 203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
202 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, }, 204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
203 205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
204 { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, }, 206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
205 207
206 { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, }, 208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
207 { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, }, 209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
208 { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, }, 210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
209 { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, }, 211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
210 { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, }, 212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
211 { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, }, 213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
212 { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, }, 214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
213 { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, }, 215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
214 { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, }, 216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
215 { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, }, 217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
216 { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, }, 218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
217 { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, }, 219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
218 { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, }, 220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
219}; 226};
220 227
221 228
222static int get_cpu_modelflag(unsigned cpu) 229/* Intel */
230static int get_intel_modelflag(unsigned model)
223{ 231{
224 int flag; 232 int flag;
225 233
226 switch (per_cpu(cpu_model, cpu)) { 234 switch (model) {
227 /* Intel */
228 case 0x0501: 235 case 0x0501:
229 case 0x0502: 236 case 0x0502:
230 case 0x0504: 237 case 0x0504:
@@ -271,6 +278,59 @@ static int get_cpu_modelflag(unsigned cpu)
271 return flag; 278 return flag;
272} 279}
273 280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
274static int get_cpu_range_count(unsigned cpu) 334static int get_cpu_range_count(unsigned cpu)
275{ 335{
276 int index; 336 int index;
@@ -311,7 +371,8 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
311 return 1; 371 return 1;
312 break; 372 break;
313 case X86_VENDOR_AMD: 373 case X86_VENDOR_AMD:
314 if (cpu_amd_range[i].flag & flag) 374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
315 return 1; 376 return 1;
316 break; 377 break;
317 } 378 }
@@ -337,7 +398,8 @@ static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
337 } 398 }
338 break; 399 break;
339 case X86_VENDOR_AMD: 400 case X86_VENDOR_AMD:
340 if (cpu_amd_range[index].flag & flag) { 401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
341 *min = cpu_amd_range[index].min; 403 *min = cpu_amd_range[index].min;
342 *max = cpu_amd_range[index].max; 404 *max = cpu_amd_range[index].max;
343 } 405 }
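The reworked dispatch assumes per_cpu(cpu_model, cpu) packs the vendor into bits 16 and up and the family/model into the low 16 bits, which is why get_amd_modelflag() switches on model >> 8. A standalone decoder under that assumed layout; the vendor constants here are illustrative stand-ins for the kernel's X86_VENDOR_* values:

#include <stdio.h>

enum { VENDOR_INTEL = 0, VENDOR_AMD = 2 };   /* illustrative only */

static const char *amd_modelflag(unsigned model)
{
    switch (model >> 8) {            /* family lives in bits 8-15 */
    case 0x6:  return "K6";
    case 0x7:  return "K7";
    case 0x8:  return "K8";
    case 0xf:  return "K8 (0Fh)";
    case 0x10: return "K10";
    case 0x11: return "Fam11h";
    default:   return "none";
    }
}

int main(void)
{
    unsigned packed = (VENDOR_AMD << 16) | (0x10 << 8) | 0x02;

    switch (packed >> 16) {          /* vendor lives in bits 16+ */
    case VENDOR_AMD:
        printf("AMD model flag: %s\n", amd_modelflag(packed & 0xffff));
        break;
    default:
        printf("unhandled vendor\n");
        break;
    }
    return 0;
}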
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index c5a32f92d07e..7d01be868870 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -92,7 +92,8 @@ struct thresh_restart {
92}; 92};
93 93
 94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
 95static long threshold_restart_bank(void *_tr) 95/* Called via smp_call_function_single() */
 96static void threshold_restart_bank(void *_tr)
96{ 97{
97 struct thresh_restart *tr = _tr; 98 struct thresh_restart *tr = _tr;
98 u32 mci_misc_hi, mci_misc_lo; 99 u32 mci_misc_hi, mci_misc_lo;
@@ -119,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
119 120
120 mci_misc_hi |= MASK_COUNT_EN_HI; 121 mci_misc_hi |= MASK_COUNT_EN_HI;
121 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 122 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
122 return 0;
123} 123}
124 124
125/* cpu init entry point, called from mce.c with preempt off */ 125/* cpu init entry point, called from mce.c with preempt off */
@@ -279,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
279 tr.b = b; 279 tr.b = b;
280 tr.reset = 0; 280 tr.reset = 0;
281 tr.old_limit = 0; 281 tr.old_limit = 0;
282 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 283
284 return end - buf; 284 return end - buf;
285} 285}
@@ -301,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
301 tr.b = b; 301 tr.b = b;
302 tr.reset = 0; 302 tr.reset = 0;
303 303
304 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 305
306 return end - buf; 306 return end - buf;
307} 307}
308 308
309static long local_error_count(void *_b) 309struct threshold_block_cross_cpu {
310 struct threshold_block *tb;
311 long retval;
312};
313
314static void local_error_count_handler(void *_tbcc)
310{ 315{
311 struct threshold_block *b = _b; 316 struct threshold_block_cross_cpu *tbcc = _tbcc;
317 struct threshold_block *b = tbcc->tb;
312 u32 low, high; 318 u32 low, high;
313 319
314 rdmsr(b->address, low, high); 320 rdmsr(b->address, low, high);
315 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); 321 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
316} 322}
317 323
318static ssize_t show_error_count(struct threshold_block *b, char *buf) 324static ssize_t show_error_count(struct threshold_block *b, char *buf)
319{ 325{
320 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); 326 struct threshold_block_cross_cpu tbcc = { .tb = b, };
327
328 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
329 return sprintf(buf, "%lx\n", tbcc.retval);
321} 330}
322 331
323static ssize_t store_error_count(struct threshold_block *b, 332static ssize_t store_error_count(struct threshold_block *b,
@@ -325,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
325{ 334{
326 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; 335 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
327 336
328 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 337 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
329 return 1; 338 return 1;
330} 339}
331 340
@@ -394,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
394 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
395 return 0; 404 return 0;
396 405
397 if (rdmsr_safe(address, &low, &high)) 406 if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
398 return 0; 407 return 0;
399 408
400 if (!(high & MASK_VALID_HI)) { 409 if (!(high & MASK_VALID_HI)) {
@@ -458,12 +467,11 @@ out_free:
458 return err; 467 return err;
459} 468}
460 469
461static __cpuinit long local_allocate_threshold_blocks(void *_bank) 470static __cpuinit long
471local_allocate_threshold_blocks(int cpu, unsigned int bank)
462{ 472{
463 unsigned int *bank = _bank; 473 return allocate_threshold_blocks(cpu, bank, 0,
464 474 MSR_IA32_MC0_MISC + bank * 4);
465 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
466 MSR_IA32_MC0_MISC + *bank * 4);
467} 475}
468 476
469/* symlinks sibling shared banks to first core. first core owns dir/files. */ 477/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -526,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
526 534
527 per_cpu(threshold_banks, cpu)[bank] = b; 535 per_cpu(threshold_banks, cpu)[bank] = b;
528 536
529 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); 537 err = local_allocate_threshold_blocks(cpu, bank);
530 if (err) 538 if (err)
531 goto out_free; 539 goto out_free;
532 540
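The switch from work_on_cpu() to smp_call_function_single() changes the handler contract: the handler now returns void, so show_error_count() threads its result back through a caller-owned struct, exactly as threshold_block_cross_cpu does above. The idiom in isolation, with a synchronous stand-in for the cross-CPU call:

#include <stdio.h>

struct error_count_request {
    unsigned msr_high;    /* input: pretend MSR contents */
    long retval;          /* output, filled in by the handler */
};

/* Handler signature matches smp_call_function_single(): void (*)(void *). */
static void error_count_handler(void *arg)
{
    struct error_count_request *req = arg;

    req->retval = req->msr_high & 0xFFF;    /* extract the counter field */
}

/* Stand-in for smp_call_function_single(cpu, fn, arg, wait). */
static void run_on_cpu(int cpu, void (*fn)(void *), void *arg)
{
    (void)cpu;
    fn(arg);    /* synchronous here; the real call IPIs the target CPU */
}

int main(void)
{
    struct error_count_request req = { .msr_high = 0x1234 };

    run_on_cpu(0, error_count_handler, &req);
    printf("error count: %ld\n", req.retval);
    return 0;
}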
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aaa7d9730938..57df3d383470 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -270,7 +270,7 @@ void cmci_reenable(void)
270 cmci_discover(banks, 0); 270 cmci_discover(banks, 0);
271} 271}
272 272
273static __cpuinit void intel_init_cmci(void) 273static void intel_init_cmci(void)
274{ 274{
275 int banks; 275 int banks;
276 276
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc0533649..f4361b56f8e9 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 000000000000..ce0fe4b5c04f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
1/* MTRR (Memory Type Range Register) cleanup
2
3 Copyright (C) 2009 Yinghai Lu
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/smp.h>
24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h>
27
28#include <asm/e820.h>
29#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35
36/* should be related to the number of MTRR_VAR_RANGES */
37#define RANGE_NUM 256
38
39struct res_range {
40 unsigned long start;
41 unsigned long end;
42};
43
44static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start,
46 unsigned long end)
47{
48 /* out of slots */
49 if (nr_range >= RANGE_NUM)
50 return nr_range;
51
52 range[nr_range].start = start;
53 range[nr_range].end = end;
54
55 nr_range++;
56
57 return nr_range;
58}
59
60static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
62 unsigned long end)
63{
64 int i;
65
 66 /* try to merge it with an existing one */
67 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end;
69 unsigned long common_start, common_end;
70
71 if (!range[i].end)
72 continue;
73
74 common_start = max(range[i].start, start);
75 common_end = min(range[i].end, end);
76 if (common_start > common_end + 1)
77 continue;
78
79 final_start = min(range[i].start, start);
80 final_end = max(range[i].end, end);
81
82 range[i].start = final_start;
83 range[i].end = final_end;
84 return nr_range;
85 }
86
87 /* need to add that */
88 return add_range(range, nr_range, start, end);
89}
90
91static void __init
92subtract_range(struct res_range *range, unsigned long start, unsigned long end)
93{
94 int i, j;
95
96 for (j = 0; j < RANGE_NUM; j++) {
97 if (!range[j].end)
98 continue;
99
100 if (start <= range[j].start && end >= range[j].end) {
101 range[j].start = 0;
102 range[j].end = 0;
103 continue;
104 }
105
106 if (start <= range[j].start && end < range[j].end &&
107 range[j].start < end + 1) {
108 range[j].start = end + 1;
109 continue;
110 }
111
112
113 if (start > range[j].start && end >= range[j].end &&
114 range[j].end > start - 1) {
115 range[j].end = start - 1;
116 continue;
117 }
118
119 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */
121 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0)
123 break;
124 }
125 if (i < RANGE_NUM) {
126 range[i].end = range[j].end;
127 range[i].start = end + 1;
128 } else {
 129 printk(KERN_ERR "out of slots in ranges\n");
130 }
131 range[j].end = start - 1;
132 continue;
133 }
134 }
135}
136
137static int __init cmp_range(const void *x1, const void *x2)
138{
139 const struct res_range *r1 = x1;
140 const struct res_range *r2 = x2;
141 long start1, start2;
142
143 start1 = r1->start;
144 start2 = r2->start;
145
146 return start1 - start2;
147}
148
149struct var_mtrr_range_state {
150 unsigned long base_pfn;
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157
158static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
160 unsigned long extra_remove_base,
161 unsigned long extra_remove_size)
162{
163 unsigned long base, size;
164 mtrr_type type;
165 int i;
166
167 for (i = 0; i < num_var_ranges; i++) {
168 type = range_state[i].type;
169 if (type != MTRR_TYPE_WRBACK)
170 continue;
171 base = range_state[i].base_pfn;
172 size = range_state[i].size_pfn;
173 nr_range = add_range_with_merge(range, nr_range, base,
174 base + size - 1);
175 }
176 if (debug_print) {
177 printk(KERN_DEBUG "After WB checking\n");
178 for (i = 0; i < nr_range; i++)
179 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
180 range[i].start, range[i].end + 1);
181 }
182
183 /* take out UC ranges */
184 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE &&
187 type != MTRR_TYPE_WRPROT)
188 continue;
189 size = range_state[i].size_pfn;
190 if (!size)
191 continue;
192 base = range_state[i].base_pfn;
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base;
202 base = 1<<(20-PAGE_SHIFT);
203 }
204 subtract_range(range, base, base + size - 1);
205 }
206 if (extra_remove_size)
207 subtract_range(range, extra_remove_base,
208 extra_remove_base + extra_remove_size - 1);
209
210 /* get new range num */
211 nr_range = 0;
212 for (i = 0; i < RANGE_NUM; i++) {
213 if (!range[i].end)
214 continue;
215 nr_range++;
216 }
217 if (debug_print) {
218 printk(KERN_DEBUG "After UC checking\n");
219 for (i = 0; i < nr_range; i++)
220 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
221 range[i].start, range[i].end + 1);
222 }
223
224 /* sort the ranges */
225 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
226 if (debug_print) {
227 printk(KERN_DEBUG "After sorting\n");
228 for (i = 0; i < nr_range; i++)
229 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
230 range[i].start, range[i].end + 1);
231 }
232
 233 /* clear entries that are not used */
234 for (i = nr_range; i < RANGE_NUM; i++)
235 memset(&range[i], 0, sizeof(range[i]));
236
237 return nr_range;
238}
239
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER
244
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{
247 unsigned long sum;
248 int i;
249
250 sum = 0;
251 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start;
253
254 return sum;
255}
256
257static int enable_mtrr_cleanup __initdata =
258 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
259
260static int __init disable_mtrr_cleanup_setup(char *str)
261{
262 enable_mtrr_cleanup = 0;
263 return 0;
264}
265early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
266
267static int __init enable_mtrr_cleanup_setup(char *str)
268{
269 enable_mtrr_cleanup = 1;
270 return 0;
271}
272early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
273
274static int __init mtrr_cleanup_debug_setup(char *str)
275{
276 debug_print = 1;
277 return 0;
278}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits)
292{
293 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask;
295
296 if (!sizek) {
297 fill_mtrr_var_range(reg, 0, 0, 0, 0);
298 return;
299 }
300
301 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1);
303
304 base = ((u64)basek) << 10;
305
306 base |= type;
307 mask |= 0x800;
308
309 base_lo = base & ((1ULL<<32) - 1);
310 base_hi = base >> 32;
311
312 mask_lo = mask & ((1ULL<<32) - 1);
313 mask_hi = mask >> 32;
314
315 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
316}
317
318static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type)
321{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type;
325}
326
327static void __init
328set_var_mtrr_all(unsigned int address_bits)
329{
330 unsigned long basek, sizek;
331 unsigned char type;
332 unsigned int reg;
333
334 for (reg = 0; reg < num_var_ranges; reg++) {
335 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
336 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
337 type = range_state[reg].type;
338
339 set_var_mtrr(reg, basek, sizek, type, address_bits);
340 }
341}
342
343static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{
345 char factor;
346 unsigned long base = sizek;
347
348 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */
350 factor = 'K';
351 } else if (base & ((1<<20) - 1)) {
352 factor = 'M';
353 base >>= 10;
354 } else {
355 factor = 'G';
356 base >>= 20;
357 }
358
359 *factorp = factor;
360
361 return base;
362}
363
364static unsigned int __init
365range_to_mtrr(unsigned int reg, unsigned long range_startk,
366 unsigned long range_sizek, unsigned char type)
367{
368 if (!range_sizek || (reg >= num_var_ranges))
369 return reg;
370
371 while (range_sizek) {
372 unsigned long max_align, align;
373 unsigned long sizek;
374
375 /* Compute the maximum size I can make a range */
376 if (range_startk)
377 max_align = ffs(range_startk) - 1;
378 else
379 max_align = 32;
380 align = fls(range_sizek) - 1;
381 if (align > max_align)
382 align = max_align;
383
384 sizek = 1 << align;
385 if (debug_print) {
386 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base;
388
389 start_base = to_size_factor(range_startk,
390 &start_factor),
391 size_base = to_size_factor(sizek, &size_factor),
392
393 printk(KERN_DEBUG "Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor,
396 size_base, size_factor,
397 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
398 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
399 );
400 }
401 save_var_mtrr(reg++, range_startk, sizek, type);
402 range_startk += sizek;
403 range_sizek -= sizek;
404 if (reg >= num_var_ranges)
405 break;
406 }
407 return reg;
408}
409
410static unsigned __init
411range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
412 unsigned long sizek)
413{
414 unsigned long hole_basek, hole_sizek;
415 unsigned long second_basek, second_sizek;
416 unsigned long range0_basek, range0_sizek;
417 unsigned long range_basek, range_sizek;
418 unsigned long chunk_sizek;
419 unsigned long gran_sizek;
420
421 hole_basek = 0;
422 hole_sizek = 0;
423 second_basek = 0;
424 second_sizek = 0;
425 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek;
427
 428 /* align to gran size; prevent small blocks from using up MTRRs */
429 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek)
431 return second_sizek;
432 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434
435 while (range_sizek > state->range_sizek) {
436 range_sizek -= gran_sizek;
437 if (!range_sizek)
438 return 0;
439 }
440 state->range_sizek = range_sizek;
441
442 /* try to append some small hole */
443 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445
446 /* no increase */
447 if (range0_sizek == state->range_sizek) {
448 if (debug_print)
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
450 range0_basek<<10,
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0;
455 }
456
 457 /* only cut back when it is not the last */
458 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek)
461 range0_sizek -= chunk_sizek;
462 else
463 range0_sizek = 0;
464
465 if (!range0_sizek)
466 break;
467 }
468 }
469
470second_try:
471 range_basek = range0_basek + range0_sizek;
472
473 /* one hole in the middle */
474 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek;
476
477 if (range0_sizek > state->range_sizek) {
478
479 /* one hole in middle or at end */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481
482 /* hole size should be less than half of range0 size */
483 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek;
486 second_sizek = 0;
487 hole_sizek = 0;
488
489 goto second_try;
490 }
491 }
492
493 if (range0_sizek) {
494 if (debug_print)
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
496 range0_basek<<10,
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK);
500 }
501
502 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */
504 range_sizek = state->range_sizek - range0_sizek;
505
506 if (debug_print)
507 printk(KERN_DEBUG "range: %016lx - %016lx\n",
508 range_basek<<10,
509 (range_basek + range_sizek)<<10);
510 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK);
512 }
513
514 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print)
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
518 hole_basek<<10,
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 }
523
524 return second_sizek;
525}
526
527static void __init
528set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
529 unsigned long size_pfn)
530{
531 unsigned long basek, sizek;
532 unsigned long second_sizek = 0;
533
534 if (state->reg >= num_var_ranges)
535 return;
536
537 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10);
539
540 /* See if I can merge with the last range */
541 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk;
545 return;
546 }
547 /* Write the range mtrrs */
548 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550
551 /* Allocate an msr */
552 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek;
554}
555
556/* minimum size of mtrr block that can take a hole */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558
559static int __init parse_mtrr_chunk_size_opt(char *p)
560{
561 if (!p)
562 return -EINVAL;
563 mtrr_chunk_size = memparse(p, &p);
564 return 0;
565}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567
568/* granularity of mtrr block */
569static u64 mtrr_gran_size __initdata;
570
571static int __init parse_mtrr_gran_size_opt(char *p)
572{
573 if (!p)
574 return -EINVAL;
575 mtrr_gran_size = memparse(p, &p);
576 return 0;
577}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579
580static int nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582
583static int __init parse_mtrr_spare_reg(char *arg)
584{
585 if (arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0;
588}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591
592static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size)
595{
596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg;
599
600 var_state.range_startk = 0;
601 var_state.range_sizek = 0;
602 var_state.reg = 0;
603 var_state.chunk_sizek = chunk_size >> 10;
604 var_state.gran_sizek = gran_size >> 10;
605
606 memset(range_state, 0, sizeof(range_state));
607
608 /* Write the range etc */
609 for (i = 0; i < nr_range; i++)
610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1);
612
613 /* Write the last range */
614 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0);
616
617 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */
619 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++;
622 }
623
624 return num_reg;
625}
626
627struct mtrr_cleanup_result {
628 unsigned long gran_sizek;
629 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek;
631 unsigned int num_reg;
632 int bad;
633};
634
635/*
636 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
637 * chunk size: gran_size, ..., 2G
638 * so we need (1+16)*8
639 */
640#define NUM_RESULT 136
641#define PSHIFT (PAGE_SHIFT - 10)
642
643static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
644static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645
646static void __init print_out_mtrr_range_state(void)
647{
648 int i;
649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base;
651 mtrr_type type;
652
653 for (i = 0; i < num_var_ranges; i++) {
654
655 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
656 if (!size_base)
657 continue;
658
659 size_base = to_size_factor(size_base, &size_factor),
660 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
661 start_base = to_size_factor(start_base, &start_factor),
662 type = range_state[i].type;
663
664 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
665 i, start_base, start_factor,
666 size_base, size_factor,
667 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
668 ((type == MTRR_TYPE_WRPROT) ? "WP" :
669 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
670 );
671 }
672}
673
674static int __init mtrr_need_cleanup(void)
675{
676 int i;
677 mtrr_type type;
678 unsigned long size;
679 /* extra one for all 0 */
680 int num[MTRR_NUM_TYPES + 1];
681
 682 /* check number of entries */
683 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type;
686 size = range_state[i].size_pfn;
687 if (type >= MTRR_NUM_TYPES)
688 continue;
689 if (!size)
690 type = MTRR_NUM_TYPES;
691 if (type == MTRR_TYPE_WRPROT)
692 type = MTRR_TYPE_UNCACHABLE;
693 num[type]++;
694 }
695
696 /* check if we got UC entries */
697 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0;
699
700 /* check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0;
704
705 return 1;
706}
707
708static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
710 unsigned long extra_remove_base,
711 unsigned long extra_remove_size,
712 int i)
713{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new;
718
719 /* convert ranges to var ranges state */
720 num_reg = x86_setup_var_mtrrs(range, nr_range,
721 chunk_size, gran_size);
722
723 /* we got new setting in range_state, check it */
724 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new);
728
729 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg;
732 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek =
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1;
736 } else
737 result[i].lose_cover_sizek =
738 (range_sums - range_sums_new) << PSHIFT;
739
740 /* double check it */
741 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range ||
743 memcmp(range, range_new, sizeof(range)))
744 result[i].bad = 1;
745 }
746
747 if (!result[i].bad && (range_sums - range_sums_new <
748 min_loss_pfn[num_reg])) {
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752}
753
754static void __init mtrr_print_out_one_result(int i)
755{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base;
758
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
763 result[i].bad ? "*BAD*" : " ",
764 gran_base, gran_factor, chunk_base, chunk_factor);
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
766 result[i].num_reg, result[i].bad ? "-" : "",
767 lose_base, lose_factor);
768}
769
770static int __init mtrr_search_optimal_index(void)
771{
772 int i;
773 int num_reg_good;
774 int index_good;
775
776 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1;
778 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i])
781 num_reg_good = i;
782 }
783
784 index_good = -1;
785 if (num_reg_good != -1) {
786 for (i = 0; i < NUM_RESULT; i++) {
787 if (!result[i].bad &&
788 result[i].num_reg == num_reg_good &&
789 !result[i].lose_cover_sizek) {
790 index_good = i;
791 break;
792 }
793 }
794 }
795
796 return index_good;
797}
798
799
800int __init mtrr_cleanup(unsigned address_bits)
801{
802 unsigned long extra_remove_base, extra_remove_size;
803 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size;
806 int index_good;
807 int i;
808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy);
812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0;
815
816 /* get it and store it aside */
817 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type);
820 range_state[i].base_pfn = base;
821 range_state[i].size_pfn = size;
822 range_state[i].type = type;
823 }
824
 825 /* check if we need to handle it and can handle it */
826 if (!mtrr_need_cleanup())
827 return 0;
828
 829 /* print original var MTRRs first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state();
832
833 memset(range, 0, sizeof(range));
834 extra_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2)
837 extra_remove_size =
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
840 extra_remove_size);
841 /*
 842 * [0, 1M) should always be covered by var mtrr with WB
 843 * and fixed mtrrs should take effect before var mtrr for it
844 */
845 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849
850 range_sums = sum_ranges(range, nr_range);
 851 printk(KERN_INFO "total RAM covered: %ldM\n",
852 range_sums >> (20 - PAGE_SHIFT));
853
854 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i);
858
859 mtrr_print_out_one_result(i);
860
861 if (!result[i].bad) {
862 set_var_mtrr_all(address_bits);
863 printk(KERN_DEBUG "New variable MTRRs\n");
864 print_out_mtrr_range_state();
865 return 1;
866 }
867 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
868 "will find optimal one\n");
869 }
870
871 i = 0;
872 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
873 memset(result, 0, sizeof(result));
874 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
875
876 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
877 chunk_size <<= 1) {
878
879 if (i >= NUM_RESULT)
880 continue;
881
882 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i);
884 if (debug_print) {
885 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n");
887 }
888
889 i++;
890 }
891 }
892
893 /* try to find the optimal index */
894 index_good = mtrr_search_optimal_index();
895
896 if (index_good != -1) {
897 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
898 i = index_good;
899 mtrr_print_out_one_result(i);
900
901 /* convert ranges to var ranges state */
902 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek;
905 gran_size <<= 10;
906 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
907 set_var_mtrr_all(address_bits);
908 printk(KERN_DEBUG "New variable MTRRs\n");
909 print_out_mtrr_range_state();
910 return 1;
911 } else {
912 /* print out all */
913 for (i = 0; i < NUM_RESULT; i++)
914 mtrr_print_out_one_result(i);
915 }
916
 917 printk(KERN_INFO "mtrr_cleanup: cannot find optimal value\n");
918 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
919
920 return 0;
921}
922#else
923int __init mtrr_cleanup(unsigned address_bits)
924{
925 return 0;
926}
927#endif
928
929static int disable_mtrr_trim;
930
931static int __init disable_mtrr_trim_setup(char *str)
932{
933 disable_mtrr_trim = 1;
934 return 0;
935}
936early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
937
938/*
939 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
940 * for memory >4GB. Check for that here.
 941 * Note this won't check whether the MTRRs below 4GB (where the magic bit
 942 * doesn't apply) are wrong, but so far we don't know of any such case in the wild.
943 */
944#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22)
946
947int __init amd_special_default_mtrr(void)
948{
949 u32 l, h;
950
951 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
952 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0;
958 /*
959 * Memory between 4GB and top of mem is forced WB by this magic bit.
960 * Reserved before K8RevF, but should be zero there.
961 */
962 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
963 (Tom2Enabled | Tom2ForceMemTypeWB))
964 return 1;
965 return 0;
966}
967
968static u64 __init real_trim_memory(unsigned long start_pfn,
969 unsigned long limit_pfn)
970{
971 u64 trim_start, trim_size;
972 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT;
974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start;
977
978 return e820_update_range(trim_start, trim_size, E820_RAM,
979 E820_RESERVED);
980}
981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number
984 *
 985 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's
 990 * allocation pools, and warning the user with an obnoxious message.
991 */
992int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
993{
994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type;
996 u64 total_trim_size;
997
998 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1];
1000 /*
1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture:
1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy);
1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0;
1010
1011 /* get it and store it aside */
1012 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type);
1015 range_state[i].base_pfn = base;
1016 range_state[i].size_pfn = size;
1017 range_state[i].type = type;
1018 }
1019
1020 /* Find highest cached pfn */
1021 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK)
1024 continue;
1025 base = range_state[i].base_pfn;
1026 size = range_state[i].size_pfn;
1027 if (highest_pfn < base + size)
1028 highest_pfn = base + size;
1029 }
1030
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1032 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0;
1035 }
1036
 1037 /* check number of entries */
1038 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type;
1041 if (type >= MTRR_NUM_TYPES)
1042 continue;
1043 size = range_state[i].size_pfn;
1044 if (!size)
1045 type = MTRR_NUM_TYPES;
1046 num[type]++;
1047 }
1048
1049 /* no entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK])
1051 return 0;
1052
1053 /* check if we only had WB and UC */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0;
1057
1058 memset(range, 0, sizeof(range));
1059 nr_range = 0;
1060 if (mtrr_tom2) {
1061 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1062 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1063 if (highest_pfn < range[nr_range].end + 1)
1064 highest_pfn = range[nr_range].end + 1;
1065 nr_range++;
1066 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068
1069 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */
1074 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start);
1078 }
1079 /* check the top */
1080 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn);
1084
1085 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089
1090 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1);
1092
1093 printk(KERN_INFO "update e820 for mtrr\n");
1094 update_e820();
1095
1096 return 1;
1097 }
1098
1099 return 0;
1100}
1101
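The heart of the new file is range_to_mtrr(): it carves an arbitrary block into power-of-two chunks whose alignment never exceeds the chunk's start address, which is the only shape a variable MTRR can describe. The decomposition on its own, using compiler builtins for ffs/fls and assuming a 64-bit unsigned long:

#include <stdio.h>

int main(void)
{
    unsigned long startk = 3072;    /* 3 MB, deliberately misaligned */
    unsigned long sizek  = 5120;    /* 5 MB */

    while (sizek) {
        /* Largest alignment the current start allows... */
        unsigned max_align = startk ? __builtin_ffsl(startk) - 1 : 32;
        /* ...capped by the largest power of two fitting the size (fls - 1). */
        unsigned align = 63 - __builtin_clzl(sizek);
        if (align > max_align)
            align = max_align;

        unsigned long chunk = 1UL << align;
        printf("MTRR chunk: base %luK, size %luK\n", startk, chunk);
        startk += chunk;
        sizek  -= chunk;
    }
    return 0;
}

For the 3 MB + 5 MB example this emits a 1 MB chunk at base 3 MB, then a 4 MB chunk at base 4 MB; each chunk is both naturally aligned and a power of two in size, so each fits a single variable register.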
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95c..37f28fc7cf95 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
33struct mtrr_state_type mtrr_state = {}; 33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state); 34EXPORT_SYMBOL_GPL(mtrr_state);
35 35
36static int __initdata mtrr_show; 36/**
37static int __init mtrr_debug(char *opt) 37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
40 * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
 41 * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
42 * 0 for operation."
43 */
44static inline void k8_check_syscfg_dram_mod_en(void)
38{ 45{
39 mtrr_show = 1; 46 u32 lo, hi;
40 return 0; 47
48 if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
49 (boot_cpu_data.x86 >= 0x0f)))
50 return;
51
52 rdmsr(MSR_K8_SYSCFG, lo, hi);
53 if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
54 printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
55 " not cleared by BIOS, clearing this bit\n",
56 smp_processor_id());
57 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
58 mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
59 }
41} 60}
42early_param("mtrr.show", mtrr_debug);
43 61
44/* 62/*
45 * Returns the effective MTRR type for the region 63 * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
174 unsigned int *p = (unsigned int *) frs; 192 unsigned int *p = (unsigned int *) frs;
175 int i; 193 int i;
176 194
195 k8_check_syscfg_dram_mod_en();
196
177 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
178 198
179 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
188 get_fixed_ranges(mtrr_state.fixed_ranges); 208 get_fixed_ranges(mtrr_state.fixed_ranges);
189} 209}
190 210
191static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) 211static unsigned __initdata last_fixed_start;
212static unsigned __initdata last_fixed_end;
213static mtrr_type __initdata last_fixed_type;
214
215static void __init print_fixed_last(void)
216{
217 if (!last_fixed_end)
218 return;
219
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222
223 last_fixed_end = 0;
224}
225
226static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type)
228{
229 last_fixed_start = base;
230 last_fixed_end = end;
231 last_fixed_type = type;
232}
233
234static void __init print_fixed(unsigned base, unsigned step,
235 const mtrr_type *types)
192{ 236{
193 unsigned i; 237 unsigned i;
194 238
195 for (i = 0; i < 8; ++i, ++types, base += step) 239 for (i = 0; i < 8; ++i, ++types, base += step) {
196 printk(KERN_INFO "MTRR %05X-%05X %s\n", 240 if (last_fixed_end == 0) {
197 base, base + step - 1, mtrr_attrib_to_str(*types)); 241 update_fixed_last(base, base + step, *types);
242 continue;
243 }
244 if (last_fixed_end == base && last_fixed_type == *types) {
245 last_fixed_end = base + step;
246 continue;
247 }
248 /* new segments: gap or different type */
249 print_fixed_last();
250 update_fixed_last(base, base + step, *types);
251 }
198} 252}
199 253
200static void prepare_set(void); 254static void prepare_set(void);
201static void post_set(void); 255static void post_set(void);
202 256
257static void __init print_mtrr_state(void)
258{
259 unsigned int i;
260 int high_width;
261
262 printk(KERN_DEBUG "MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
272
273 /* tail */
274 print_fixed_last();
275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
282 i,
283 high_width,
284 mtrr_state.var_ranges[i].base_hi,
285 mtrr_state.var_ranges[i].base_lo >> 12,
286 high_width,
287 mtrr_state.var_ranges[i].mask_hi,
288 mtrr_state.var_ranges[i].mask_lo >> 12,
289 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
290 else
291 printk(KERN_DEBUG " %u disabled\n", i);
292 }
293 if (mtrr_tom2) {
294 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
295 mtrr_tom2, mtrr_tom2>>20);
296 }
297}
298
203/* Grab all of the MTRR state for this CPU into *state */ 299/* Grab all of the MTRR state for this CPU into *state */
204void __init get_mtrr_state(void) 300void __init get_mtrr_state(void)
205{ 301{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
231 mtrr_tom2 |= low; 327 mtrr_tom2 |= low;
232 mtrr_tom2 &= 0xffffff800000ULL; 328 mtrr_tom2 &= 0xffffff800000ULL;
233 } 329 }
234 if (mtrr_show) { 330
235 int high_width; 331 print_mtrr_state();
236 332
237 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
238 if (mtrr_state.have_fixed) {
239 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
240 mtrr_state.enabled & 1 ? "en" : "dis");
241 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
242 for (i = 0; i < 2; ++i)
243 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
244 for (i = 0; i < 8; ++i)
245 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
246 }
247 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
248 mtrr_state.enabled & 2 ? "en" : "dis");
249 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
250 for (i = 0; i < num_var_ranges; ++i) {
251 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
252 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
253 i,
254 high_width,
255 mtrr_state.var_ranges[i].base_hi,
256 mtrr_state.var_ranges[i].base_lo >> 12,
257 high_width,
258 mtrr_state.var_ranges[i].mask_hi,
259 mtrr_state.var_ranges[i].mask_lo >> 12,
260 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
261 else
262 printk(KERN_INFO "MTRR %u disabled\n", i);
263 }
264 if (mtrr_tom2) {
265 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
266 mtrr_tom2, mtrr_tom2>>20);
267 }
268 }
269 mtrr_state_set = 1; 333 mtrr_state_set = 1;
270 334
271 /* PAT setup for BP. We need to go through sync steps here */ 335 /* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
308} 372}
309 373
310/** 374/**
311 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
312 * see AMD publication no. 24593, chapter 3.2.1 for more information
313 */
314static inline void k8_enable_fixed_iorrs(void)
315{
316 unsigned lo, hi;
317
318 rdmsr(MSR_K8_SYSCFG, lo, hi);
319 mtrr_wrmsr(MSR_K8_SYSCFG, lo
320 | K8_MTRRFIXRANGE_DRAM_ENABLE
321 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
322}
323
324/**
325 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 375 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
 326 * @msr: MSR address of the MTRR which should be checked and updated 376 * @msr: MSR address of the MTRR which should be checked and updated
327 * @changed: pointer which indicates whether the MTRR needed to be changed 377 * @changed: pointer which indicates whether the MTRR needed to be changed
328 * @msrwords: pointer to the MSR values which the MSR should have 378 * @msrwords: pointer to the MSR values which the MSR should have
329 *
330 * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
331 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
332 */ 379 */
333static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) 380static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
334{ 381{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
337 rdmsr(msr, lo, hi); 384 rdmsr(msr, lo, hi);
338 385
339 if (lo != msrwords[0] || hi != msrwords[1]) { 386 if (lo != msrwords[0] || hi != msrwords[1]) {
340 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
341 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
342 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
343 k8_enable_fixed_iorrs();
344 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 387 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
345 *changed = true; 388 *changed = true;
346 } 389 }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
376{ 419{
377 unsigned int mask_lo, mask_hi, base_lo, base_hi; 420 unsigned int mask_lo, mask_hi, base_lo, base_hi;
378 unsigned int tmp, hi; 421 unsigned int tmp, hi;
422 int cpu;
423
424 /*
 425 * get_mtrr doesn't need to update mtrr_state; it can also be called
 426 * from any cpu, so print the result out directly.
427 */
428 cpu = get_cpu();
379 429
380 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 430 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
431
381 if ((mask_lo & 0x800) == 0) { 432 if ((mask_lo & 0x800) == 0) {
382 /* Invalid (i.e. free) range */ 433 /* Invalid (i.e. free) range */
383 *base = 0; 434 *base = 0;
384 *size = 0; 435 *size = 0;
385 *type = 0; 436 *type = 0;
386 return; 437 goto out_put_cpu;
387 } 438 }
388 439
389 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 440 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
390 441
391 /* Work out the shifted address mask. */ 442 /* Work out the shifted address mask: */
392 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; 443 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
393 mask_lo = size_or_mask | tmp; 444 mask_lo = size_or_mask | tmp;
394 /* Expand tmp with high bits to all 1s*/ 445
446 /* Expand tmp with high bits to all 1s: */
395 hi = fls(tmp); 447 hi = fls(tmp);
396 if (hi > 0) { 448 if (hi > 0) {
397 tmp |= ~((1<<(hi - 1)) - 1); 449 tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
402 } 454 }
403 } 455 }
404 456
405 /* This works correctly if size is a power of two, i.e. a 457 /*
406 contiguous range. */ 458 * This works correctly if size is a power of two, i.e. a
459 * contiguous range:
460 */
407 *size = -mask_lo; 461 *size = -mask_lo;
408 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; 462 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
409 *type = base_lo & 0xff; 463 *type = base_lo & 0xff;
464
465 printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
466 cpu, reg, *base, *size,
467 mtrr_attrib_to_str(*type & 0xff));
468out_put_cpu:
469 put_cpu();
410} 470}
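The mask arithmetic above recovers base and size from the raw MSR words. A standalone sketch of the same steps, with hypothetical register values (a 256MB write-back range at 1GB, 36 physical address bits); the fls()-based high-bit expansion and sanity check are left out:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Hypothetical PhysBase/PhysMask pair: 256MB WB range at 1GB,
	 * 36 physical address bits, valid bit (0x800) set in the mask. */
	unsigned int base_lo = 0x40000006, base_hi = 0x0;
	unsigned int mask_lo = 0xf0000800, mask_hi = 0xf;
	unsigned int size_or_mask = ~((1u << (36 - PAGE_SHIFT)) - 1);

	/* Work out the shifted address mask, as above: */
	unsigned int tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
	unsigned int m = size_or_mask | tmp;

	/* -mask yields the size for a power-of-two (contiguous) range: */
	unsigned long size = (unsigned int)-m;
	unsigned long base = (unsigned long)base_hi << (32 - PAGE_SHIFT)
			   | base_lo >> PAGE_SHIFT;

	/* Expect base = 0x40000 pfn (1GB), size = 0x10000 pages (256MB). */
	printf("base=%#lx size=%#lx type=%#x\n", base, size, base_lo & 0xff);
	return 0;
}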
411 471
412/** 472/**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
419 bool changed = false; 479 bool changed = false;
420 int block=-1, range; 480 int block=-1, range;
421 481
482 k8_check_syscfg_dram_mod_en();
483
422 while (fixed_range_blocks[++block].ranges) 484 while (fixed_range_blocks[++block].ranges)
423 for (range=0; range < fixed_range_blocks[block].ranges; range++) 485 for (range=0; range < fixed_range_blocks[block].ranges; range++)
424 set_fixed_range(fixed_range_blocks[block].base_msr + range, 486 set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b8259..03cda01f57c7 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
582 582
583 for (i = 0; i < num_var_ranges; i++) { 583 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 584 mtrr_if->get(i,
585 &mtrr_state[i].lbase, 585 &mtrr_value[i].lbase,
586 &mtrr_state[i].lsize, 586 &mtrr_value[i].lsize,
587 &mtrr_state[i].ltype); 587 &mtrr_value[i].ltype);
588 } 588 }
589 return 0; 589 return 0;
590} 590}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
594 int i; 594 int i;
595 595
596 for (i = 0; i < num_var_ranges; i++) { 596 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_state[i].lsize) 597 if (mtrr_value[i].lsize)
598 set_mtrr(i, 598 set_mtrr(i,
599 mtrr_state[i].lbase, 599 mtrr_value[i].lbase,
600 mtrr_state[i].lsize, 600 mtrr_value[i].lsize,
601 mtrr_state[i].ltype); 601 mtrr_value[i].ltype);
602 } 602 }
603 return 0; 603 return 0;
604} 604}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
610 .resume = mtrr_restore, 610 .resume = mtrr_restore,
611}; 611};
612 612
613/* should be related to MTRR_VAR_RANGES nums */ 613int __initdata changed_by_mtrr_cleanup;
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "out of slots in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
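The four overlap cases above (swallow whole, trim the head, trim the tail, split in two) are easiest to see in a toy run. A self-contained sketch of the same case analysis on a simplified table (hypothetical values throughout); only the interior split needs a spare slot:

#include <stdio.h>
#include <string.h>

#define RANGE_NUM 8

struct res_range { unsigned long start, end; };

/* Same case analysis as subtract_range() above, on a tiny table. */
static void subtract(struct res_range *r, unsigned long start, unsigned long end)
{
	int i, j;

	for (j = 0; j < RANGE_NUM; j++) {
		if (!r[j].end)
			continue;
		if (start <= r[j].start && end >= r[j].end) {	   /* swallow */
			r[j].start = r[j].end = 0;
		} else if (start <= r[j].start && end < r[j].end &&
			   r[j].start < end + 1) {		   /* cut head */
			r[j].start = end + 1;
		} else if (start > r[j].start && end >= r[j].end &&
			   r[j].end > start - 1) {		   /* cut tail */
			r[j].end = start - 1;
		} else if (start > r[j].start && end < r[j].end) { /* split */
			for (i = 0; i < RANGE_NUM && r[i].end; i++)
				;
			if (i < RANGE_NUM) {			   /* new spare */
				r[i].start = end + 1;
				r[i].end = r[j].end;
			}
			r[j].end = start - 1;
		}
	}
}

int main(void)
{
	struct res_range r[RANGE_NUM];
	int i;

	memset(r, 0, sizeof(r));
	r[0].start = 0x000; r[0].end = 0xfff;

	subtract(r, 0x400, 0x7ff);	/* interior cut -> two pieces */
	for (i = 0; i < RANGE_NUM; i++)
		if (r[i].end)
			printf("[%03lx - %03lx]\n", r[i].start, r[i].end);
	return 0;			/* prints [000 - 3ff] and [800 - fff] */
}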
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
798 /* clear those that are not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
807
808#ifdef CONFIG_MTRR_SANITIZER
809
810static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
811{
812 unsigned long sum;
813 int i;
814
815 sum = 0;
816 for (i = 0; i < nr_range; i++)
817 sum += range[i].end + 1 - range[i].start;
818
819 return sum;
820}
821
822static int enable_mtrr_cleanup __initdata =
823 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
824
825static int __init disable_mtrr_cleanup_setup(char *str)
826{
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839static int __init mtrr_cleanup_debug_setup(char *str)
840{
841 debug_print = 1;
842 return 0;
843}
844early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
845
846struct var_mtrr_state {
847 unsigned long range_startk;
848 unsigned long range_sizek;
849 unsigned long chunk_sizek;
850 unsigned long gran_sizek;
851 unsigned int reg;
852};
853
854static void __init
855set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
856 unsigned char type, unsigned int address_bits)
857{
858 u32 base_lo, base_hi, mask_lo, mask_hi;
859 u64 base, mask;
860
861 if (!sizek) {
862 fill_mtrr_var_range(reg, 0, 0, 0, 0);
863 return;
864 }
865
866 mask = (1ULL << address_bits) - 1;
867 mask &= ~((((u64)sizek) << 10) - 1);
868
869 base = ((u64)basek) << 10;
870
871 base |= type;
872 mask |= 0x800;
873
874 base_lo = base & ((1ULL<<32) - 1);
875 base_hi = base >> 32;
876
877 mask_lo = mask & ((1ULL<<32) - 1);
878 mask_hi = mask >> 32;
879
880 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
881}
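set_var_mtrr() above encodes a range as a PhysBase/PhysMask pair: keep only the CPU's address bits, clear the low log2(size) bits, then set bit 11 (0x800) to mark the range valid. A worked sketch with hypothetical numbers (1GB write-back at 2GB, 36 address bits):

#include <stdio.h>

int main(void)
{
	/* Hypothetical range: 1GB write-back at 2GB, 36 address bits. */
	unsigned long long basek = 2ULL << 20;	/* 2GB in KB */
	unsigned long long sizek = 1ULL << 20;	/* 1GB in KB */
	unsigned char type = 6;			/* MTRR_TYPE_WRBACK */
	unsigned int address_bits = 36;
	unsigned long long base, mask;

	/* Same encoding as set_var_mtrr() above. */
	mask = (1ULL << address_bits) - 1;
	mask &= ~((sizek << 10) - 1);
	base = (basek << 10) | type;
	mask |= 0x800;				/* valid bit */

	/* Expect base_lo=0x80000006, mask_lo=0xc0000800, mask_hi=0xf. */
	printf("base_lo=%#llx base_hi=%#llx mask_lo=%#llx mask_hi=%#llx\n",
	       base & 0xffffffffULL, base >> 32,
	       mask & 0xffffffffULL, mask >> 32);
	return 0;
}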
882
883static void __init
884save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
885 unsigned char type)
886{
887 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
888 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
889 range_state[reg].type = type;
890}
891
892static void __init
893set_var_mtrr_all(unsigned int address_bits)
894{
895 unsigned long basek, sizek;
896 unsigned char type;
897 unsigned int reg;
898
899 for (reg = 0; reg < num_var_ranges; reg++) {
900 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
901 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
902 type = range_state[reg].type;
903
904 set_var_mtrr(reg, basek, sizek, type, address_bits);
905 }
906}
907
908static unsigned long to_size_factor(unsigned long sizek, char *factorp)
909{
910 char factor;
911 unsigned long base = sizek;
912
913 if (base & ((1<<10) - 1)) {
914 /* not MB alignment */
915 factor = 'K';
916 } else if (base & ((1<<20) - 1)){
917 factor = 'M';
918 base >>= 10;
919 } else {
920 factor = 'G';
921 base >>= 20;
922 }
923
924 *factorp = factor;
925
926 return base;
927}
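to_size_factor() above only scales the value down when it is exactly aligned to the larger unit. A quick standalone check (the helper is copied verbatim; the inputs are arbitrary):

#include <stdio.h>

/* Copy of the helper above, for a standalone demonstration. */
static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
	char factor;
	unsigned long base = sizek;

	if (base & ((1 << 10) - 1)) {		/* not MB-aligned */
		factor = 'K';
	} else if (base & ((1 << 20) - 1)) {	/* MB- but not GB-aligned */
		factor = 'M';
		base >>= 10;
	} else {
		factor = 'G';
		base >>= 20;
	}
	*factorp = factor;
	return base;
}

int main(void)
{
	char f;
	unsigned long v;

	v = to_size_factor(262144, &f);		/* 256MB, given in KB */
	printf("%lu%cB\n", v, f);		/* -> 256MB */
	v = to_size_factor(1048576, &f);	/* 1GB in KB */
	printf("%lu%cB\n", v, f);		/* -> 1GB */
	v = to_size_factor(1500, &f);		/* not MB-aligned */
	printf("%lu%cB\n", v, f);		/* -> 1500KB */
	return 0;
}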
928
929static unsigned int __init
930range_to_mtrr(unsigned int reg, unsigned long range_startk,
931 unsigned long range_sizek, unsigned char type)
932{
933 if (!range_sizek || (reg >= num_var_ranges))
934 return reg;
935
936 while (range_sizek) {
937 unsigned long max_align, align;
938 unsigned long sizek;
939
940 /* Compute the maximum size I can make a range */
941 if (range_startk)
942 max_align = ffs(range_startk) - 1;
943 else
944 max_align = 32;
945 align = fls(range_sizek) - 1;
946 if (align > max_align)
947 align = max_align;
948
949 sizek = 1 << align;
950 if (debug_print) {
951 char start_factor = 'K', size_factor = 'K';
952 unsigned long start_base, size_base;
953
954 start_base = to_size_factor(range_startk, &start_factor),
955 size_base = to_size_factor(sizek, &size_factor),
956
957 printk(KERN_DEBUG "Setting variable MTRR %d, "
958 "base: %ld%cB, range: %ld%cB, type %s\n",
959 reg, start_base, start_factor,
960 size_base, size_factor,
961 (type == MTRR_TYPE_UNCACHABLE)?"UC":
962 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
963 );
964 }
965 save_var_mtrr(reg++, range_startk, sizek, type);
966 range_startk += sizek;
967 range_sizek -= sizek;
968 if (reg >= num_var_ranges)
969 break;
970 }
971 return reg;
972}
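The allocation loop above carves a range into power-of-two chunks: ffs() bounds the alignment of the current base, fls() bounds the largest chunk that fits the remaining size. A standalone sketch with hypothetical numbers — 768MB of WB at 1GB costs two registers:

#include <stdio.h>
#include <strings.h>	/* ffs() */

static int fls_(unsigned long x)	/* portable stand-in for fls() */
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

int main(void)
{
	/* Hypothetical WB range: 768MB at 1GB, in KB units. */
	unsigned long startk = 0x100000, sizek = 0xC0000;

	while (sizek) {
		unsigned long max_align = startk ? ffs(startk) - 1 : 32;
		unsigned long align = fls_(sizek) - 1;

		if (align > max_align)
			align = max_align;

		/* One variable MTRR per carved power-of-two chunk. */
		printf("MTRR: base %luM size %luM\n",
		       startk >> 10, (1UL << align) >> 10);
		startk += 1UL << align;
		sizek -= 1UL << align;
	}
	/* -> 512MB at 1024M, then 256MB at 1536M: two registers. */
	return 0;
}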
973
974static unsigned __init
975range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
976 unsigned long sizek)
977{
978 unsigned long hole_basek, hole_sizek;
979 unsigned long second_basek, second_sizek;
980 unsigned long range0_basek, range0_sizek;
981 unsigned long range_basek, range_sizek;
982 unsigned long chunk_sizek;
983 unsigned long gran_sizek;
984
985 hole_basek = 0;
986 hole_sizek = 0;
987 second_basek = 0;
988 second_sizek = 0;
989 chunk_sizek = state->chunk_sizek;
990 gran_sizek = state->gran_sizek;
991
992 /* align to gran size to prevent small blocks from using up MTRRs */
993 range_basek = ALIGN(state->range_startk, gran_sizek);
994 if ((range_basek > basek) && basek)
995 return second_sizek;
996 state->range_sizek -= (range_basek - state->range_startk);
997 range_sizek = ALIGN(state->range_sizek, gran_sizek);
998
999 while (range_sizek > state->range_sizek) {
1000 range_sizek -= gran_sizek;
1001 if (!range_sizek)
1002 return 0;
1003 }
1004 state->range_sizek = range_sizek;
1005
1006 /* try to append some small hole */
1007 range0_basek = state->range_startk;
1008 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1009
1010 /* no increase */
1011 if (range0_sizek == state->range_sizek) {
1012 if (debug_print)
1013 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1014 range0_basek<<10,
1015 (range0_basek + state->range_sizek)<<10);
1016 state->reg = range_to_mtrr(state->reg, range0_basek,
1017 state->range_sizek, MTRR_TYPE_WRBACK);
1018 return 0;
1019 }
1020
1021 /* only cut back, when it is not the last */
1022 if (sizek) {
1023 while (range0_basek + range0_sizek > (basek + sizek)) {
1024 if (range0_sizek >= chunk_sizek)
1025 range0_sizek -= chunk_sizek;
1026 else
1027 range0_sizek = 0;
1028
1029 if (!range0_sizek)
1030 break;
1031 }
1032 }
1033
1034second_try:
1035 range_basek = range0_basek + range0_sizek;
1036
1037 /* one hole in the middle */
1038 if (range_basek > basek && range_basek <= (basek + sizek))
1039 second_sizek = range_basek - basek;
1040
1041 if (range0_sizek > state->range_sizek) {
1042
1043 /* one hole in middle or at end */
1044 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1045
1046 /* hole size should be less than half of range0 size */
1047 if (hole_sizek >= (range0_sizek >> 1) &&
1048 range0_sizek >= chunk_sizek) {
1049 range0_sizek -= chunk_sizek;
1050 second_sizek = 0;
1051 hole_sizek = 0;
1052
1053 goto second_try;
1054 }
1055 }
1056
1057 if (range0_sizek) {
1058 if (debug_print)
1059 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1060 range0_basek<<10,
1061 (range0_basek + range0_sizek)<<10);
1062 state->reg = range_to_mtrr(state->reg, range0_basek,
1063 range0_sizek, MTRR_TYPE_WRBACK);
1064 }
1065
1066 if (range0_sizek < state->range_sizek) {
1067 /* need to handle left over */
1068 range_sizek = state->range_sizek - range0_sizek;
1069
1070 if (debug_print)
1071 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1072 range_basek<<10,
1073 (range_basek + range_sizek)<<10);
1074 state->reg = range_to_mtrr(state->reg, range_basek,
1075 range_sizek, MTRR_TYPE_WRBACK);
1076 }
1077
1078 if (hole_sizek) {
1079 hole_basek = range_basek - hole_sizek - second_sizek;
1080 if (debug_print)
1081 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1082 hole_basek<<10,
1083 (hole_basek + hole_sizek)<<10);
1084 state->reg = range_to_mtrr(state->reg, hole_basek,
1085 hole_sizek, MTRR_TYPE_UNCACHABLE);
1086 }
1087
1088 return second_sizek;
1089}
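The hole path above pays off when rounding the range up to chunk_sizek and punching a UC hole needs fewer registers than covering the tail with WB entries. A worked sketch (hypothetical: 3584MB of RAM, chunk_sizek = 1GB, all in KB units) — one 4GB WB register plus one 512MB UC hole instead of WB 2GB + 1GB + 512MB:

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* Hypothetical: 3584MB of WB RAM, chunk_sizek = 1GB (in KB). */
	unsigned long range_sizek = 3584UL << 10, chunk_sizek = 1UL << 20;
	unsigned long range0_sizek = ALIGN(range_sizek, chunk_sizek);
	unsigned long hole_sizek = range0_sizek - range_sizek;

	/* One 4GB WB register plus one 512MB UC register: 2 MTRRs
	 * instead of WB 2GB + 1GB + 512MB = 3 MTRRs. */
	printf("range0 %luM, hole %luM\n", range0_sizek >> 10, hole_sizek >> 10);
	return 0;
}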
1090
1091static void __init
1092set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1093 unsigned long size_pfn)
1094{
1095 unsigned long basek, sizek;
1096 unsigned long second_sizek = 0;
1097
1098 if (state->reg >= num_var_ranges)
1099 return;
1100
1101 basek = base_pfn << (PAGE_SHIFT - 10);
1102 sizek = size_pfn << (PAGE_SHIFT - 10);
1103
1104 /* See if I can merge with the last range */
1105 if ((basek <= 1024) ||
1106 (state->range_startk + state->range_sizek == basek)) {
1107 unsigned long endk = basek + sizek;
1108 state->range_sizek = endk - state->range_startk;
1109 return;
1110 }
1111 /* Write the range mtrrs */
1112 if (state->range_sizek != 0)
1113 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1114
1115 /* Allocate an msr */
1116 state->range_startk = basek + second_sizek;
1117 state->range_sizek = sizek - second_sizek;
1118}
1119
1120/* minimum size of an mtrr block that can take a hole */
1121static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1122
1123static int __init parse_mtrr_chunk_size_opt(char *p)
1124{
1125 if (!p)
1126 return -EINVAL;
1127 mtrr_chunk_size = memparse(p, &p);
1128 return 0;
1129}
1130early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1131
1132/* granularity of an mtrr block */
1133static u64 mtrr_gran_size __initdata;
1134
1135static int __init parse_mtrr_gran_size_opt(char *p)
1136{
1137 if (!p)
1138 return -EINVAL;
1139 mtrr_gran_size = memparse(p, &p);
1140 return 0;
1141}
1142early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1143
1144static int nr_mtrr_spare_reg __initdata =
1145 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1146
1147static int __init parse_mtrr_spare_reg(char *arg)
1148{
1149 if (arg)
1150 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1151 return 0;
1152}
1153
1154early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1155
1156static int __init
1157x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1158 u64 chunk_size, u64 gran_size)
1159{
1160 struct var_mtrr_state var_state;
1161 int i;
1162 int num_reg;
1163
1164 var_state.range_startk = 0;
1165 var_state.range_sizek = 0;
1166 var_state.reg = 0;
1167 var_state.chunk_sizek = chunk_size >> 10;
1168 var_state.gran_sizek = gran_size >> 10;
1169
1170 memset(range_state, 0, sizeof(range_state));
1171
1172 /* Write the range etc */
1173 for (i = 0; i < nr_range; i++)
1174 set_var_mtrr_range(&var_state, range[i].start,
1175 range[i].end - range[i].start + 1);
1176
1177 /* Write the last range */
1178 if (var_state.range_sizek != 0)
1179 range_to_mtrr_with_hole(&var_state, 0, 0);
1180
1181 num_reg = var_state.reg;
1182 /* Clear out the extra MTRR's */
1183 while (var_state.reg < num_var_ranges) {
1184 save_var_mtrr(var_state.reg, 0, 0, 0);
1185 var_state.reg++;
1186 }
1187
1188 return num_reg;
1189}
1190
1191struct mtrr_cleanup_result {
1192 unsigned long gran_sizek;
1193 unsigned long chunk_sizek;
1194 unsigned long lose_cover_sizek;
1195 unsigned int num_reg;
1196 int bad;
1197};
1198
1199/*
1200 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1201 * chunk size: gran_size, ..., 2G
1202 * so we need (1+16)*8
1203 */
1204#define NUM_RESULT 136
1205#define PSHIFT (PAGE_SHIFT - 10)
1206
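The "(1+16)*8" note above is an arithmetic-series count: gran_size takes 16 values (2^16 .. 2^31) and, for each, chunk_size runs from gran_size up to 2G, giving 16 + 15 + ... + 1 = 16*17/2 = 136 combinations — exactly NUM_RESULT. A one-liner mirroring the nested loops in mtrr_cleanup() further down confirms it:

#include <stdio.h>

int main(void)
{
	unsigned long long gran, chunk;
	int n = 0;

	/* Mirror the gran_size/chunk_size loops in mtrr_cleanup(). */
	for (gran = 1ULL << 16; gran < 1ULL << 32; gran <<= 1)
		for (chunk = gran; chunk < 1ULL << 32; chunk <<= 1)
			n++;

	printf("%d\n", n);	/* 136 == 16*17/2 == (1+16)*8 */
	return 0;
}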
1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1209
1210static void __init print_out_mtrr_range_state(void)
1211{
1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1216
1217 for (i = 0; i < num_var_ranges; i++) {
1218
1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1220 if (!size_base)
1221 continue;
1222
1223 size_base = to_size_factor(size_base, &size_factor),
1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1225 start_base = to_size_factor(start_base, &start_factor),
1226 type = range_state[i].type;
1227
1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* one extra slot to count the empty (size 0) entries */
1244 int num[MTRR_NUM_TYPES + 1];
1245
1246 /* check entries number */
1247 memset(num, 0, sizeof(num));
1248 for (i = 0; i < num_var_ranges; i++) {
1249 type = range_state[i].type;
1250 size = range_state[i].size_pfn;
1251 if (type >= MTRR_NUM_TYPES)
1252 continue;
1253 if (!size)
1254 type = MTRR_NUM_TYPES;
1255 if (type == MTRR_TYPE_WRPROT)
1256 type = MTRR_TYPE_UNCACHABLE;
1257 num[type]++;
1258 }
1259
1260 /* check if we got UC entries */
1261 if (!num[MTRR_TYPE_UNCACHABLE])
1262 return 0;
1263
1264 /* check if we only had WB and UC */
1265 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1266 num_var_ranges - num[MTRR_NUM_TYPES])
1267 return 0;
1268
1269 return 1;
1270}
1271
1272static unsigned long __initdata range_sums;
1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1303
1304 /* double check it */
1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1306 if (nr_range_new != nr_range ||
1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1310
1311 if (!result[i].bad && (range_sums - range_sums_new <
1312 min_loss_pfn[num_reg])) {
1313 min_loss_pfn[num_reg] =
1314 range_sums - range_sums_new;
1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need to handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print original var MTRRs at first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1396
1397 memset(range, 0, sizeof(range));
1398 extra_remove_size = 0;
1399 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1400 if (mtrr_tom2)
1401 extra_remove_size =
1402 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1403 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1404 extra_remove_size);
1405 /*
1406 * [0, 1M) should always be covered by var mtrr with WB
1407 * and fixed mtrrs should take effect before var mtrr for it
1408 */
1409 nr_range = add_range_with_merge(range, nr_range, 0,
1410 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1411 /* sort the ranges */
1412 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1413
1414 range_sums = sum_ranges(range, nr_range);
1415 printk(KERN_INFO "total RAM covered: %ldM\n",
1416 range_sums >> (20 - PAGE_SHIFT));
1417
1418 if (mtrr_chunk_size && mtrr_gran_size) {
1419 i = 0;
1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1421 extra_remove_base, extra_remove_size, i);
1422
1423 mtrr_print_out_one_result(i);
1424
1425 if (!result[i].bad) {
1426 set_var_mtrr_all(address_bits);
1427 return 1;
1428 }
1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1430 "will find optimal one\n");
1431 }
1432
1433 i = 0;
1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1435 memset(result, 0, sizeof(result));
1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1437
1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1439 chunk_size <<= 1) {
1440
1441 if (i >= NUM_RESULT)
1442 continue;
1443
1444 mtrr_calc_range_state(chunk_size, gran_size,
1445 extra_remove_base, extra_remove_size, i);
1446 if (debug_print) {
1447 mtrr_print_out_one_result(i);
1448 printk(KERN_INFO "\n");
1449 }
1450
1451 i++;
1452 }
1453 }
1454
1455 /* try to find the optimal index */
1456 index_good = mtrr_search_optimal_index();
1457
1458 if (index_good != -1) {
1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1460 i = index_good;
1461 mtrr_print_out_one_result(i);
1462
1463 /* convert ranges to var ranges state */
1464 chunk_size = result[i].chunk_sizek;
1465 chunk_size <<= 10;
1466 gran_size = result[i].gran_sizek;
1467 gran_size <<= 10;
1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1477 }
1478
1479 printk(KERN_INFO "mtrr_cleanup: cannot find optimal value\n");
1480 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1481
1482 return 0;
1483}
1484#else
1485static int __init mtrr_cleanup(unsigned address_bits)
1486{
1487 return 0;
1488}
1489#endif
1490
1491static int __initdata changed_by_mtrr_cleanup;
1492
1493static int disable_mtrr_trim;
1494
1495static int __init disable_mtrr_trim_setup(char *str)
1496{
1497 disable_mtrr_trim = 1;
1498 return 0;
1499}
1500early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
1501
1502/*
1503 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
1504 * for memory >4GB. Check for that here.
1505 * Note this won't check whether the MTRRs below 4GB (where the magic
1506 * bit doesn't apply) are wrong, but so far we don't know of any such case in the wild.
1507 */
1508#define Tom2Enabled (1U << 21)
1509#define Tom2ForceMemTypeWB (1U << 22)
1510
1511int __init amd_special_default_mtrr(void)
1512{
1513 u32 l, h;
1514
1515 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
1516 return 0;
1517 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
1518 return 0;
1519 /* In case some hypervisor doesn't pass SYSCFG through */
1520 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
1521 return 0;
1522 /*
1523 * Memory between 4GB and top of mem is forced WB by this magic bit.
1524 * Reserved before K8RevF, but should be zero there.
1525 */
1526 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
1527 (Tom2Enabled | Tom2ForceMemTypeWB))
1528 return 1;
1529 return 0;
1530}
1531
1532static u64 __init real_trim_memory(unsigned long start_pfn,
1533 unsigned long limit_pfn)
1534{
1535 u64 trim_start, trim_size;
1536 trim_start = start_pfn;
1537 trim_start <<= PAGE_SHIFT;
1538 trim_size = limit_pfn;
1539 trim_size <<= PAGE_SHIFT;
1540 trim_size -= trim_start;
1541
1542 return e820_update_range(trim_start, trim_size, E820_RAM,
1543 E820_RESERVED);
1544}
1545/**
1546 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
1547 * @end_pfn: ending page frame number
1548 *
1549 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
1550 * memory configurations. This routine checks that the highest MTRR matches
1551 * the end of memory, to make sure the MTRRs having a write back type cover
1552 * all of the memory the kernel is intending to use. If not, it'll trim any
1553 * memory off the end by adjusting end_pfn, removing it from the kernel's
1554 * allocation pools, warning the user with an obnoxious message.
1555 */
1556int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1557{
1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1559 mtrr_type type;
1560 u64 total_trim_size;
1561
1562 /* one extra slot to count the empty (size 0) entries */
1563 int num[MTRR_NUM_TYPES + 1];
1564 /*
1565 * Make sure we only trim uncachable memory on machines that
1566 * support the Intel MTRR architecture:
1567 */
1568 if (!is_cpu(INTEL) || disable_mtrr_trim)
1569 return 0;
1570 rdmsr(MTRRdefType_MSR, def, dummy);
1571 def &= 0xff;
1572 if (def != MTRR_TYPE_UNCACHABLE)
1573 return 0;
1574
1575 /* get it and store it aside */
1576 memset(range_state, 0, sizeof(range_state));
1577 for (i = 0; i < num_var_ranges; i++) {
1578 mtrr_if->get(i, &base, &size, &type);
1579 range_state[i].base_pfn = base;
1580 range_state[i].size_pfn = size;
1581 range_state[i].type = type;
1582 }
1583
1584 /* Find highest cached pfn */
1585 for (i = 0; i < num_var_ranges; i++) {
1586 type = range_state[i].type;
1587 if (type != MTRR_TYPE_WRBACK)
1588 continue;
1589 base = range_state[i].base_pfn;
1590 size = range_state[i].size_pfn;
1591 if (highest_pfn < base + size)
1592 highest_pfn = base + size;
1593 }
1594
1595 /* kvm/qemu doesn't set the mtrrs up right - don't trim all of memory */
1596 if (!highest_pfn) {
1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1598 return 0;
1599 }
1600
1601 /* check entries number */
1602 memset(num, 0, sizeof(num));
1603 for (i = 0; i < num_var_ranges; i++) {
1604 type = range_state[i].type;
1605 if (type >= MTRR_NUM_TYPES)
1606 continue;
1607 size = range_state[i].size_pfn;
1608 if (!size)
1609 type = MTRR_NUM_TYPES;
1610 num[type]++;
1611 }
1612
1613 /* no entry for WB? */
1614 if (!num[MTRR_TYPE_WRBACK])
1615 return 0;
1616
1617 /* check if we only had WB and UC */
1618 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1619 num_var_ranges - num[MTRR_NUM_TYPES])
1620 return 0;
1621
1622 memset(range, 0, sizeof(range));
1623 nr_range = 0;
1624 if (mtrr_tom2) {
1625 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1626 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1627 if (highest_pfn < range[nr_range].end + 1)
1628 highest_pfn = range[nr_range].end + 1;
1629 nr_range++;
1630 }
1631 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1632
1633 total_trim_size = 0;
1634 /* check the head */
1635 if (range[0].start)
1636 total_trim_size += real_trim_memory(0, range[0].start);
1637 /* check the holes */
1638 for (i = 0; i < nr_range - 1; i++) {
1639 if (range[i].end + 1 < range[i+1].start)
1640 total_trim_size += real_trim_memory(range[i].end + 1,
1641 range[i+1].start);
1642 }
1643 /* check the top */
1644 i = nr_range - 1;
1645 if (range[i].end + 1 < end_pfn)
1646 total_trim_size += real_trim_memory(range[i].end + 1,
1647 end_pfn);
1648
1649 if (total_trim_size) {
1650 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1651 " all of memory, losing %lluMB of RAM.\n",
1652 total_trim_size >> 20);
1653
1654 if (!changed_by_mtrr_cleanup)
1655 WARN_ON(1);
1656
1657 printk(KERN_INFO "update e820 for mtrr\n");
1658 update_e820();
1659
1660 return 1;
1661 }
1662
1663 return 0;
1664}
1665 614
1666/** 615/**
1667 * mtrr_bp_init - initialize mtrrs on the boot CPU 616 * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6d..77f67f7b347a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
79 79
80extern unsigned int num_var_ranges; 80extern unsigned int num_var_ranges;
81extern u64 mtrr_tom2; 81extern u64 mtrr_tom2;
82extern struct mtrr_state_type mtrr_state;
82 83
83void mtrr_state_warn(void); 84void mtrr_state_warn(void);
84const char *mtrr_attrib_to_str(int x); 85const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
88int amd_init_mtrr(void); 89int amd_init_mtrr(void);
89int cyrix_init_mtrr(void); 90int cyrix_init_mtrr(void);
90int centaur_init_mtrr(void); 91int centaur_init_mtrr(void);
92
93extern int changed_by_mtrr_cleanup;
94extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 95b81c18b6bc..fb638d9ce6d2 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -131,6 +131,31 @@ void __init e820_add_region(u64 start, u64 size, int type)
131 __e820_add_region(&e820, start, size, type); 131 __e820_add_region(&e820, start, size, type);
132} 132}
133 133
134static void __init e820_print_type(u32 type)
135{
136 switch (type) {
137 case E820_RAM:
138 case E820_RESERVED_KERN:
139 printk(KERN_CONT "(usable)");
140 break;
141 case E820_RESERVED:
142 printk(KERN_CONT "(reserved)");
143 break;
144 case E820_ACPI:
145 printk(KERN_CONT "(ACPI data)");
146 break;
147 case E820_NVS:
148 printk(KERN_CONT "(ACPI NVS)");
149 break;
150 case E820_UNUSABLE:
151 printk(KERN_CONT "(unusable)");
152 break;
153 default:
154 printk(KERN_CONT "type %u", type);
155 break;
156 }
157}
158
134void __init e820_print_map(char *who) 159void __init e820_print_map(char *who)
135{ 160{
136 int i; 161 int i;
@@ -140,27 +165,8 @@ void __init e820_print_map(char *who)
140 (unsigned long long) e820.map[i].addr, 165 (unsigned long long) e820.map[i].addr,
141 (unsigned long long) 166 (unsigned long long)
142 (e820.map[i].addr + e820.map[i].size)); 167 (e820.map[i].addr + e820.map[i].size));
143 switch (e820.map[i].type) { 168 e820_print_type(e820.map[i].type);
144 case E820_RAM: 169 printk(KERN_CONT "\n");
145 case E820_RESERVED_KERN:
146 printk(KERN_CONT "(usable)\n");
147 break;
148 case E820_RESERVED:
149 printk(KERN_CONT "(reserved)\n");
150 break;
151 case E820_ACPI:
152 printk(KERN_CONT "(ACPI data)\n");
153 break;
154 case E820_NVS:
155 printk(KERN_CONT "(ACPI NVS)\n");
156 break;
157 case E820_UNUSABLE:
158 printk("(unusable)\n");
159 break;
160 default:
161 printk(KERN_CONT "type %u\n", e820.map[i].type);
162 break;
163 }
164 } 170 }
165} 171}
166 172
@@ -427,6 +433,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
427 u64 size, unsigned old_type, 433 u64 size, unsigned old_type,
428 unsigned new_type) 434 unsigned new_type)
429{ 435{
436 u64 end;
430 unsigned int i; 437 unsigned int i;
431 u64 real_updated_size = 0; 438 u64 real_updated_size = 0;
432 439
@@ -435,21 +442,43 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
435 if (size > (ULLONG_MAX - start)) 442 if (size > (ULLONG_MAX - start))
436 size = ULLONG_MAX - start; 443 size = ULLONG_MAX - start;
437 444
445 end = start + size;
446 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
447 (unsigned long long) start,
448 (unsigned long long) end);
449 e820_print_type(old_type);
450 printk(KERN_CONT " ==> ");
451 e820_print_type(new_type);
452 printk(KERN_CONT "\n");
453
438 for (i = 0; i < e820x->nr_map; i++) { 454 for (i = 0; i < e820x->nr_map; i++) {
439 struct e820entry *ei = &e820x->map[i]; 455 struct e820entry *ei = &e820x->map[i];
440 u64 final_start, final_end; 456 u64 final_start, final_end;
457 u64 ei_end;
458
441 if (ei->type != old_type) 459 if (ei->type != old_type)
442 continue; 460 continue;
443 /* totally covered? */ 461
444 if (ei->addr >= start && 462 ei_end = ei->addr + ei->size;
445 (ei->addr + ei->size) <= (start + size)) { 463 /* totally covered by new range? */
464 if (ei->addr >= start && ei_end <= end) {
446 ei->type = new_type; 465 ei->type = new_type;
447 real_updated_size += ei->size; 466 real_updated_size += ei->size;
448 continue; 467 continue;
449 } 468 }
469
470 /* new range is totally covered? */
471 if (ei->addr < start && ei_end > end) {
472 __e820_add_region(e820x, start, size, new_type);
473 __e820_add_region(e820x, end, ei_end - end, ei->type);
474 ei->size = start - ei->addr;
475 real_updated_size += size;
476 continue;
477 }
478
450 /* partially covered */ 479 /* partially covered */
451 final_start = max(start, ei->addr); 480 final_start = max(start, ei->addr);
452 final_end = min(start + size, ei->addr + ei->size); 481 final_end = min(end, ei_end);
453 if (final_start >= final_end) 482 if (final_start >= final_end)
454 continue; 483 continue;
455 484
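The new "new range is totally covered" branch above splits one entry into a head (old type), a middle (new type) and a tail (old type). A minimal sketch of just that split on a simplified map — hypothetical entry types and sizes, not the kernel's e820map:

#include <stdio.h>

struct entry { unsigned long long addr, size; int type; };

int main(void)
{
	/* Hypothetical: one RAM entry 0..4GB, update 2GB..3GB to RESERVED. */
	struct entry map[3] = { { 0, 4ULL << 30, 1 /* E820_RAM */ } };
	int nr = 1;

	unsigned long long start = 2ULL << 30, end = 3ULL << 30;
	unsigned long long ei_end = map[0].addr + map[0].size;
	int new_type = 2; /* E820_RESERVED */

	if (map[0].addr < start && ei_end > end) {
		/* middle gets the new type, tail keeps the old type */
		map[nr++] = (struct entry){ start, end - start, new_type };
		map[nr++] = (struct entry){ end, ei_end - end, map[0].type };
		map[0].size = start - map[0].addr;	/* keep the head */
	}

	for (int i = 0; i < nr; i++)
		printf("%010llx - %010llx type %d\n",
		       map[i].addr, map[i].addr + map[i].size, map[i].type);
	return 0;
}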
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe182..3f8579f8d42c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
18{ 18{
19 reserve_trampoline_memory(); 19 reserve_trampoline_memory();
20 20
21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
22 22
23#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
24 /* Reserve INITRD */ 24 /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 30 }
31#endif 31#endif
32 reserve_early(init_pg_tables_start, init_pg_tables_end,
33 "INIT_PG_TABLE");
34
35 reserve_ebda_region(); 32 reserve_ebda_region();
36 33
37 /* 34 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b272247690..70eaa852c732 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
100 100
101 reserve_trampoline_memory(); 101 reserve_trampoline_memory();
102 102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 104
105#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 106 /* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591a..30683883e0cd 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id 38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
39 39
40/* 40/*
41 * This is how much memory *in addition to the memory covered up to 41 * This is how much memory in addition to the memory covered up to
42 * and including _end* we need mapped initially. 42 * and including _end we need mapped initially.
43 * We need: 43 * We need:
44 * - one bit for each possible page, but only in low memory, which means 44 * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
45 * 2^32/4096/8 = 128K worst case (4G/4G split.) 45 * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
46 * - enough space to map all low memory, which means
47 * (2^32/4096) / 1024 pages (worst case, non PAE)
48 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
49 * - a few pages for allocator use before the kernel pagetable has
50 * been set up
51 * 46 *
52 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
53 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
54 * 49 *
55 * This should be a multiple of a page. 50 * This should be a multiple of a page.
51 *
52 * KERNEL_IMAGE_SIZE should be greater than pa(_end)
53 * and smaller than max_low_pfn, otherwise we will waste some page table entries
56 */ 54 */
57LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
58
59/*
60 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
61 * pagetables from above the 16MB DMA limit, so we'll have to set
62 * up pagetables 16MB more (worst-case):
63 */
64#ifdef CONFIG_DEBUG_PAGEALLOC
65LOW_PAGES = LOW_PAGES + 0x1000000
66#endif
67 55
68#if PTRS_PER_PMD > 1 56#if PTRS_PER_PMD > 1
69PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD 57#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
70#else 58#else
71PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) 59#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
72#endif 60#endif
73BOOTBITMAP_SIZE = LOW_PAGES / 8
74ALLOCATOR_SLOP = 4
75 61
76INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm 62/* Enough space to fit pagetables for the low memory linear map */
63MAPPING_BEYOND_END = \
64 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
65
66/*
67 * Worst-case size of the kernel mapping we need to make:
68 * the worst-case size of the kernel itself, plus the extra we need
69 * to map for the linear map.
70 */
71KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
72
73INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
74RESERVE_BRK(pagetables, INIT_MAP_SIZE)
77 75
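With the usual 32-bit defaults — __PAGE_OFFSET = 0xC0000000 and 4KB pages, assumptions for this sketch — the PAGE_TABLE_SIZE arithmetic above works out to 256 page-table pages for the 1GB lowmem linear map without PAE, or 516 with PAE:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define __PAGE_OFFSET	0xC0000000UL

int main(void)
{
	unsigned long lowmem_pages = (0x100000000ULL - __PAGE_OFFSET) >> PAGE_SHIFT;

	/* PAGE_TABLE_SIZE(pages), non-PAE: PTRS_PER_PGD = 1024 */
	unsigned long pt_nonpae = lowmem_pages / 1024;

	/* PAE: PTRS_PER_PMD = 512, plus PTRS_PER_PGD = 4 top-level pages */
	unsigned long pt_pae = lowmem_pages / 512 + 4;

	printf("lowmem pages: %#lx\n", lowmem_pages);	/* 0x40000 */
	printf("non-PAE: %lu pages (%luKB)\n", pt_nonpae,
	       pt_nonpae * PAGE_SIZE / 1024);		/* 256 pages, 1MB */
	printf("PAE:     %lu pages (%luKB)\n", pt_pae,
	       pt_pae * PAGE_SIZE / 1024);		/* 516 pages */
	return 0;
}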
78/* 76/*
79 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 77 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
166 164
167/* 165/*
168 * Initialize page tables. This creates a PDE and a set of page 166 * Initialize page tables. This creates a PDE and a set of page
169 * tables, which are located immediately beyond _end. The variable 167 * tables, which are located immediately beyond __brk_base. The variable
170 * init_pg_tables_end is set up to point to the first "safe" location. 168 * _brk_end is set up to point to the first "safe" location.
171 * Mappings are created both at virtual address 0 (identity mapping) 169 * Mappings are created both at virtual address 0 (identity mapping)
172 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. 170 * and PAGE_OFFSET for up to _end.
173 * 171 *
174 * Note that the stack is not yet set up! 172 * Note that the stack is not yet set up!
175 */ 173 */
@@ -190,8 +188,7 @@ default_entry:
190 188
191 xorl %ebx,%ebx /* %ebx is kept at zero */ 189 xorl %ebx,%ebx /* %ebx is kept at zero */
192 190
193 movl $pa(pg0), %edi 191 movl $pa(__brk_base), %edi
194 movl %edi, pa(init_pg_tables_start)
195 movl $pa(swapper_pg_pmd), %edx 192 movl $pa(swapper_pg_pmd), %edx
196 movl $PTE_IDENT_ATTR, %eax 193 movl $PTE_IDENT_ATTR, %eax
19710: 19410:
@@ -209,14 +206,14 @@ default_entry:
209 loop 11b 206 loop 11b
210 207
211 /* 208 /*
212 * End condition: we must map up to and including INIT_MAP_BEYOND_END 209 * End condition: we must map up to the end + MAPPING_BEYOND_END.
213 * bytes beyond the end of our own page tables.
214 */ 210 */
215 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 211 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
216 cmpl %ebp,%eax 212 cmpl %ebp,%eax
217 jb 10b 213 jb 10b
2181: 2141:
219 movl %edi,pa(init_pg_tables_end) 215 addl $__PAGE_OFFSET, %edi
216 movl %edi, pa(_brk_end)
220 shrl $12, %eax 217 shrl $12, %eax
221 movl %eax, pa(max_pfn_mapped) 218 movl %eax, pa(max_pfn_mapped)
222 219
@@ -227,8 +224,7 @@ default_entry:
227 224
228page_pde_offset = (__PAGE_OFFSET >> 20); 225page_pde_offset = (__PAGE_OFFSET >> 20);
229 226
230 movl $pa(pg0), %edi 227 movl $pa(__brk_base), %edi
231 movl %edi, pa(init_pg_tables_start)
232 movl $pa(swapper_pg_dir), %edx 228 movl $pa(swapper_pg_dir), %edx
233 movl $PTE_IDENT_ATTR, %eax 229 movl $PTE_IDENT_ATTR, %eax
23410: 23010:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
242 addl $0x1000,%eax 238 addl $0x1000,%eax
243 loop 11b 239 loop 11b
244 /* 240 /*
245 * End condition: we must map up to and including INIT_MAP_BEYOND_END 241 * End condition: we must map up to the end + MAPPING_BEYOND_END.
246 * bytes beyond the end of our own page tables; the +0x007 is
247 * the attribute bits
248 */ 242 */
249 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 243 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
250 cmpl %ebp,%eax 244 cmpl %ebp,%eax
251 jb 10b 245 jb 10b
252 movl %edi,pa(init_pg_tables_end) 246 addl $__PAGE_OFFSET, %edi
247 movl %edi, pa(_brk_end)
253 shrl $12, %eax 248 shrl $12, %eax
254 movl %eax, pa(max_pfn_mapped) 249 movl %eax, pa(max_pfn_mapped)
255 250
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
636 .fill 1024,4,0 631 .fill 1024,4,0
637ENTRY(empty_zero_page) 632ENTRY(empty_zero_page)
638 .fill 4096,1,0 633 .fill 4096,1,0
634
639/* 635/*
640 * This starts the data section. 636 * This starts the data section.
641 */ 637 */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e948b28a5a9a..55b94614e348 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
193 kprobe_opcode_t opcode; 193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes; 194 kprobe_opcode_t *orig_opcodes = opcodes;
195 195
196 if (search_exception_tables((unsigned long)opcodes))
197 return 0; /* Page fault may occur on this address. */
198
196retry: 199retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) 200 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0; 201 return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986eca..33019ddb56b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
138 kvm_mmu_write(ptep, pte_val(pte)); 138 kvm_mmu_write(ptep, pte_val(pte));
139} 139}
140 140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm, 141static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep) 142 unsigned long addr, pte_t *ptep)
149{ 143{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
220#if PAGETABLE_LEVELS >= 3 214#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE 215#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; 216 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear; 217 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear; 218 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif 219#endif
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 47673e02ae58..290cb57f4697 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
109 } else 109 } else
110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
111} 111}
112#endif
113
114#ifdef CONFIG_X86_IO_APIC
115 112
116static int bad_ioapic(unsigned long address) 113static int bad_ioapic(unsigned long address)
117{ 114{
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
224 if (++mp_irq_entries == MAX_IRQ_SOURCES) 221 if (++mp_irq_entries == MAX_IRQ_SOURCES)
225 panic("Max # of irq sources exceeded!!\n"); 222 panic("Max # of irq sources exceeded!!\n");
226} 223}
224#else /* CONFIG_X86_IO_APIC */
225static inline void __init MP_bus_info(struct mpc_bus *m) {}
226static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
227static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
228#endif /* CONFIG_X86_IO_APIC */
227 229
228#endif
229 230
230static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 231static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
231{ 232{
@@ -275,6 +276,12 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
275 return 1; 276 return 1;
276} 277}
277 278
279static void skip_entry(unsigned char **ptr, int *count, int size)
280{
281 *ptr += size;
282 *count += size;
283}
284
278static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 285static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
279{ 286{
280 char str[16]; 287 char str[16];
@@ -310,55 +317,27 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
310 while (count < mpc->length) { 317 while (count < mpc->length) {
311 switch (*mpt) { 318 switch (*mpt) {
312 case MP_PROCESSOR: 319 case MP_PROCESSOR:
313 { 320 /* ACPI may have already provided this data */
314 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 321 if (!acpi_lapic)
315 /* ACPI may have already provided this data */ 322 MP_processor_info((struct mpc_cpu *)mpt);
316 if (!acpi_lapic) 323 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
317 MP_processor_info(m); 324 break;
318 mpt += sizeof(*m);
319 count += sizeof(*m);
320 break;
321 }
322 case MP_BUS: 325 case MP_BUS:
323 { 326 MP_bus_info((struct mpc_bus *)mpt);
324 struct mpc_bus *m = (struct mpc_bus *)mpt; 327 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
325#ifdef CONFIG_X86_IO_APIC 328 break;
326 MP_bus_info(m);
327#endif
328 mpt += sizeof(*m);
329 count += sizeof(*m);
330 break;
331 }
332 case MP_IOAPIC: 329 case MP_IOAPIC:
333 { 330 MP_ioapic_info((struct mpc_ioapic *)mpt);
334#ifdef CONFIG_X86_IO_APIC 331 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
335 struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; 332 break;
336 MP_ioapic_info(m);
337#endif
338 mpt += sizeof(struct mpc_ioapic);
339 count += sizeof(struct mpc_ioapic);
340 break;
341 }
342 case MP_INTSRC: 333 case MP_INTSRC:
343 { 334 MP_intsrc_info((struct mpc_intsrc *)mpt);
344#ifdef CONFIG_X86_IO_APIC 335 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
345 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 336 break;
346
347 MP_intsrc_info(m);
348#endif
349 mpt += sizeof(struct mpc_intsrc);
350 count += sizeof(struct mpc_intsrc);
351 break;
352 }
353 case MP_LINTSRC: 337 case MP_LINTSRC:
354 { 338 MP_lintsrc_info((struct mpc_lintsrc *)mpt);
355 struct mpc_lintsrc *m = 339 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
356 (struct mpc_lintsrc *)mpt; 340 break;
357 MP_lintsrc_info(m);
358 mpt += sizeof(*m);
359 count += sizeof(*m);
360 break;
361 }
362 default: 341 default:
363 /* wrong mptable */ 342 /* wrong mptable */
364 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 343 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -689,6 +668,31 @@ void __init get_smp_config(void)
689 __get_smp_config(0); 668 __get_smp_config(0);
690} 669}
691 670
671static void smp_reserve_bootmem(struct mpf_intel *mpf)
672{
673 unsigned long size = get_mpc_size(mpf->physptr);
674#ifdef CONFIG_X86_32
675 /*
676 * We cannot access the MPC table to compute its size yet,
677 * as only a few megabytes from the bottom of memory are mapped now.
678 * The PC-9800's MPC table sits at the very end of physical
679 * memory, so simply reserving PAGE_SIZE from mpf->physptr
680 * triggers a BUG() in reserve_bootmem.
681 * We also need to make sure physptr is below max_low_pfn;
682 * we don't need to reserve the area above max_low_pfn.
683 */
684 unsigned long end = max_low_pfn * PAGE_SIZE;
685
686 if (mpf->physptr < end) {
687 if (mpf->physptr + size > end)
688 size = end - mpf->physptr;
689 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
690 }
691#else
692 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
693#endif
694}
695
692static int __init smp_scan_config(unsigned long base, unsigned long length, 696static int __init smp_scan_config(unsigned long base, unsigned long length,
693 unsigned reserve) 697 unsigned reserve)
694{ 698{
@@ -717,35 +721,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 if (!reserve) 721 if (!reserve)
718 return 1; 722 return 1;
719 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), 723 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
720 BOOTMEM_DEFAULT);
721 if (mpf->physptr) {
722 unsigned long size = get_mpc_size(mpf->physptr);
723#ifdef CONFIG_X86_32
724 /*
725 * We cannot access to MPC table to compute
726 * table size yet, as only few megabytes from
727 * the bottom is mapped now.
728 * PC-9800's MPC table places on the very last
729 * of physical memory; so that simply reserving
730 * PAGE_SIZE from mpf->physptr yields BUG()
731 * in reserve_bootmem.
732 * also need to make sure physptr is below than
733 * max_low_pfn
734 * we don't need reserve the area above max_low_pfn
735 */
736 unsigned long end = max_low_pfn * PAGE_SIZE;
737
738 if (mpf->physptr < end) {
739 if (mpf->physptr + size > end)
740 size = end - mpf->physptr;
741 reserve_bootmem_generic(mpf->physptr, size,
742 BOOTMEM_DEFAULT);
743 }
744#else
745 reserve_bootmem_generic(mpf->physptr, size,
746 BOOTMEM_DEFAULT); 724 BOOTMEM_DEFAULT);
747#endif 725 if (mpf->physptr)
748 } 726 smp_reserve_bootmem(mpf);
749 727
750 return 1; 728 return 1;
751 } 729 }
@@ -848,7 +826,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
848#define SPARE_SLOT_NUM 20 826#define SPARE_SLOT_NUM 20
849 827
850static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; 828static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
851#endif 829
830static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
831{
832 int i;
833
834 apic_printk(APIC_VERBOSE, "OLD ");
835 print_MP_intsrc_info(m);
836
837 i = get_MP_intsrc_index(m);
838 if (i > 0) {
839 assign_to_mpc_intsrc(&mp_irqs[i], m);
840 apic_printk(APIC_VERBOSE, "NEW ");
841 print_mp_irq_info(&mp_irqs[i]);
842 return;
843 }
844 if (!i) {
845 /* legacy, do nothing */
846 return;
847 }
848 if (*nr_m_spare < SPARE_SLOT_NUM) {
849 /*
850 * not found (-1), or duplicated (-2) are invalid entries,
851 * we need to use the slot later
852 */
853 m_spare[*nr_m_spare] = m;
854 *nr_m_spare += 1;
855 }
856}
857#else /* CONFIG_X86_IO_APIC */
858static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
859#endif /* CONFIG_X86_IO_APIC */
860
861static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
862 int count)
863{
864 if (!mpc_new_phys) {
865 pr_info("No spare slots, try to append...take your risk, "
866 "new mpc_length %x\n", count);
867 } else {
868 if (count <= mpc_new_length)
869 pr_info("No spare slots, try to append..., "
870 "new mpc_length %x\n", count);
871 else {
872 pr_err("mpc_new_length %lx is too small\n",
873 mpc_new_length);
874 return -1;
875 }
876 }
877
878 return 0;
879}
852 880
853static int __init replace_intsrc_all(struct mpc_table *mpc, 881static int __init replace_intsrc_all(struct mpc_table *mpc,
854 unsigned long mpc_new_phys, 882 unsigned long mpc_new_phys,
@@ -856,71 +884,30 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
856{ 884{
857#ifdef CONFIG_X86_IO_APIC 885#ifdef CONFIG_X86_IO_APIC
858 int i; 886 int i;
859 int nr_m_spare = 0;
860#endif 887#endif
861
862 int count = sizeof(*mpc); 888 int count = sizeof(*mpc);
889 int nr_m_spare = 0;
863 unsigned char *mpt = ((unsigned char *)mpc) + count; 890 unsigned char *mpt = ((unsigned char *)mpc) + count;
864 891
865 printk(KERN_INFO "mpc_length %x\n", mpc->length); 892 printk(KERN_INFO "mpc_length %x\n", mpc->length);
866 while (count < mpc->length) { 893 while (count < mpc->length) {
867 switch (*mpt) { 894 switch (*mpt) {
868 case MP_PROCESSOR: 895 case MP_PROCESSOR:
869 { 896 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
870 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 897 break;
871 mpt += sizeof(*m);
872 count += sizeof(*m);
873 break;
874 }
875 case MP_BUS: 898 case MP_BUS:
876 { 899 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
877 struct mpc_bus *m = (struct mpc_bus *)mpt; 900 break;
878 mpt += sizeof(*m);
879 count += sizeof(*m);
880 break;
881 }
882 case MP_IOAPIC: 901 case MP_IOAPIC:
883 { 902 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
884 mpt += sizeof(struct mpc_ioapic); 903 break;
885 count += sizeof(struct mpc_ioapic);
886 break;
887 }
888 case MP_INTSRC: 904 case MP_INTSRC:
889 { 905 check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
890#ifdef CONFIG_X86_IO_APIC 906 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
891 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 907 break;
892
893 apic_printk(APIC_VERBOSE, "OLD ");
894 print_MP_intsrc_info(m);
895 i = get_MP_intsrc_index(m);
896 if (i > 0) {
897 assign_to_mpc_intsrc(&mp_irqs[i], m);
898 apic_printk(APIC_VERBOSE, "NEW ");
899 print_mp_irq_info(&mp_irqs[i]);
900 } else if (!i) {
901 /* legacy, do nothing */
902 } else if (nr_m_spare < SPARE_SLOT_NUM) {
903 /*
 904 * not found (-1) and duplicated (-2)
 905 * are invalid entries;
 906 * remember this slot so it can be reused later
907 */
908 m_spare[nr_m_spare] = m;
909 nr_m_spare++;
910 }
911#endif
912 mpt += sizeof(struct mpc_intsrc);
913 count += sizeof(struct mpc_intsrc);
914 break;
915 }
916 case MP_LINTSRC: 908 case MP_LINTSRC:
917 { 909 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
918 struct mpc_lintsrc *m = 910 break;
919 (struct mpc_lintsrc *)mpt;
920 mpt += sizeof(*m);
921 count += sizeof(*m);
922 break;
923 }
924 default: 911 default:
925 /* wrong mptable */ 912 /* wrong mptable */
926 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 913 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -950,16 +937,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
950 } else { 937 } else {
951 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 938 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
952 count += sizeof(struct mpc_intsrc); 939 count += sizeof(struct mpc_intsrc);
953 if (!mpc_new_phys) { 940 if (!check_slot(mpc_new_phys, mpc_new_length, count))
954 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); 941 goto out;
955 } else {
956 if (count <= mpc_new_length)
957 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
958 else {
959 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
960 goto out;
961 }
962 }
963 assign_to_mpc_intsrc(&mp_irqs[i], m); 942 assign_to_mpc_intsrc(&mp_irqs[i], m);
964 mpc->length = count; 943 mpc->length = count;
965 mpt += sizeof(struct mpc_intsrc); 944 mpt += sizeof(struct mpc_intsrc);
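
Every fixed-size record in the table walk above now goes through skip_entry(), which is defined outside the hunks shown. Given the call sites, it can only amount to a two-line cursor advance; a minimal sketch:

	/* Advance the MP-table walk: step the cursor and the running
	 * byte count past one fixed-size entry. */
	static void skip_entry(unsigned char **ptr, int *count, int size)
	{
		*ptr += size;
		*count += size;
	}
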
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee1..8e45f4464880 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
470#if PAGETABLE_LEVELS >= 3 470#if PAGETABLE_LEVELS >= 3
471#ifdef CONFIG_X86_PAE 471#ifdef CONFIG_X86_PAE
472 .set_pte_atomic = native_set_pte_atomic, 472 .set_pte_atomic = native_set_pte_atomic,
473 .set_pte_present = native_set_pte_present,
474 .pte_clear = native_pte_clear, 473 .pte_clear = native_pte_clear,
475 .pmd_clear = native_pmd_clear, 474 .pmd_clear = native_pmd_clear,
476#endif 475#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6afa5232dbb7..156f87582c6c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -65,11 +65,11 @@ void exit_thread(void)
65{ 65{
66 struct task_struct *me = current; 66 struct task_struct *me = current;
67 struct thread_struct *t = &me->thread; 67 struct thread_struct *t = &me->thread;
68 unsigned long *bp = t->io_bitmap_ptr;
68 69
69 if (me->thread.io_bitmap_ptr) { 70 if (bp) {
70 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 71 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
71 72
72 kfree(t->io_bitmap_ptr);
73 t->io_bitmap_ptr = NULL; 73 t->io_bitmap_ptr = NULL;
74 clear_thread_flag(TIF_IO_BITMAP); 74 clear_thread_flag(TIF_IO_BITMAP);
75 /* 75 /*
@@ -78,6 +78,7 @@ void exit_thread(void)
78 memset(tss->io_bitmap, 0xff, t->io_bitmap_max); 78 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
79 t->io_bitmap_max = 0; 79 t->io_bitmap_max = 0;
80 put_cpu(); 80 put_cpu();
81 kfree(bp);
81 } 82 }
82 83
83 ds_exit_thread(current); 84 ds_exit_thread(current);
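
The exit_thread() change reads io_bitmap_ptr once into a local, unpublishes it (NULL store plus flag clear) while the CPU is still held, and defers the kfree() until after put_cpu(). A generic sketch of that unpublish-then-free ordering (struct and field names hypothetical; only kfree() is real kernel API):

	#include <linux/slab.h>

	struct thing { void *buf; };

	/* Hypothetical illustration of the ordering used above: snapshot
	 * the pointer, stop advertising it, finish the per-CPU cleanup,
	 * and only then free the memory. */
	static void teardown(struct thing *t)
	{
		void *bp = t->buf;		/* read the pointer once */

		if (!bp)
			return;
		t->buf = NULL;			/* unpublish before freeing */
		/* ... tear down per-CPU state that referenced bp ... */
		kfree(bp);			/* free last */
	}
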
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f28c56e6bf94..a0d26237d7cf 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115RESERVE_BRK(dmi_alloc, 65536);
116
115unsigned int boot_cpu_id __read_mostly; 117unsigned int boot_cpu_id __read_mostly;
116 118
119static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
120unsigned long _brk_end = (unsigned long)__brk_base;
121
117#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu) 123int default_cpu_present_to_apicid(int mps_cpu)
119{ 124{
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
158 163
159 164
160#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
161/* This value is set up by the early boot code to point to the value
162 immediately after the boot time page tables. It contains a *physical*
163 address, and must not be in the .bss segment! */
164unsigned long init_pg_tables_start __initdata = ~0UL;
165unsigned long init_pg_tables_end __initdata = ~0UL;
166
167static struct resource video_ram_resource = { 166static struct resource video_ram_resource = {
168 .name = "Video RAM area", 167 .name = "Video RAM area",
169 .start = 0xa0000, 168 .start = 0xa0000,
@@ -219,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
219int bootloader_type; 218int bootloader_type;
220 219
221/* 220/*
222 * Early DMI memory
223 */
224int dmi_alloc_index;
225char dmi_alloc_data[DMI_MAX_DATA];
226
227/*
228 * Setup options 221 * Setup options
229 */ 222 */
230struct screen_info screen_info; 223struct screen_info screen_info;
@@ -269,6 +262,35 @@ static inline void copy_edd(void)
269} 262}
270#endif 263#endif
271 264
265void * __init extend_brk(size_t size, size_t align)
266{
267 size_t mask = align - 1;
268 void *ret;
269
270 BUG_ON(_brk_start == 0);
271 BUG_ON(align & mask);
272
273 _brk_end = (_brk_end + mask) & ~mask;
274 BUG_ON((char *)(_brk_end + size) > __brk_limit);
275
276 ret = (void *)_brk_end;
277 _brk_end += size;
278
279 memset(ret, 0, size);
280
281 return ret;
282}
283
284static void __init reserve_brk(void)
285{
286 if (_brk_end > _brk_start)
287 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
288
289 /* Mark brk area as locked down and no longer taking any
290 new allocations */
291 _brk_start = 0;
292}
293
272#ifdef CONFIG_BLK_DEV_INITRD 294#ifdef CONFIG_BLK_DEV_INITRD
273 295
274#ifdef CONFIG_X86_32 296#ifdef CONFIG_X86_32
@@ -717,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
717 init_mm.start_code = (unsigned long) _text; 739 init_mm.start_code = (unsigned long) _text;
718 init_mm.end_code = (unsigned long) _etext; 740 init_mm.end_code = (unsigned long) _etext;
719 init_mm.end_data = (unsigned long) _edata; 741 init_mm.end_data = (unsigned long) _edata;
720#ifdef CONFIG_X86_32 742 init_mm.brk = _brk_end;
721 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
722#else
723 init_mm.brk = (unsigned long) &_end;
724#endif
725 743
726 code_resource.start = virt_to_phys(_text); 744 code_resource.start = virt_to_phys(_text);
727 code_resource.end = virt_to_phys(_etext)-1; 745 code_resource.end = virt_to_phys(_etext)-1;
@@ -842,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
842 setup_bios_corruption_check(); 860 setup_bios_corruption_check();
843#endif 861#endif
844 862
863 reserve_brk();
864
845 /* max_pfn_mapped is updated here */ 865 /* max_pfn_mapped is updated here */
846 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 866 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
847 max_pfn_mapped = max_low_pfn_mapped; 867 max_pfn_mapped = max_low_pfn_mapped;
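
Taken together, RESERVE_BRK(), extend_brk() and reserve_brk() implement a sealed boot-time bump allocator: link-time users grow the .brk section, setup code carves aligned, zeroed chunks out of it, and reserve_brk() locks the region down just before init_memory_mapping() runs. A usage sketch (buffer name hypothetical; the macro call mirrors the dmi_alloc reservation above):

	#include <asm/setup.h>

	RESERVE_BRK(my_early_buf, 2 * PAGE_SIZE);	/* link-time slop */

	static void __init alloc_my_early_buf(void)
	{
		/* Must run during setup_arch(), before reserve_brk()
		 * seals the area; returns a zeroed, PAGE_SIZE-aligned
		 * chunk or BUGs if the brk region would overflow. */
		void *buf = extend_brk(2 * PAGE_SIZE, PAGE_SIZE);

		/* ... populate buf ... */
		(void)buf;
	}
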
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d038b9c45cf8..79c073247284 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -750,7 +750,7 @@ static int __init uv_bau_init(void)
750 int node; 750 int node;
751 int nblades; 751 int nblades;
752 int last_blade; 752 int last_blade;
753 int cur_cpu = 0; 753 int cur_cpu;
754 754
755 if (!is_uv_system()) 755 if (!is_uv_system())
756 return 0; 756 return 0;
@@ -760,6 +760,7 @@ static int __init uv_bau_init(void)
760 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 760 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
761 nblades = 0; 761 nblades = 0;
762 last_blade = -1; 762 last_blade = -1;
763 cur_cpu = 0;
763 for_each_online_node(node) { 764 for_each_online_node(node) {
764 blade = uv_node_to_blade_id(node); 765 blade = uv_node_to_blade_id(node);
765 if (blade == last_blade) 766 if (blade == last_blade)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 83d53ce5d4c4..462b9ba67e92 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -273,30 +273,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
273 * use the TSC value at the transitions to calculate a pretty 273 * use the TSC value at the transitions to calculate a pretty
 274 * good value for the TSC frequency. 274
275 */ 275 */
276static inline int pit_expect_msb(unsigned char val) 276static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
277{ 277{
278 int count = 0; 278 int count;
279 u64 tsc = 0;
279 280
280 for (count = 0; count < 50000; count++) { 281 for (count = 0; count < 50000; count++) {
281 /* Ignore LSB */ 282 /* Ignore LSB */
282 inb(0x42); 283 inb(0x42);
283 if (inb(0x42) != val) 284 if (inb(0x42) != val)
284 break; 285 break;
286 tsc = get_cycles();
285 } 287 }
286 return count > 50; 288 *deltap = get_cycles() - tsc;
289 *tscp = tsc;
290
291 /*
292 * We require _some_ success, but the quality control
293 * will be based on the error terms on the TSC values.
294 */
295 return count > 5;
287} 296}
288 297
289/* 298/*
290 * How many MSB values do we want to see? We aim for a 299 * How many MSB values do we want to see? We aim for
291 * 15ms calibration, which assuming a 2us counter read 300 * a maximum error rate of 500ppm (in practice the
292 * error should give us roughly 150 ppm precision for 301 * real error is much smaller), but refuse to spend
293 * the calibration. 302 * more than 25ms on it.
294 */ 303 */
295#define QUICK_PIT_MS 15 304#define MAX_QUICK_PIT_MS 25
296#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 305#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
297 306
298static unsigned long quick_pit_calibrate(void) 307static unsigned long quick_pit_calibrate(void)
299{ 308{
309 int i;
310 u64 tsc, delta;
311 unsigned long d1, d2;
312
300 /* Set the Gate high, disable speaker */ 313 /* Set the Gate high, disable speaker */
301 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 314 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
302 315
@@ -315,45 +328,52 @@ static unsigned long quick_pit_calibrate(void)
315 outb(0xff, 0x42); 328 outb(0xff, 0x42);
316 outb(0xff, 0x42); 329 outb(0xff, 0x42);
317 330
318 if (pit_expect_msb(0xff)) { 331 /*
319 int i; 332 * The PIT starts counting at the next edge, so we
320 u64 t1, t2, delta; 333 * need to delay for a microsecond. The easiest way
321 unsigned char expect = 0xfe; 334 * to do that is to just read back the 16-bit counter
322 335 * once from the PIT.
323 t1 = get_cycles(); 336 */
324 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { 337 inb(0x42);
325 if (!pit_expect_msb(expect)) 338 inb(0x42);
326 goto failed; 339
340 if (pit_expect_msb(0xff, &tsc, &d1)) {
341 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
342 if (!pit_expect_msb(0xff-i, &delta, &d2))
343 break;
344
345 /*
346 * Iterate until the error is less than 500 ppm
347 */
348 delta -= tsc;
349 if (d1+d2 < delta >> 11)
350 goto success;
327 } 351 }
328 t2 = get_cycles();
329
330 /*
331 * Make sure we can rely on the second TSC timestamp:
332 */
333 if (!pit_expect_msb(expect))
334 goto failed;
335
336 /*
337 * Ok, if we get here, then we've seen the
338 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
339 * times, and each MSB had many hits, so we never
340 * had any sudden jumps.
341 *
342 * As a result, we can depend on there not being
343 * any odd delays anywhere, and the TSC reads are
344 * reliable.
345 *
346 * kHz = ticks / time-in-seconds / 1000;
347 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
348 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
349 */
350 delta = (t2 - t1)*PIT_TICK_RATE;
351 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
352 printk("Fast TSC calibration using PIT\n");
353 return delta;
354 } 352 }
355failed: 353 printk("Fast TSC calibration failed\n");
356 return 0; 354 return 0;
355
356success:
357 /*
358 * Ok, if we get here, then we've seen the
359 * MSB of the PIT decrement 'i' times, and the
360 * error has shrunk to less than 500 ppm.
361 *
362 * As a result, we can depend on there not being
363 * any odd delays anywhere, and the TSC reads are
364 * reliable (within the error). We also adjust the
365 * delta to the middle of the error bars, just
366 * because it looks nicer.
367 *
368 * kHz = ticks / time-in-seconds / 1000;
369 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
370 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
371 */
372 delta += (long)(d2 - d1)/2;
373 delta *= PIT_TICK_RATE;
374 do_div(delta, i*256*1000);
375 printk("Fast TSC calibration using PIT\n");
376 return delta;
357} 377}
358 378
359/** 379/**
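
The acceptance test "d1+d2 < delta >> 11" is where the 500 ppm target in the comments comes from: d1 and d2 bound the TSC-read uncertainty at the first and last observed MSB transitions, delta is the TSC span between them, and delta >> 11 is delta/2048, so the loop only succeeds once the combined endpoint uncertainty drops below 1/2048 (roughly 488 ppm) of the measured span. Restated as a sketch:

	/* Error bound behind the 500 ppm target: accept the calibration
	 * once the endpoint read uncertainty is under delta/2048. */
	static inline int calibration_good_enough(u64 delta,
						  unsigned long d1,
						  unsigned long d2)
	{
		return d1 + d2 < (delta >> 11);
	}
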
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb3..95deb9f2211e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
395 vmi_ops.update_pte(ptep, VMI_PAGE_PT); 395 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
396} 396}
397 397
398static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
399{
400 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
401}
402
403static void vmi_set_pud(pud_t *pudp, pud_t pudval) 398static void vmi_set_pud(pud_t *pudp, pud_t pudval)
404{ 399{
405 /* Um, eww */ 400 /* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
750 pv_mmu_ops.set_pmd = vmi_set_pmd; 745 pv_mmu_ops.set_pmd = vmi_set_pmd;
751#ifdef CONFIG_X86_PAE 746#ifdef CONFIG_X86_PAE
752 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; 747 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
753 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
754 pv_mmu_ops.set_pud = vmi_set_pud; 748 pv_mmu_ops.set_pud = vmi_set_pud;
755 pv_mmu_ops.pte_clear = vmi_pte_clear; 749 pv_mmu_ops.pte_clear = vmi_pte_clear;
756 pv_mmu_ops.pmd_clear = vmi_pmd_clear; 750 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f268..62ad500d55f3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
189 *(.bss) 189 *(.bss)
190 . = ALIGN(4); 190 . = ALIGN(4);
191 __bss_stop = .; 191 __bss_stop = .;
192 _end = . ; 192 }
193 /* This is where the kernel creates the early boot page tables */ 193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
194 . = ALIGN(PAGE_SIZE); 195 . = ALIGN(PAGE_SIZE);
195 pg0 = . ; 196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
196 } 204 }
197 205
198 /* Sections to be discarded */ 206 /* Sections to be discarded */
199 /DISCARD/ : { 207 /DISCARD/ : {
200 *(.exitcall.exit) 208 *(.exitcall.exit)
209 *(.discard)
201 } 210 }
202 211
203 STABS_DEBUG 212 STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
205 DWARF_DEBUG 214 DWARF_DEBUG
206} 215}
207 216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
208#ifdef CONFIG_KEXEC 223#ifdef CONFIG_KEXEC
209/* Link time checks */ 224/* Link time checks */
210#include <asm/kexec.h> 225#include <asm/kexec.h>
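
The new *(.brk_reservation) input section is populated at compile time by RESERVE_BRK() users such as the dmi_alloc reservation in setup.c. One plausible way to emit such a reservation is an asm block that pushes a .skip of the requested size into the section; this is a hypothetical sketch, and the in-tree macro in <asm/setup.h> may well differ:

	/* Hypothetical emitter for a .brk_reservation entry; the real
	 * RESERVE_BRK() in <asm/setup.h> may differ in detail. */
	#define RESERVE_BRK_SKETCH(name, sz)				\
		static void __used __brk_reservation_##name(void)	\
		{							\
			asm volatile(					\
				".pushsection .brk_reservation,\"aw\",@nobits\n" \
				".skip %c0\n"				\
				".popsection\n" : : "i" (sz));		\
		}
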
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 5bf54e40c6ef..c8742507b030 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
29{ 29{
30 . = __START_KERNEL; 30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET; 31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 _text = .; /* Text and read-only data */
33 .text : AT(ADDR(.text) - LOAD_OFFSET) { 32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */ 34 /* First the code that has to be first for bootstrapping */
35 *(.text.head) 35 *(.text.head)
36 _stext = .; 36 _stext = .;
@@ -61,13 +61,13 @@ SECTIONS
61 .data : AT(ADDR(.data) - LOAD_OFFSET) { 61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA 62 DATA_DATA
63 CONSTRUCTORS 63 CONSTRUCTORS
64 _edata = .; /* End of data section */
64 } :data 65 } :data
65 66
66 _edata = .; /* End of data section */
67 67
68 . = ALIGN(PAGE_SIZE);
69 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
70 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned) 71 *(.data.cacheline_aligned)
72 } 72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); 73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
125#undef VVIRT_OFFSET 125#undef VVIRT_OFFSET
126#undef VVIRT 126#undef VVIRT
127 127
128 . = ALIGN(THREAD_SIZE); /* init_task */
129 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task) 130 *(.data.init_task)
131 }:data.init 131 }:data.init
132 132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned) 135 *(.data.page_aligned)
136 } 136 }
137 137
138 /* might get freed after init */
139 . = ALIGN(PAGE_SIZE);
140 __smp_alt_begin = .;
141 __smp_locks = .;
142 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks) 143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
144 } 147 }
145 __smp_locks_end = .;
146 . = ALIGN(PAGE_SIZE);
147 __smp_alt_end = .;
148 148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */ 149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; 150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .; 152 _sinittext = .;
153 INIT_TEXT 153 INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
159 __initdata_end = .; 159 __initdata_end = .;
160 } 160 }
161 161
162 . = ALIGN(16); 162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 __setup_start = .; 163 . = ALIGN(16);
164 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 164 __setup_start = .;
165 __setup_end = .; 165 *(.init.setup)
166 __initcall_start = .; 166 __setup_end = .;
167 }
167 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
168 INITCALLS 170 INITCALLS
171 __initcall_end = .;
169 } 172 }
170 __initcall_end = .;
171 __con_initcall_start = .;
172 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
173 *(.con_initcall.init) 175 *(.con_initcall.init)
176 __con_initcall_end = .;
174 } 177 }
175 __con_initcall_end = .;
176 __x86_cpu_dev_start = .;
177 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
178 *(.x86_cpu_dev.init) 180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
179 } 182 }
180 __x86_cpu_dev_end = .;
181 SECURITY_INIT 183 SECURITY_INIT
182 184
183 . = ALIGN(8); 185 . = ALIGN(8);
184 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
185 __parainstructions = .; 187 __parainstructions = .;
186 *(.parainstructions) 188 *(.parainstructions)
187 __parainstructions_end = .; 189 __parainstructions_end = .;
188 } 190 }
189 191
190 . = ALIGN(8);
191 __alt_instructions = .;
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
193 *(.altinstructions) 195 *(.altinstructions)
196 __alt_instructions_end = .;
194 } 197 }
195 __alt_instructions_end = .;
196 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
197 *(.altinstr_replacement) 199 *(.altinstr_replacement)
198 } 200 }
@@ -207,9 +209,11 @@ SECTIONS
207 209
208#ifdef CONFIG_BLK_DEV_INITRD 210#ifdef CONFIG_BLK_DEV_INITRD
209 . = ALIGN(PAGE_SIZE); 211 . = ALIGN(PAGE_SIZE);
210 __initramfs_start = .; 212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
211 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 213 __initramfs_start = .;
212 __initramfs_end = .; 214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
213#endif 217#endif
214 218
215#ifdef CONFIG_SMP 219#ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
229 . = ALIGN(PAGE_SIZE); 233 . = ALIGN(PAGE_SIZE);
230 __init_end = .; 234 __init_end = .;
231 235
232 . = ALIGN(PAGE_SIZE);
233 __nosave_begin = .;
234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { 236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave) 237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ 242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
237 . = ALIGN(PAGE_SIZE);
238 __nosave_end = .;
239 243
240 __bss_start = .; /* BSS */
241 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
242 *(.bss.page_aligned) 247 *(.bss.page_aligned)
243 *(.bss) 248 *(.bss)
244 } 249 __bss_stop = .;
245 __bss_stop = .; 250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
246 259
247 _end = . ; 260 _end = . ;
248 261
@@ -250,6 +263,7 @@ SECTIONS
250 /DISCARD/ : { 263 /DISCARD/ : {
251 *(.exitcall.exit) 264 *(.exitcall.exit)
252 *(.eh_frame) 265 *(.eh_frame)
266 *(.discard)
253 } 267 }
254 268
255 STABS_DEBUG 269 STABS_DEBUG
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4ddaa8f6f..90e44a10e68a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1058,14 +1058,6 @@ __init void lguest_init(void)
1058 * lguest_init() where the rest of the fairly chaotic boot setup 1058 * lguest_init() where the rest of the fairly chaotic boot setup
1059 * occurs. */ 1059 * occurs. */
1060 1060
1061 /* The native boot code sets up initial page tables immediately after
1062 * the kernel itself, and sets init_pg_tables_end so they're not
1063 * clobbered. The Launcher places our initial pagetables somewhere at
1064 * the top of our physical memory, so we don't need extra space: set
1065 * init_pg_tables_end to the end of the kernel. */
1066 init_pg_tables_start = __pa(pg0);
1067 init_pg_tables_end = __pa(pg0);
1068
1069 /* As described in head_32.S, we map the first 128M of memory. */ 1061 /* As described in head_32.S, we map the first 128M of memory. */
1070 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1062 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1071 1063
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 6e60ba698cee..699c9b2895ae 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/iomap.h> 19#include <asm/iomap.h>
20#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/highmem.h>
22#include <linux/module.h> 21#include <linux/module.h>
23 22
24int is_io_mapping_possible(resource_size_t base, unsigned long size) 23int is_io_mapping_possible(resource_size_t base, unsigned long size)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c4294986af7..1280565670e4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
16#include <asm/processor.h> 16#include <asm/processor.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/sections.h> 18#include <asm/sections.h>
19#include <asm/setup.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
21#include <asm/proto.h> 22#include <asm/proto.h>
@@ -95,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
95 96
96static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
97{ 98{
98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
99} 100}
100 101
101#endif 102#endif
@@ -711,7 +712,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
711 * No need to redo, when the primary call touched the high 712 * No need to redo, when the primary call touched the high
712 * mapping already: 713 * mapping already:
713 */ 714 */
714 if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) 715 if (within(vaddr, (unsigned long) _text, _brk_end))
715 return 0; 716 return 0;
716 717
717 /* 718 /*
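
Both pageattr.c hunks rely on the within() range helper defined near the top of that file; it is the usual half-open interval test, shown here for reference:

	/* Half-open range test used by the hunks above. */
	static inline int within(unsigned long addr, unsigned long start,
				 unsigned long end)
	{
		return addr >= start && addr < end;
	}
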
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index f2e477c91c1b..46c8834aedc0 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -50,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
50 } 50 }
51 pte = pte_offset_kernel(pmd, vaddr); 51 pte = pte_offset_kernel(pmd, vaddr);
52 if (pte_val(pteval)) 52 if (pte_val(pteval))
53 set_pte_present(&init_mm, vaddr, pte, pteval); 53 set_pte_at(&init_mm, vaddr, pte, pteval);
54 else 54 else
55 pte_clear(&init_mm, vaddr, pte); 55 pte_clear(&init_mm, vaddr, pte);
56 56
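
With .set_pte_present dropped from pv_mmu_ops (see the paravirt.c hunk above), this kernel-mapping call site switches to set_pte_at(). On bare metal that should reduce to a plain PTE store; a sketch assuming the conventional native helper shape:

	/* Sketch, assuming the conventional native shape: set_pte_at()
	 * ignores mm and addr on bare metal and just writes the PTE. */
	static inline void set_pte_at_sketch(struct mm_struct *mm,
					     unsigned long addr,
					     pte_t *ptep, pte_t pte)
	{
		native_set_pte(ptep, pte);
	}
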
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e4483..821e97017e95 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
187 cpumask, cpumask_of(smp_processor_id())); 187 cpumask, cpumask_of(smp_processor_id()));
188 188
189 /* 189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to 190 * We have to send the IPI only to
196 * CPUs affected. 191 * CPUs affected.
197 */ 192 */
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cb6afa4ec95c..db3802fb7b84 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1723,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1723{ 1723{
1724 pmd_t *kernel_pmd; 1724 pmd_t *kernel_pmd;
1725 1725
1726 init_pg_tables_start = __pa(pgd); 1726 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1727 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1727 xen_start_info->nr_pt_frames * PAGE_SIZE +
1728 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); 1728 512*1024);
1729 1729
1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
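
With init_pg_tables_start/end gone, the Xen path derives the mapped limit directly from where the domain builder placed the initial pagetables. Restating the replacement arithmetic with an intermediate name (pt_phys_end is illustrative only):

	/* Highest directly-mapped pfn: end of the toolstack-provided
	 * pagetables plus 512k of slack, as in the hunk above. */
	unsigned long pt_phys_end = __pa(xen_start_info->pt_base) +
				    xen_start_info->nr_pt_frames * PAGE_SIZE;

	max_pfn_mapped = PFN_DOWN(pt_phys_end + 512 * 1024);
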
@@ -1870,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1870 1870
1871#ifdef CONFIG_X86_PAE 1871#ifdef CONFIG_X86_PAE
1872 .set_pte_atomic = xen_set_pte_atomic, 1872 .set_pte_atomic = xen_set_pte_atomic,
1873 .set_pte_present = xen_set_pte_at,
1874 .pte_clear = xen_pte_clear, 1873 .pte_clear = xen_pte_clear,
1875 .pmd_clear = xen_pmd_clear, 1874 .pmd_clear = xen_pmd_clear,
1876#endif /* CONFIG_X86_PAE */ 1875#endif /* CONFIG_X86_PAE */